Compare commits
3144 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 93c563e073 | |||
| b3b02df569 | |||
| 0887760de1 | |||
| 148c333943 | |||
| 98e5809d24 | |||
| 87b6ed7f4c | |||
| 259f092143 | |||
| 108c9c6fb5 | |||
| 128b40ce3c | |||
| 717aec388b | |||
| f2287d2cd7 | |||
| 4182fa2967 | |||
| d6cf38a929 | |||
|
|
8c97883317 | ||
|
|
455a29c491 | ||
|
|
a618ad45bf | ||
|
|
0ff8ae4596 | ||
|
|
a5b689439b | ||
|
|
f9947541a1 | ||
|
|
c2b2b38081 | ||
|
|
7884c7da04 | ||
|
|
b471e97a10 | ||
|
|
611fe0bc42 | ||
|
|
6d649e1dff | ||
|
|
d0bfe7738a | ||
|
|
95d33554db | ||
|
|
4298e3d0cd | ||
|
|
a7fd70fa21 | ||
|
|
60dc47e0a6 | ||
|
|
ff298f21b7 | ||
|
|
f04a04a7e3 | ||
|
|
39e7f0c2d4 | ||
|
|
726b260cd5 | ||
|
|
6a8ce4b412 | ||
|
|
32626ea9e3 | ||
|
|
d4a8afd6e8 | ||
|
|
8acfd92f92 | ||
|
|
7fb4188f51 | ||
|
|
8b525bb8bc | ||
|
|
a86a16600b | ||
|
|
d0341754d6 | ||
|
|
f968bc1b2a | ||
|
|
7af7659ac2 | ||
|
|
a6952fd651 | ||
|
|
4c7fb35f57 | ||
|
|
87efb27dc5 | ||
|
|
45b306480e | ||
|
|
2a68fc6c48 | ||
|
|
30d88e1683 | ||
|
|
a97a69c96e | ||
|
|
87d0c9a2ed | ||
|
|
147d16d49e | ||
|
|
70f411148e | ||
|
|
6e8be22c8b | ||
|
|
e0ff380810 | ||
|
|
fb7b38ab59 | ||
|
|
1f7a7bef60 | ||
|
|
15cbb6fc95 | ||
|
|
f34e1093fb | ||
|
|
f47e1d5cae | ||
|
|
bc8bd3a189 | ||
|
|
c6ae79cbc8 | ||
|
|
528f6cef1c | ||
|
|
d8b353ac98 | ||
|
|
e3837dcc63 | ||
|
|
f9127b2d16 | ||
|
|
e05039d623 | ||
|
|
cd7806e9c0 | ||
|
|
aea10892d3 | ||
|
|
d6bacd0c7b | ||
|
|
c36095197d | ||
|
|
27d183d42f | ||
|
|
60d27ce4be | ||
|
|
f0cd6d6902 | ||
|
|
b2c9e43f04 | ||
|
|
de59f2e077 | ||
|
|
2ae7e07466 | ||
|
|
89fa6a4e75 | ||
|
|
8a92d557f4 | ||
|
|
99d1d2ffe3 | ||
|
|
0fdfaad053 | ||
|
|
5db1e5fa39 | ||
|
|
b2d98b9979 | ||
|
|
8258163c31 | ||
|
|
c8b45093f0 | ||
|
|
3d9c0ecd14 | ||
|
|
ab4cd1e8a2 | ||
|
|
2fab324924 | ||
|
|
e04d21b997 | ||
|
|
1efc4c8f1f | ||
|
|
7af8f96021 | ||
|
|
3c5b88dabf | ||
|
|
13b748a57c | ||
|
|
3957dccb51 | ||
|
|
49919aac70 | ||
|
|
8ec078f3e1 | ||
|
|
d80a3ed4b7 | ||
|
|
6a01712cfd | ||
|
|
0d9b120b02 | ||
|
|
c867a87727 | ||
|
|
3ea77d7ab9 | ||
|
|
533db89c76 | ||
|
|
c7e6b8302e | ||
|
|
d88240d227 | ||
|
|
5a2f0dcb4a | ||
|
|
306f3468c3 | ||
|
|
7691d961c1 | ||
|
|
4f49ac4cb0 | ||
|
|
58bf32c27f | ||
|
|
87d079bd30 | ||
|
|
0061ecf325 | ||
|
|
1709dbe9d6 | ||
|
|
69dd01fc69 | ||
|
|
1a0f5a0510 | ||
|
|
0ad8dfe20d | ||
|
|
1415f5b0b1 | ||
|
|
722c65d7b6 | ||
|
|
b25dbfa667 | ||
|
|
5f6e6e708c | ||
|
|
1324e6cdd5 | ||
|
|
28227f1a8b | ||
|
|
b4aa685d52 | ||
|
|
5862c9e6a5 | ||
|
|
1f87784f05 | ||
|
|
e2bf021bf7 | ||
|
|
33054bcacd | ||
|
|
89dfbf2125 | ||
|
|
e47c9152ec | ||
|
|
e37f4691b0 | ||
|
|
320a54b023 | ||
|
|
65f290b699 | ||
|
|
1c0f0920f9 | ||
|
|
6ea03bb3af | ||
|
|
e1cb6ca15b | ||
|
|
0ef3d3b429 | ||
|
|
243d6c2625 | ||
|
|
4c65b4886f | ||
|
|
f6dbffd58c | ||
|
|
094d6f6d2e | ||
|
|
57588857b3 | ||
|
|
6160091afd | ||
|
|
1c8f606fd2 | ||
|
|
9ae39d8361 | ||
|
|
988e506b1c | ||
|
|
3920874a35 | ||
|
|
9eedbeeb72 | ||
|
|
cf3cf01a6c | ||
|
|
90bed24b99 | ||
|
|
9ed0bb240d | ||
|
|
b0cd3924e5 | ||
|
|
b41f53b333 | ||
|
|
4ae2333c93 | ||
|
|
d59b70d0ea | ||
|
|
4e24a94ff4 | ||
|
|
40718c0ad3 | ||
|
|
06b9b46fb3 | ||
|
|
1370596c5e | ||
|
|
4e3d8540ff | ||
|
|
ccd8592e46 | ||
|
|
afb1132748 | ||
|
|
d82a1d018e | ||
|
|
f6773a318f | ||
|
|
7fc57abb23 | ||
|
|
e7b411f5e9 | ||
|
|
c82cfc7f4a | ||
|
|
e6edf80aa4 | ||
|
|
5b682b36f1 | ||
|
|
4f6c70aee0 | ||
|
|
e4a672483f | ||
|
|
b2471decb7 | ||
|
|
eff3ae65d8 | ||
|
|
9dbb839146 | ||
|
|
6633d69e65 | ||
|
|
b4bbc7b1f3 | ||
|
|
6f886b2457 | ||
|
|
042c662d9d | ||
|
|
536b30e12c | ||
|
|
44d8a2e01f | ||
|
|
b471cdb56f | ||
|
|
b8ad55019c | ||
|
|
b67d7f0b6a | ||
|
|
5ac46e5134 | ||
|
|
bdae4d5e23 | ||
|
|
35c97512b5 | ||
|
|
d862672664 | ||
|
|
10143eb716 | ||
|
|
a6f0aade5c | ||
|
|
ac8fef0f22 | ||
|
|
839c51ed5b | ||
|
|
129f06488b | ||
|
|
fe9b4a6ac5 | ||
|
|
2ac8ad2063 | ||
|
|
da7c2740f4 | ||
|
|
bb5cbf420c | ||
|
|
12ac782b13 | ||
|
|
d3020580ff | ||
|
|
c83674433b | ||
|
|
0e58415bdb | ||
|
|
9a477ee926 | ||
|
|
2c51fa54d0 | ||
|
|
e23821f282 | ||
|
|
5afb671764 | ||
|
|
5884e7c3f1 | ||
|
|
d4ebb3544d | ||
|
|
ef51f8c648 | ||
|
|
66b1499ee0 | ||
|
|
48ffdcf169 | ||
|
|
1948a93584 | ||
|
|
e7fd7dd6fd | ||
|
|
e2efcb50f3 | ||
|
|
a3799fd5d0 | ||
|
|
28b402a778 | ||
|
|
fb30a56cc3 | ||
|
|
d8fd909d4f | ||
|
|
37d67549bd | ||
|
|
00077390ed | ||
|
|
8f7f7d2cb7 | ||
|
|
6732d9cb64 | ||
|
|
b8413900bb | ||
|
|
56dc2279b1 | ||
|
|
ed71f684be | ||
|
|
16ae04c7f9 | ||
|
|
434aa30d12 | ||
|
|
df643cecbd | ||
|
|
f0a0e2d75d | ||
|
|
4a3f2016d4 | ||
|
|
3dbdeee5f6 | ||
|
|
374e445071 | ||
|
|
44f06304d2 | ||
|
|
88461fb527 | ||
|
|
41199179de | ||
|
|
1d08598f6e | ||
|
|
68e686df9f | ||
|
|
8bf8911893 | ||
|
|
b7839f6242 | ||
|
|
c9de707455 | ||
|
|
293e4bccae | ||
|
|
f9cccad370 | ||
|
|
d1d5a42be0 | ||
|
|
c9c9b24beb | ||
|
|
b718ae3a52 | ||
|
|
b674a5f0ed | ||
|
|
3340d2add2 | ||
|
|
c7cdf66d8e | ||
|
|
9a50b60b0c | ||
|
|
e164804f43 | ||
|
|
bcdef9ea71 | ||
|
|
5d509b69c8 | ||
|
|
7448ee97f2 | ||
|
|
4c3e5c0d2b | ||
|
|
729cc48603 | ||
|
|
d37455925f | ||
|
|
d2720e2490 | ||
|
|
e11022c95a | ||
|
|
3fff68b9c0 | ||
|
|
15b121039f | ||
|
|
a6c85fd0e1 | ||
|
|
5e0bbf0081 | ||
|
|
ab2d14189e | ||
|
|
197c13b3b9 | ||
|
|
ef1d3270a7 | ||
|
|
327af85e3e | ||
|
|
d295356a39 | ||
|
|
a80706af2b | ||
|
|
9bfc3850c7 | ||
|
|
db1d817dee | ||
|
|
f3548037fe | ||
|
|
ab0e8bba00 | ||
|
|
90c6e0172a | ||
|
|
2c4c79d55e | ||
|
|
023ba72a9a | ||
|
|
3f9e64e0ec | ||
|
|
3bc99c1da1 | ||
|
|
bd8d78d204 | ||
|
|
22a0596fca | ||
|
|
dff48639e9 | ||
|
|
d6bb5e2264 | ||
|
|
56f83aeb38 | ||
|
|
415f2e938c | ||
|
|
73f7f583e4 | ||
|
|
04987422e7 | ||
|
|
f864338ce2 | ||
|
|
23ebdbe1f2 | ||
|
|
ef5dafb745 | ||
|
|
016781027c | ||
|
|
f187671a89 | ||
|
|
ea89cb65bb | ||
|
|
ad2238e880 | ||
|
|
cb416611ce | ||
|
|
9f083f99ac | ||
|
|
21da408832 | ||
|
|
ba10b91648 | ||
|
|
8217448ee5 | ||
|
|
25aeedb003 | ||
|
|
26a93bc733 | ||
|
|
b6d2d8dd4c | ||
|
|
8c1bd4ec32 | ||
|
|
2aedc1fd2b | ||
|
|
b805609986 | ||
|
|
c50ce30b00 | ||
|
|
3ec674a820 | ||
|
|
f00b5abc40 | ||
|
|
0885152713 | ||
|
|
a2dd501c11 | ||
|
|
8de5047035 | ||
|
|
b20d814a24 | ||
|
|
bf4ca847a5 | ||
|
|
ebc47d00a1 | ||
|
|
308ccae0c5 | ||
|
|
c5207da9e7 | ||
|
|
0afa3f5713 | ||
|
|
b394f05835 | ||
|
|
73dc899ff1 | ||
|
|
0b62c28436 | ||
|
|
742870285a | ||
|
|
6ae27009c1 | ||
|
|
33771c2163 | ||
|
|
603d3a879c | ||
|
|
cbd8afabb7 | ||
|
|
284d59904e | ||
|
|
50345012c9 | ||
|
|
bf0055f045 | ||
|
|
dd81f13238 | ||
|
|
c30f08b474 | ||
|
|
478d7b1b06 | ||
|
|
697082e84e | ||
|
|
0ed674296f | ||
|
|
4d96244b29 | ||
|
|
33d8daf933 | ||
|
|
f1d72a7e2c | ||
|
|
a12f3ec84b | ||
|
|
5a5cb82043 | ||
|
|
289c3804a3 | ||
|
|
e531864424 | ||
|
|
5a2c937025 | ||
|
|
9bc04d26fc | ||
|
|
31612a7e17 | ||
|
|
1a1bed04f2 | ||
|
|
c986ffad4f | ||
|
|
891c1cf1f4 | ||
|
|
c251194b15 | ||
|
|
50d84e0884 | ||
|
|
65507ea66e | ||
|
|
b6a7ae6dca | ||
|
|
7f281a4642 | ||
|
|
94a4793a71 | ||
|
|
28b49837fc | ||
|
|
149052b195 | ||
|
|
d6380ca8a1 | ||
|
|
a70dcb13d1 | ||
|
|
5ec16356d0 | ||
|
|
f5e7165537 | ||
|
|
838c38c91a | ||
|
|
db29cbe851 | ||
|
|
d7cd5986db | ||
|
|
3514e03327 | ||
|
|
ef9c98fba8 | ||
|
|
2110708c8e | ||
|
|
82f5716362 | ||
|
|
a6b7e717f5 | ||
|
|
66b94fc37c | ||
|
|
f2743a6dc5 | ||
|
|
28fda1a013 | ||
|
|
7c9d9f6ee6 | ||
|
|
2549fa12c9 | ||
|
|
bea7cc9a81 | ||
|
|
9a03cd3590 | ||
|
|
46528caa5a | ||
|
|
7628f2a6c9 | ||
|
|
d01718aa91 | ||
|
|
3eccce5e4f | ||
|
|
92650bdff0 | ||
|
|
e0eac74f83 | ||
|
|
35222694e5 | ||
|
|
92388e8bad | ||
|
|
c9424a9989 | ||
|
|
f5c90dbd43 | ||
|
|
bd65df8ad4 | ||
|
|
e39bb485ea | ||
|
|
ebe165dff8 | ||
|
|
12ff721ec1 | ||
|
|
8fdd9a7ef7 | ||
|
|
8891534a3c | ||
|
|
be2adf40d8 | ||
|
|
57512c1e28 | ||
|
|
f3b8ce8cbc | ||
|
|
14da0a7f54 | ||
|
|
1c039ab96a | ||
|
|
256e8325e3 | ||
|
|
c76f03a263 | ||
|
|
e0e579bd99 | ||
|
|
e513fc7884 | ||
|
|
c8e492d82f | ||
|
|
aeb97374bf | ||
|
|
2a2bfc49f1 | ||
|
|
958143a8c5 | ||
|
|
66ad192266 | ||
|
|
728c55fd6f | ||
|
|
43f432c67a | ||
|
|
1328efd027 | ||
|
|
ad97d70a43 | ||
|
|
44cf9866cb | ||
|
|
0a8fab0737 | ||
|
|
c2039da7b8 | ||
|
|
0842b82eca | ||
|
|
03d3f7179b | ||
|
|
5defbf25f1 | ||
|
|
a0cbd7e33a | ||
|
|
0fd170f47a | ||
|
|
0b52022c13 | ||
|
|
0f1358b69f | ||
|
|
8040154b2f | ||
|
|
8e47273186 | ||
|
|
4518bce71b | ||
|
|
8c1d7c6dc5 | ||
|
|
2983722926 | ||
|
|
7f096f23e3 | ||
|
|
858a97a485 | ||
|
|
1d40f0a930 | ||
|
|
7729070481 | ||
|
|
e6ba7eb337 | ||
|
|
189e892b96 | ||
|
|
f62ad11fb0 | ||
|
|
e754cb99c7 | ||
|
|
7e06973054 | ||
|
|
38375b49e3 | ||
|
|
b5293067fc | ||
|
|
d1d59a2a01 | ||
|
|
1a9d08a325 | ||
|
|
1948d8ea42 | ||
|
|
41750a8336 | ||
|
|
8336615eae | ||
|
|
af18f7cc97 | ||
|
|
824e47ece7 | ||
|
|
f73091f391 | ||
|
|
ac4d0bf7ed | ||
|
|
391b59930b | ||
|
|
15535cfb64 | ||
|
|
82d425da04 | ||
|
|
d90011a535 | ||
|
|
b4af33327f | ||
|
|
8cc9d08f46 | ||
|
|
74c1feed08 | ||
|
|
5535bfc83f | ||
|
|
3e8f33da68 | ||
|
|
a3737e2b81 | ||
|
|
81251115af | ||
|
|
c22ee2b0a0 | ||
|
|
0292730cea | ||
|
|
29c73f242c | ||
|
|
13084ece5f | ||
|
|
704c83bcd6 | ||
|
|
bf4376be1b | ||
|
|
2534cd1439 | ||
|
|
39abb6f621 | ||
|
|
93a7c14e95 | ||
|
|
25988947e4 | ||
|
|
108b7dced5 | ||
|
|
fc1e12fb2f | ||
|
|
53f5ec09de | ||
|
|
aa94969472 | ||
|
|
49a0a08805 | ||
|
|
e780372884 | ||
|
|
dd567654e7 | ||
|
|
4d371c0f21 | ||
|
|
03211d5543 | ||
|
|
dac1ba44e4 | ||
|
|
d33100a11c | ||
|
|
17e41970d3 | ||
|
|
89ded54929 | ||
|
|
1d9d989d8d | ||
|
|
9526094272 | ||
|
|
81cb374084 | ||
|
|
a5b2695771 | ||
|
|
b16aee93f9 | ||
|
|
cd6f8249bf | ||
|
|
af9d6a2d05 | ||
|
|
4c96fd3279 | ||
|
|
0a166f245c | ||
|
|
b79bd65c4e | ||
|
|
5050874007 | ||
|
|
5c523332af | ||
|
|
de0d69ab26 | ||
|
|
aefcea95cc | ||
|
|
ef826603a7 | ||
|
|
1aa8309a9e | ||
|
|
fc8a32425c | ||
|
|
49b9297166 | ||
|
|
30b45731ae | ||
|
|
50f716f3d8 | ||
|
|
a66fab4cea | ||
|
|
603355f6d6 | ||
|
|
637a4cc812 | ||
|
|
663306b8f6 | ||
|
|
0ffea6832d | ||
|
|
8cb5f311c6 | ||
|
|
292962e8b0 | ||
|
|
55cfc6f78e | ||
|
|
eb907dc4ba | ||
|
|
bf50d8f951 | ||
|
|
7125026f42 | ||
|
|
97eff6a185 | ||
|
|
fec8390f7e | ||
|
|
30def668f0 | ||
|
|
9e1ec40bad | ||
|
|
3d35d4485d | ||
|
|
6d43d3c180 | ||
|
|
e6c827bf11 | ||
|
|
75ec582a1d | ||
|
|
c5916d58b8 | ||
|
|
dba22837f6 | ||
|
|
b43678b2e3 | ||
|
|
972c6906d5 | ||
|
|
522d80bd2c | ||
|
|
38fb4cc181 | ||
|
|
dfcdd84dc6 | ||
|
|
97e9977aef | ||
|
|
a54c0db457 | ||
|
|
e8ae16a7f3 | ||
|
|
64dfa6182b | ||
|
|
bead780f22 | ||
|
|
4b5141c0fb | ||
|
|
6621efe139 | ||
|
|
ae0c6dc62a | ||
|
|
f92d351cf0 | ||
|
|
f0aa481a2a | ||
|
|
a216b2bb9c | ||
|
|
29859e81ba | ||
|
|
0b1323e070 | ||
|
|
3a18a28001 | ||
|
|
c5e11468f3 | ||
|
|
ab5bf7e48d | ||
|
|
19d18b6e4e | ||
|
|
4c629d0a7c | ||
|
|
3718abc3d2 | ||
|
|
a98bfdf011 | ||
|
|
0644b4a7fd | ||
|
|
a3bf0b2406 | ||
|
|
5302cfe062 | ||
|
|
8090285d42 | ||
|
|
1abce94803 | ||
|
|
e3a78ad150 | ||
|
|
261dd70b6f | ||
|
|
6fa75fb4b1 | ||
|
|
432570c98f | ||
|
|
78bd4debc6 | ||
|
|
f14897101b | ||
|
|
7bdb2b967d | ||
|
|
f481a51f39 | ||
|
|
b35b931e3b | ||
|
|
19105494e1 | ||
|
|
9baade2cb5 | ||
|
|
fc717ebada | ||
|
|
ddc9d4d885 | ||
|
|
795f592013 | ||
|
|
3d086ca6b9 | ||
|
|
66f306f325 | ||
|
|
bf3b15b744 | ||
|
|
24fb2b483b | ||
|
|
6c5ed87c59 | ||
|
|
f17deafc0a | ||
|
|
803b0a2811 | ||
|
|
8c4d339f25 | ||
|
|
7b0eb0e4ad | ||
|
|
86ba817445 | ||
|
|
3e511b588b | ||
|
|
c16f9880c5 | ||
|
|
719279f71d | ||
|
|
5489894c3e | ||
|
|
127a5c50b8 | ||
|
|
ca866c6d54 | ||
|
|
4dff88d4c5 | ||
|
|
372b4583c2 | ||
|
|
ef9315200c | ||
|
|
7477a95a59 | ||
|
|
bbec080bb9 | ||
|
|
452c7ebdea | ||
|
|
bc1dc73cfc | ||
|
|
e60572f62c | ||
|
|
33d47c2b53 | ||
|
|
0bb27d839a | ||
|
|
1916153509 | ||
|
|
08b4e03f62 | ||
|
|
3dbc5ba6c2 | ||
|
|
8cf7445b93 | ||
|
|
53d918ae9e | ||
|
|
f84f359d8b | ||
|
|
280675eb80 | ||
|
|
8eeeebf091 | ||
|
|
ab6c7e42d6 | ||
|
|
ecd9a3a79c | ||
|
|
b27ff432ee | ||
|
|
bf87da7496 | ||
|
|
c377d8465e | ||
|
|
c0edb37000 | ||
|
|
b7f4324b8f | ||
|
|
85578f0462 | ||
|
|
01fb69798f | ||
|
|
382aacd710 | ||
|
|
ec2fc0cf0b | ||
|
|
ace3f20a22 | ||
|
|
5b28d0e703 | ||
|
|
7816fae331 | ||
|
|
64bc48863d | ||
|
|
1412f663f6 | ||
|
|
d983203b52 | ||
|
|
c461049896 | ||
|
|
54dcfdca99 | ||
|
|
d5cd049d8f | ||
|
|
c1777bff3b | ||
|
|
008fc6c51e | ||
|
|
933f78de7b | ||
|
|
eeba937282 | ||
|
|
24acd3f492 | ||
|
|
5d7c147a6a | ||
|
|
f49875d5cf | ||
|
|
7e88b42107 | ||
|
|
4698e665e4 | ||
|
|
115c8020b1 | ||
|
|
bd28502d2c | ||
|
|
d4a77e1b44 | ||
|
|
f291d90271 | ||
|
|
3f13af8e62 | ||
|
|
27ff7ac8d7 | ||
|
|
d359503ad8 | ||
|
|
ed53df90a4 | ||
|
|
05a64f1302 | ||
|
|
a6206398fc | ||
|
|
f065b060bd | ||
|
|
122142e439 | ||
|
|
d0673231d4 | ||
|
|
732d6a9e2c | ||
|
|
00f03292e6 | ||
|
|
625a249b65 | ||
|
|
7601532de8 | ||
|
|
11e9f4e3e0 | ||
|
|
d01e4fbe7a | ||
|
|
09d770f833 | ||
|
|
2f2af816e7 | ||
|
|
e70379e04f | ||
|
|
ae8b724d92 | ||
|
|
abeda29087 | ||
|
|
9c0b4cb8b3 | ||
|
|
c2016af162 | ||
|
|
cfd3418653 | ||
|
|
a095fd622f | ||
|
|
73141e58ed | ||
|
|
c560a1b10e | ||
|
|
82187f8372 | ||
|
|
aeaef1bedf | ||
|
|
7d01e4254b | ||
|
|
b31dbcf756 | ||
|
|
314289c52f | ||
|
|
da83196996 | ||
|
|
fb535fdfd0 | ||
|
|
3b1eaa65e4 | ||
|
|
08e246c628 | ||
|
|
4b48bfa87a | ||
|
|
948e75dcb5 | ||
|
|
192aeb0ae3 | ||
|
|
2d403cf258 | ||
|
|
5dfb96a310 | ||
|
|
134accaf46 | ||
|
|
3f607ade14 | ||
|
|
300ff7be75 | ||
|
|
9a56f8fb9a | ||
|
|
8672aff298 | ||
|
|
83b65ef534 | ||
|
|
515c5f76e0 | ||
|
|
b1fdbd63ec | ||
|
|
d90f677cb4 | ||
|
|
797238464a | ||
|
|
cd53e6abed | ||
|
|
1476d45536 | ||
|
|
476f186e8e | ||
|
|
8fa45f20dc | ||
|
|
24e8c33506 | ||
|
|
b17fcdeba2 | ||
|
|
ea2764b345 | ||
|
|
4d4a512f72 | ||
|
|
91538b5366 | ||
|
|
e4c79418c8 | ||
|
|
dc2e174d4f | ||
|
|
521aca9f09 | ||
|
|
0ac7ed3c97 | ||
|
|
91ee4ea7b6 | ||
|
|
98acce1185 | ||
|
|
4da0381aae | ||
|
|
dc0db2d1cb | ||
|
|
44ee5737a7 | ||
|
|
9022806fc0 | ||
|
|
91866396ef | ||
|
|
ff9380b248 | ||
|
|
b00575e95c | ||
|
|
ac524eff0d | ||
|
|
df9f839ed4 | ||
|
|
8d51e0620e | ||
|
|
70adb5d283 | ||
|
|
7e63862dff | ||
|
|
190d7957d4 | ||
|
|
307c825c4e | ||
|
|
7e39ed115d | ||
|
|
52eddcdcfd | ||
|
|
0d9eceb668 | ||
|
|
7084432a0e | ||
|
|
b02f30c822 | ||
|
|
701bd9b029 | ||
|
|
fb2c22c32d | ||
|
|
ddc25dffcf | ||
|
|
1211aa95fd | ||
|
|
d1b4ad6aca | ||
|
|
d18b85f8ca | ||
|
|
5d2ccfbe75 | ||
|
|
f1e68b6bef | ||
|
|
51a3d71fea | ||
|
|
6f518dfcc9 | ||
|
|
0a987ad06f | ||
|
|
7333e6ab7a | ||
|
|
296b057a0a | ||
|
|
5f3128bbb2 | ||
|
|
f0a5db068f | ||
|
|
265d5b7924 | ||
|
|
0bb5404e6b | ||
|
|
82cf349bf9 | ||
|
|
707056f825 | ||
|
|
22dcc9a651 | ||
|
|
56c78e55d5 | ||
|
|
2e3af7f474 | ||
|
|
3290ccd020 | ||
|
|
1934a09a3d | ||
|
|
7233e25907 | ||
|
|
615d36b236 | ||
|
|
e834703bb7 | ||
|
|
2b956e9cad | ||
|
|
d3158d1efe | ||
|
|
b21043c309 | ||
|
|
37a4362417 | ||
|
|
c565ec08ac | ||
|
|
8f68769af8 | ||
|
|
79fa1c3d4d | ||
|
|
6a2cb442ee | ||
|
|
6606d20a47 | ||
|
|
5cb50eca30 | ||
|
|
67243f3550 | ||
|
|
308746c7fb | ||
|
|
28e9032b10 | ||
|
|
44bcce3cc8 | ||
|
|
4351fc600b | ||
|
|
a1bcbed837 | ||
|
|
4152a0f9ed | ||
|
|
d8f4635366 | ||
|
|
e7717e58b5 | ||
|
|
83f3ee7cfa | ||
|
|
1b8afb73ad | ||
|
|
67cc62d619 | ||
|
|
12376e6a0c | ||
|
|
9316dd65c0 | ||
|
|
3b1445c660 | ||
|
|
39b1e4a204 | ||
|
|
df7d65b076 | ||
|
|
247dff8ab3 | ||
|
|
fcc1090595 | ||
|
|
9e184e65cf | ||
|
|
15e1f86711 | ||
|
|
6bc4aee029 | ||
|
|
b45429fc44 | ||
|
|
6cfe21094b | ||
|
|
90d9b91007 | ||
|
|
89f6fdd29c | ||
|
|
b07bf9f5c7 | ||
|
|
9a8782d122 | ||
|
|
e2249365f3 | ||
|
|
1f2079f2a8 | ||
|
|
a239f37302 | ||
|
|
215ca762d0 | ||
|
|
a4bcb1e1e2 | ||
|
|
0c0f69e3bf | ||
|
|
0fa17b1385 | ||
|
|
92e86ac5b7 | ||
|
|
8b34fb853e | ||
|
|
09b8f65246 | ||
|
|
9a3e4d8788 | ||
|
|
577f5e368a | ||
|
|
da281a99ec | ||
|
|
1671430b50 | ||
|
|
6c766be68c | ||
|
|
883963d90d | ||
|
|
b84fdb1f3c | ||
|
|
8645ee5546 | ||
|
|
ffa4b9e65c | ||
|
|
c6dbb2c066 | ||
|
|
0af6f6460b | ||
|
|
156f4e0fd8 | ||
|
|
e97601d3a8 | ||
|
|
100285a325 | ||
|
|
9796674218 | ||
|
|
8283cfc3ff | ||
|
|
d62fe78e6e | ||
|
|
5715350d96 | ||
|
|
17ee085396 | ||
|
|
c5cf05e085 | ||
|
|
f59c6223b4 | ||
|
|
30270584aa | ||
|
|
92377426bd | ||
|
|
3d71932ca6 | ||
|
|
ccf4d00385 | ||
|
|
f7390aaec9 | ||
|
|
d79529cada | ||
|
|
8f9a935132 | ||
|
|
3f24c8dedc | ||
|
|
741df13eb8 | ||
|
|
8e31d7e99c | ||
|
|
7ac8a9ea04 | ||
|
|
dcf70d887b | ||
|
|
558d7ee1d3 | ||
|
|
2e49ecd56f | ||
|
|
47b1f2182f | ||
|
|
efee86fa2e | ||
|
|
4e7ae5269b | ||
|
|
9238c72e08 | ||
|
|
83a863ea83 | ||
|
|
75759254c3 | ||
|
|
44ed10b6f2 | ||
|
|
82ed2ce416 | ||
|
|
29c0f75306 | ||
|
|
481224bbcd | ||
|
|
743f89f93f | ||
|
|
3aa2cce504 | ||
|
|
a83fec3dd0 | ||
|
|
b8a9139f8e | ||
|
|
eb61d5df72 | ||
|
|
57fb2a75ec | ||
|
|
0a6eb61ad0 | ||
|
|
df38b862c2 | ||
|
|
ddc6e33bc0 | ||
|
|
700fe244e7 | ||
|
|
c934a68bc4 | ||
|
|
8295df5a1e | ||
|
|
3b16cd8c56 | ||
|
|
4560df284b | ||
|
|
832aff5d76 | ||
|
|
46bd353027 | ||
|
|
1a2979aa7f | ||
|
|
8b8e313dc6 | ||
|
|
0881463d69 | ||
|
|
2e92989101 | ||
|
|
1d69b954bd | ||
|
|
90843b3bff | ||
|
|
78a7ef9fc5 | ||
|
|
efa0ea01f3 | ||
|
|
ddf5df6193 | ||
|
|
4fff0ab571 | ||
|
|
85b703981d | ||
|
|
45114d3283 | ||
|
|
723baca4c2 | ||
|
|
0a5b16dbe6 | ||
|
|
3f1362f744 | ||
|
|
8ff187a7b7 | ||
|
|
6a99135fa2 | ||
|
|
c12d6ec82c | ||
|
|
bf94e51418 | ||
|
|
c522aa87a8 | ||
|
|
7c9235559d | ||
|
|
be681118f5 | ||
|
|
e3bb2b423b | ||
|
|
47e00ed50c | ||
|
|
fd6c0d2704 | ||
|
|
b1245ef28d | ||
|
|
e0f0520c1f | ||
|
|
1c14cfc342 | ||
|
|
fe797352a0 | ||
|
|
57f29e5035 | ||
|
|
12d0cb2037 | ||
|
|
e33cf51b28 | ||
|
|
93c7ddbac5 | ||
|
|
5ba7d2696d | ||
|
|
2351f0898a | ||
|
|
81091d4be8 | ||
|
|
09ac9c5d00 | ||
|
|
f8b6ca2649 | ||
|
|
f5b373b818 | ||
|
|
fbfcc28753 | ||
|
|
9e84c0ef0c | ||
|
|
88f39db2eb | ||
|
|
1aaf623138 | ||
|
|
77dc94ab22 | ||
|
|
bd8d02527b | ||
|
|
7adacf5a7b | ||
|
|
6b5b547e2f | ||
|
|
9c9c77d2db | ||
|
|
b56dd9df21 | ||
|
|
22b7f548ad | ||
|
|
c2a75231d6 | ||
|
|
dd4f9c86c0 | ||
|
|
68577ae8a3 | ||
|
|
4a963315e5 | ||
|
|
a9954dd8e7 | ||
|
|
1588e8f7ca | ||
|
|
e3c476112c | ||
|
|
8745888ce9 | ||
|
|
7c9c2b25b0 | ||
|
|
6c622eeeee | ||
|
|
9b053c5518 | ||
|
|
8a72e23b29 | ||
|
|
538d9c4dc8 | ||
|
|
f87ecf0573 | ||
|
|
fbccf0f8b0 | ||
|
|
96d3363cb3 | ||
|
|
bfad04d648 | ||
|
|
8c3808c540 | ||
|
|
d0f4ece58a | ||
|
|
258ad709d4 | ||
|
|
ab91a90734 | ||
|
|
3ebc80ade6 | ||
|
|
898a56c53b | ||
|
|
b2928df4f8 | ||
|
|
f98038a03c | ||
|
|
dd07e2a457 | ||
|
|
3a2db12e77 | ||
|
|
c1f5a8d8ae | ||
|
|
6f3f4ff4a1 | ||
|
|
0571ca54f6 | ||
|
|
d44c4c05aa | ||
|
|
00aa11f620 | ||
|
|
02e08892b3 | ||
|
|
b292cffa4a | ||
|
|
ef949e14bb | ||
|
|
3dafbc4516 | ||
|
|
ee256cd3c6 | ||
|
|
10f00c6e4c | ||
|
|
02e584d932 | ||
|
|
d64e1bcd82 | ||
|
|
60fa76ccc1 | ||
|
|
b51d9bde91 | ||
|
|
290b8b9cc0 | ||
|
|
d0c9b7c9b5 | ||
|
|
10c6f840c4 | ||
|
|
ce7a00d174 | ||
|
|
9189bbb7d3 | ||
|
|
368d2f18f9 | ||
|
|
7171701599 | ||
|
|
c0fc9b7aca | ||
|
|
f59a1db8b7 | ||
|
|
725be222ac | ||
|
|
e044db016d | ||
|
|
b6592c796d | ||
|
|
21fb015069 | ||
|
|
a051c9bbd8 | ||
|
|
48e4373d28 | ||
|
|
fb82d37f09 | ||
|
|
ad3b66a178 | ||
|
|
ece37ea4f5 | ||
|
|
aff9427f64 | ||
|
|
00d435f726 | ||
|
|
74160e3737 | ||
|
|
09ec2db6e9 | ||
|
|
7c3a2435b3 | ||
|
|
083e1db489 | ||
|
|
9dd1d67a39 | ||
|
|
edeeda6fa5 | ||
|
|
fd253d5e91 | ||
|
|
cb41a28391 | ||
|
|
21cb609423 | ||
|
|
111b10cd5b | ||
|
|
1461b09b5f | ||
|
|
c2d2efd37d | ||
|
|
ad67e3bb5d | ||
|
|
23b861b48c | ||
|
|
1cd5d72be4 | ||
|
|
ff2f3c896f | ||
|
|
7cde453e36 | ||
|
|
6da9d43f69 | ||
|
|
d56d690d77 | ||
|
|
d618670fd3 | ||
|
|
d049746585 | ||
|
|
108fa7b97c | ||
|
|
8b107ad67a | ||
|
|
d607b9cb31 | ||
|
|
20fb66fa31 | ||
|
|
b3c5a9c4d6 | ||
|
|
44c74728bc | ||
|
|
1290e8c4cf | ||
|
|
ed12687837 | ||
|
|
7ecc9e0769 | ||
|
|
d890ccc92c | ||
|
|
97057ed7a1 | ||
|
|
5512fb6275 | ||
|
|
28c5326711 | ||
|
|
38a35dfb9a | ||
|
|
562cbef14e | ||
|
|
48a616d8a5 | ||
|
|
d8e2fdf913 | ||
|
|
8894156df5 | ||
|
|
c117c49dc9 | ||
|
|
6fdf1a8f99 | ||
|
|
2dbb4d9890 | ||
|
|
1459207d87 | ||
|
|
fe150c539f | ||
|
|
4a2fe338ef | ||
|
|
6be1ce40bb | ||
|
|
4b3b293ba7 | ||
|
|
502acf97e7 | ||
|
|
3459c75fbc | ||
|
|
1fc75ed494 | ||
|
|
2ed65c8b16 | ||
|
|
c6ef1cab79 | ||
|
|
0a2aede9ab | ||
|
|
6f3e6f368a | ||
|
|
182f2e83f4 | ||
|
|
69f3898a61 | ||
|
|
79bee49078 | ||
|
|
af9f871b93 | ||
|
|
699674a054 | ||
|
|
4dacd7e7a2 | ||
|
|
c2d65f7ad2 | ||
|
|
6348ca8da9 | ||
|
|
1a8002cf65 | ||
|
|
8d510fd82c | ||
|
|
76ea59b40b | ||
|
|
5e14469ea6 | ||
|
|
425540922c | ||
|
|
11fcaa20ae | ||
|
|
daae225887 | ||
|
|
4ed72335bd | ||
|
|
5da05b365f | ||
|
|
3f183cfd06 | ||
|
|
d3144da5eb | ||
|
|
e6b6766c33 | ||
|
|
2b064b272a | ||
|
|
5ec6c81d38 | ||
|
|
e6131bd6a9 | ||
|
|
5f55a9b9e2 | ||
|
|
68f62b1fc3 | ||
|
|
0173d60790 | ||
|
|
39cb13a2e4 | ||
|
|
eb8e94627d | ||
|
|
0eb9c2b576 | ||
|
|
77de0ac342 | ||
|
|
a2774f2cf5 | ||
|
|
118542badd | ||
|
|
d63a94300c | ||
|
|
dcc37451e5 | ||
|
|
fce4ba64a1 | ||
|
|
096546f888 | ||
|
|
94467fdb70 | ||
|
|
141ea81ba5 | ||
|
|
d9c09e1b81 | ||
|
|
a8b03e768c | ||
|
|
0cd53444a4 | ||
|
|
61970e1500 | ||
|
|
1705b5a65e | ||
|
|
c7281ef532 | ||
|
|
7ebea86a44 | ||
|
|
9bc9a5aa00 | ||
|
|
b1bf08c0d9 | ||
|
|
372e7d42f9 | ||
|
|
114f58bb0b | ||
|
|
f7d0158bac | ||
|
|
cc8bae2f2c | ||
|
|
fb581818c5 | ||
|
|
5a16b3eb10 | ||
|
|
61ac03fc08 | ||
|
|
84073b9bf1 | ||
|
|
ecdc695b22 | ||
|
|
809a8f065c | ||
|
|
a8aabd78d2 | ||
|
|
6f44d5b55f | ||
|
|
2c0a6d7f69 | ||
|
|
4c837d94a9 | ||
|
|
599624d962 | ||
|
|
4641a15287 | ||
|
|
792f04881c | ||
|
|
96fe6e4fb6 | ||
|
|
f337401484 | ||
|
|
02d55f24f6 | ||
|
|
11f17ce0cd | ||
|
|
43db682c6d | ||
|
|
2e5b0b7c07 | ||
|
|
ad9a0ac3df | ||
|
|
31b95b665b | ||
|
|
27132e42e9 | ||
|
|
8f8a9d89ef | ||
|
|
f0ce2acc4f | ||
|
|
1c0729df59 | ||
|
|
3f70585463 | ||
|
|
ead5cc741d | ||
|
|
e1524891fc | ||
|
|
e6a23a5932 | ||
|
|
8999a69546 | ||
|
|
dc00b4dd64 | ||
|
|
fa8776cfa2 | ||
|
|
47f7900cd3 | ||
|
|
7d4f58f268 | ||
|
|
6738af0a0c | ||
|
|
335d36211c | ||
|
|
d49c63ec3d | ||
|
|
644118cd17 | ||
|
|
f086b7ff9b | ||
|
|
2a77804739 | ||
|
|
9ab8f4e10e | ||
|
|
2e2fd394bf | ||
|
|
eb7401e526 | ||
|
|
4d05ec0e1e | ||
|
|
1043ffae0a | ||
|
|
c017e46820 | ||
|
|
38ce3f368c | ||
|
|
c4e35050b0 | ||
|
|
57c3c803d9 | ||
|
|
c2e05e2231 | ||
|
|
c9642aae86 | ||
|
|
9ef9f0bf32 | ||
|
|
798b6b202c | ||
|
|
5d58a814aa | ||
|
|
fbb34b3f3a | ||
|
|
91621d7b17 | ||
|
|
14718d8bbf | ||
|
|
06c06456c4 | ||
|
|
e95dfc7247 | ||
|
|
77e4564020 | ||
|
|
af836cda27 | ||
|
|
f0a7baf340 | ||
|
|
ed0f44f841 | ||
|
|
1a1dbdb476 | ||
|
|
259bb81b1e | ||
|
|
f09c74db21 | ||
|
|
42c4d3246c | ||
|
|
739e4e73ef | ||
|
|
1f2541fbec | ||
|
|
95140d9d9f | ||
|
|
b60d77c154 | ||
|
|
e8680760bf | ||
|
|
17d8047a93 | ||
|
|
a745aaf91c | ||
|
|
8acfd8ea02 | ||
|
|
42e00ebb24 | ||
|
|
1ed4142fdd | ||
|
|
558d8182db | ||
|
|
5794d18737 | ||
|
|
6a52717184 | ||
|
|
3fc95779ac | ||
|
|
a85972ffbc | ||
|
|
ee8e2b76e2 | ||
|
|
71758e9186 | ||
|
|
ac05de6835 | ||
|
|
e2d68e6119 | ||
|
|
7dd7020c5f | ||
|
|
788098b55f | ||
|
|
f280b32fa4 | ||
|
|
690a8acb30 | ||
|
|
7d3c248a05 | ||
|
|
94c0104ffa | ||
|
|
e20a8f7534 | ||
|
|
226bf34803 | ||
|
|
607e010874 | ||
|
|
a169ff636b | ||
|
|
463ddf1707 | ||
|
|
0c0655c0f2 | ||
|
|
93849370b1 | ||
|
|
fc7cefcf19 | ||
|
|
d4c6209aa0 | ||
|
|
d6b04c58e8 | ||
|
|
04118d4965 | ||
|
|
5ca952bd9a | ||
|
|
24e1a98275 | ||
|
|
a8c5da0ae0 | ||
|
|
4196c723eb | ||
|
|
11612a24ee | ||
|
|
c54a91eab3 | ||
|
|
dea856b7e3 | ||
|
|
01cbe3d289 | ||
|
|
ce6ca49d21 | ||
|
|
c3b041f7ae | ||
|
|
aaf2684988 | ||
|
|
7fcf408189 | ||
|
|
c325c0e085 | ||
|
|
67be0a85c0 | ||
|
|
3ec910ff85 | ||
|
|
f387d12872 | ||
|
|
1fec6a5556 | ||
|
|
946d2c17c8 | ||
|
|
c513493757 | ||
|
|
869379020d | ||
|
|
f3bb3dcfc2 | ||
|
|
681898ed1e | ||
|
|
a10be8087f | ||
|
|
4d61c04e5c | ||
|
|
07fe1c5659 | ||
|
|
12b4345672 | ||
|
|
6b50bd43f7 | ||
|
|
735e6a8ab3 | ||
|
|
f5508db24f | ||
|
|
04fda2fcbe | ||
|
|
e7b3a1c822 | ||
|
|
ccbcd0a80d | ||
|
|
54b991cfcb | ||
|
|
b719019b26 | ||
|
|
cc81cd3215 | ||
|
|
e8039cd822 | ||
|
|
a3c77e6dc6 | ||
|
|
ea0a514e03 | ||
|
|
193bba77b0 | ||
|
|
f0779f95a3 | ||
|
|
65d947e449 | ||
|
|
cef5b2eb04 | ||
|
|
1c1614d207 | ||
|
|
8490efe0ad | ||
|
|
438cee4e21 | ||
|
|
70a9b286e5 | ||
|
|
81aa19a8f0 | ||
|
|
668645fcda | ||
|
|
14e76108cb | ||
|
|
9ecb4f4ac8 | ||
|
|
9e1ab7c6b6 | ||
|
|
8ffa84f875 | ||
|
|
c8e92feb14 | ||
|
|
c23dd8a951 | ||
|
|
7b2ceba128 | ||
|
|
2570385770 | ||
|
|
aeb2f01a15 | ||
|
|
c59cff396d | ||
|
|
ecc9c88ff8 | ||
|
|
eb01ffd4e6 | ||
|
|
f225b558ec | ||
|
|
688d9c9a82 | ||
|
|
09e8381ec7 | ||
|
|
732a315a4b | ||
|
|
686c1d676d | ||
|
|
048da693c5 | ||
|
|
9a3b949687 | ||
|
|
40186d3813 | ||
|
|
66c986ba13 | ||
|
|
98cfc17843 | ||
|
|
d3a6693eef | ||
|
|
fe98fe8cdc | ||
|
|
eb1a495a7a | ||
|
|
720975dff4 | ||
|
|
4ee0e6996a | ||
|
|
47bdca1041 | ||
|
|
c2ed214a74 | ||
|
|
1a56fbc101 | ||
|
|
98c82242c5 | ||
|
|
e6a6df1052 | ||
|
|
6d034596d3 | ||
|
|
92e69c8197 | ||
|
|
b0753dc93d | ||
|
|
4515dd5c89 | ||
|
|
3a72e05c3e | ||
|
|
796942e6fa | ||
|
|
522257343b | ||
|
|
3594a80e04 | ||
|
|
9cc204e83a | ||
|
|
17ab194031 | ||
|
|
6bf2ad27d1 | ||
|
|
0c47a902f5 | ||
|
|
b340663cdf | ||
|
|
98a275a382 | ||
|
|
11bc35eb6c | ||
|
|
7ef2f7352c | ||
|
|
eb82195ad7 | ||
|
|
93cf02842c | ||
|
|
90ba3fddbc | ||
|
|
bead800c13 | ||
|
|
45d960f4f5 | ||
|
|
ac0963a0a5 | ||
|
|
5cf880e8fc | ||
|
|
86a6cfc1d0 | ||
|
|
33e19d3bec | ||
|
|
eb3277587a | ||
|
|
adef91d82d | ||
|
|
50c505b845 | ||
|
|
cedf4bccf4 | ||
|
|
5e37370618 | ||
|
|
f90688b089 | ||
|
|
0a83f9ab6e | ||
|
|
b79915c0c2 | ||
|
|
a918bda679 | ||
|
|
1ba54e3b65 | ||
|
|
92bb233668 | ||
|
|
420a8c986d | ||
|
|
900e71e5cc | ||
|
|
97971bef0c | ||
|
|
e93c2b88ba | ||
|
|
eb57852f2e | ||
|
|
6457241e66 | ||
|
|
bf245d3b98 | ||
|
|
4e26a1b700 | ||
|
|
2f44b81d4f | ||
|
|
d65e1b30ce | ||
|
|
4a17760a2d | ||
|
|
23c6325cb6 | ||
|
|
0e4af8c057 | ||
|
|
3bddfed542 | ||
|
|
e6d7a493cc | ||
|
|
73c6989951 | ||
|
|
1be05cb03a | ||
|
|
ac4d847eac | ||
|
|
3f641f487d | ||
|
|
1c8517947b | ||
|
|
4cb578bf60 | ||
|
|
6099492579 | ||
|
|
8ff32bfbba | ||
|
|
d4b46b1295 | ||
|
|
f6379dea82 | ||
|
|
784eb2d15b | ||
|
|
36ee8911b4 | ||
|
|
97253354ac | ||
|
|
2f5eb9f6d3 | ||
|
|
07c845be03 | ||
|
|
cb436d5b4a | ||
|
|
2634ae65fd | ||
|
|
922018ac2d | ||
|
|
e783afe87f | ||
|
|
fbcadf3d4d | ||
|
|
1f7b994232 | ||
|
|
c4ea5a3bfd | ||
|
|
651cc04ee1 | ||
|
|
354e535b37 | ||
|
|
df51b41a45 | ||
|
|
29bb129c9b | ||
|
|
1e5476e573 | ||
|
|
480dfc3879 | ||
|
|
90a70945d6 | ||
|
|
659573338c | ||
|
|
5a6b650d8b | ||
|
|
585afa09e5 | ||
|
|
f343e4cb0e | ||
|
|
1b993e167f | ||
|
|
d4dd945828 | ||
|
|
88ffa96263 | ||
|
|
2ae666dc7f | ||
|
|
8677890fc1 | ||
|
|
d9e8376209 | ||
|
|
00677f73ec | ||
|
|
8289108d16 | ||
|
|
711b4508c9 | ||
|
|
5885d47717 | ||
|
|
adcf635c1f | ||
|
|
239ec10edf | ||
|
|
3ae4a7e660 | ||
|
|
673d814a45 | ||
|
|
b7b5c9ad1d | ||
|
|
1c2dbd6a27 | ||
|
|
b1dbd3fcdf | ||
|
|
a78d75f185 | ||
|
|
52691fbb52 | ||
|
|
4ecf30530a | ||
|
|
a3b00fdcd6 | ||
|
|
a7d4a3f922 | ||
|
|
35395d7ed7 | ||
|
|
09ea9c9fd6 | ||
|
|
3e86dfe480 | ||
|
|
fcbdd93043 | ||
|
|
805196a6a0 | ||
|
|
bd34729217 | ||
|
|
6917c161c8 | ||
|
|
156aa4c139 | ||
|
|
ddb9b2fc47 | ||
|
|
00aeef9f1b | ||
|
|
c76c916475 | ||
|
|
9090d8b128 | ||
|
|
5a8351d7ea | ||
|
|
be6ac0408a | ||
|
|
1a07aed6aa | ||
|
|
1cf1dab649 | ||
|
|
f0d3501dbd | ||
|
|
da7a2c0c7f | ||
|
|
0091973bca | ||
|
|
e734bbc0cc | ||
|
|
ce88e95032 | ||
|
|
e87e332d2f | ||
|
|
2e7609156a | ||
|
|
06313e0ec3 | ||
|
|
08d78e6be5 | ||
|
|
11964a8ce8 | ||
|
|
7d0aa7a336 | ||
|
|
39962623cc | ||
|
|
5cde87ce80 | ||
|
|
e918fbf9f2 | ||
|
|
e87ac449e6 | ||
|
|
6931f87fcd | ||
|
|
98fc43d859 | ||
|
|
624aecf72e | ||
|
|
6ab22c6e92 | ||
|
|
2b8403c7f7 | ||
|
|
5376743281 | ||
|
|
215abab544 | ||
|
|
bc99897fbb | ||
|
|
3b2fe1d8f6 | ||
|
|
aa31957d84 | ||
|
|
5a773ed62a | ||
|
|
47c17305cf | ||
|
|
c39a5e0de8 | ||
|
|
d87748c5dd | ||
|
|
63d3ac6679 | ||
|
|
4581f10207 | ||
|
|
da02236b3a | ||
|
|
37e12045fb | ||
|
|
9552fc0724 | ||
|
|
0f56c11101 | ||
|
|
6d59ef49f7 | ||
|
|
741dfaa2ea | ||
|
|
5fa8bd3c78 | ||
|
|
86331022eb | ||
|
|
bdfcc615ea | ||
|
|
05eac8631b | ||
|
|
7b537fd3b6 | ||
|
|
1075b76612 | ||
|
|
66faf8b4e4 | ||
|
|
8f17468fa3 | ||
|
|
7726e87cd7 | ||
|
|
34b412bdf8 | ||
|
|
8190869714 | ||
|
|
84134678dc | ||
|
|
9389b6e3ef | ||
|
|
9053eed4b4 | ||
|
|
b6b8855728 | ||
|
|
2c043f67d0 | ||
|
|
1ed438dcdb | ||
|
|
db6f526b78 | ||
|
|
07c818833a | ||
|
|
f86de2be78 | ||
|
|
5f859e4885 | ||
|
|
cc53fa4c14 | ||
|
|
a401eb5a3b | ||
|
|
69c5e0aae7 | ||
|
|
d3dc5e0df1 | ||
|
|
f98a8cc22f | ||
|
|
de4d66c56f | ||
|
|
f011b3cb22 | ||
|
|
8347c766f0 | ||
|
|
0a66f17897 | ||
|
|
f0b49995e5 | ||
|
|
21313e52b4 | ||
|
|
1303b07b72 | ||
|
|
7d37f7b634 | ||
|
|
9b74e60185 | ||
|
|
96597d3716 | ||
|
|
7e63cafc85 | ||
|
|
ee9d1ad26f | ||
|
|
277fd2f8eb | ||
|
|
f130bfd25c | ||
|
|
4562304a29 | ||
|
|
3972d740a6 | ||
|
|
7fbe2eba59 | ||
|
|
91d4ae46f6 | ||
|
|
79ebf07882 | ||
|
|
18a50aa679 | ||
|
|
4d6a1d4cc0 | ||
|
|
a8a2cf9bdb | ||
|
|
785b2f5d24 | ||
|
|
c7ed130cce | ||
|
|
bf8a16b0e1 | ||
|
|
546f9cb409 | ||
|
|
77113fbffd | ||
|
|
260b1ad887 | ||
|
|
97cc5b7f48 | ||
|
|
774e907ecf | ||
|
|
27d957f0cb | ||
|
|
6878f83677 | ||
|
|
c75969d7c8 | ||
|
|
6500a98747 | ||
|
|
6c93793750 | ||
|
|
e9058756b5 | ||
|
|
15299571f9 | ||
|
|
b2368e243c | ||
|
|
0d944ac87e | ||
|
|
730e0f7098 | ||
|
|
6e2d59e279 | ||
|
|
07049d6c76 | ||
|
|
eb97ef149a | ||
|
|
2198d8ba51 | ||
|
|
cd929b17a7 | ||
|
|
50e3b68586 | ||
|
|
6c0d5a3334 | ||
|
|
4fa7f98074 | ||
|
|
a7c2a7d4f9 | ||
|
|
62deb82bf8 | ||
|
|
10798cf2ae | ||
|
|
ac89a7d264 | ||
|
|
c429e930b2 | ||
|
|
2f01af0595 | ||
|
|
25ea3e4dc1 | ||
|
|
478f4687b1 | ||
|
|
89169d5506 | ||
|
|
fd429e4fda | ||
|
|
034e6b9a0b | ||
|
|
a784434e1f | ||
|
|
be7f45b3bf | ||
|
|
916ccca3a1 | ||
|
|
c7bf732fdf | ||
|
|
772d240782 | ||
|
|
155bc0534a | ||
|
|
c4c30a416a | ||
|
|
323587f10f | ||
|
|
298b4b8b5b | ||
|
|
aa90a48872 | ||
|
|
71481150c7 | ||
|
|
b2f6043181 | ||
|
|
387c3b317c | ||
|
|
07c281738a | ||
|
|
4ef38e1615 | ||
|
|
904a575f93 | ||
|
|
92a232b59b | ||
|
|
2621863d08 | ||
|
|
e34f0cc250 | ||
|
|
5cfd773ec9 | ||
|
|
c3165df156 | ||
|
|
2f1008e2c3 | ||
|
|
2d8da306a1 | ||
|
|
1672ec80d0 | ||
|
|
6c7e7325ec | ||
|
|
a69c4527a1 | ||
|
|
34a588511f | ||
|
|
a615df9282 | ||
|
|
949984db18 | ||
|
|
07c6f1714a | ||
|
|
c668165ca7 | ||
|
|
9f933b500b | ||
|
|
136f2d9549 | ||
|
|
799e476b48 | ||
|
|
952490a1b8 | ||
|
|
bfb834fb24 | ||
|
|
7faac4c241 | ||
|
|
040605a83c | ||
|
|
f936269a1e | ||
|
|
7bf64bc490 | ||
|
|
5d51f8c7a7 | ||
|
|
f802164cce | ||
|
|
e70936e654 | ||
|
|
15816eb07e | ||
|
|
ca6b3dfa1c | ||
|
|
bdeaf7e88c | ||
|
|
6198561d70 | ||
|
|
5d1cda9869 | ||
|
|
d4b8a0f2eb | ||
|
|
d77789d8fe | ||
|
|
4a4da858cf | ||
|
|
d666fc3f8f | ||
|
|
473f1cb4d2 | ||
|
|
6d51987e67 | ||
|
|
59b989d243 | ||
|
|
63ecf009ec | ||
|
|
961116f4d5 | ||
|
|
37f3c0926c | ||
|
|
9a524fb838 | ||
|
|
b5dc78b06e | ||
|
|
b506c92d21 | ||
|
|
cdb50beaa2 | ||
|
|
20845eb117 | ||
|
|
ddfe782151 | ||
|
|
c06ec92d0d | ||
|
|
01432670fd | ||
|
|
7bc1a41814 | ||
|
|
b19937c4dc | ||
|
|
eef436b889 | ||
|
|
e618b1e14b | ||
|
|
ea1473cbe8 | ||
|
|
be21d190e2 | ||
|
|
189459f2b9 | ||
|
|
98c56c214a | ||
|
|
2951cad365 | ||
|
|
9a135c48d9 | ||
|
|
02dc2f460e | ||
|
|
ea94658411 | ||
|
|
d10c0d9545 | ||
|
|
8766e44b95 | ||
|
|
5012ba34b4 | ||
|
|
7022163e87 | ||
|
|
90cc7b25d4 | ||
|
|
040ef0bc49 | ||
|
|
7642bdd73b | ||
|
|
65768c20ae | ||
|
|
a448ccf20c | ||
|
|
d5b7c51e40 | ||
|
|
2d2d14744b | ||
|
|
f61f1a2020 | ||
|
|
31ee2951ce | ||
|
|
d46a54348a | ||
|
|
4a53ed1201 | ||
|
|
d2e57cfcac | ||
|
|
e172d7f1a9 | ||
|
|
3bc4788acb | ||
|
|
4faff1a63c | ||
|
|
4c330bc38b | ||
|
|
67d1985550 | ||
|
|
b94b89ba68 | ||
|
|
bec6662338 | ||
|
|
42c148bf75 | ||
|
|
d6dfbcd743 | ||
|
|
be813ea0a2 | ||
|
|
c751e44c6c | ||
|
|
eaa483d6e4 | ||
|
|
672d43a6cf | ||
|
|
9b19f0aaba | ||
|
|
218d2892e8 | ||
|
|
35a4d1b3a2 | ||
|
|
f6fa63bdef | ||
|
|
f3ff1fcbeb | ||
|
|
935800d7f6 | ||
|
|
726fc93634 | ||
|
|
8b972f2ed6 | ||
|
|
ef9e212eec | ||
|
|
18d6986c22 | ||
|
|
19f73b2ede | ||
|
|
b4102a4510 | ||
|
|
e192e8ea9e | ||
|
|
18f90e6339 | ||
|
|
406aad78fe | ||
|
|
280f3515b5 | ||
|
|
522daa26a6 | ||
|
|
bb46b561fd | ||
|
|
828a5d45cd | ||
|
|
0f7ac1cc90 | ||
|
|
019ff4709c | ||
|
|
3fd9d5a025 | ||
|
|
924858509d | ||
|
|
6f200d310f | ||
|
|
321b087039 | ||
|
|
357f115f11 | ||
|
|
5531586c35 | ||
|
|
40da411fa5 | ||
|
|
676c367db1 | ||
|
|
5722d17924 | ||
|
|
97298eb112 | ||
|
|
b4df3663a9 | ||
|
|
86567ba96f | ||
|
|
0cf7043f5f | ||
|
|
0df23312ac | ||
|
|
e8fc32a7dc | ||
|
|
e4ee1692fb | ||
|
|
4bc8c79bd3 | ||
|
|
915dc4be7f | ||
|
|
cf2116e167 | ||
|
|
64762c5acd | ||
|
|
4f9b8ebc73 | ||
|
|
ee61a265f4 | ||
|
|
db4abfe198 | ||
|
|
4b7dbbf43b | ||
|
|
db13639460 | ||
|
|
1cdb4c21c2 | ||
|
|
008d9371b1 | ||
|
|
6640dd0a6c | ||
|
|
589538bf39 | ||
|
|
8d4dd13750 | ||
|
|
754a3208f2 | ||
|
|
4d278a654e | ||
|
|
9eaec6d58a | ||
|
|
616cd3316c | ||
|
|
4579d339ea | ||
|
|
36f584d341 | ||
|
|
6387bbf193 | ||
|
|
de177e7529 | ||
|
|
f17fdfdbef | ||
|
|
45d6bf196a | ||
|
|
5e2fff91f8 | ||
|
|
e13f8dd359 | ||
|
|
0897f3c1c4 | ||
|
|
a82368956e | ||
|
|
6841d8ba9a | ||
|
|
927da8e861 | ||
|
|
3c220a2813 | ||
|
|
5a01819fdc | ||
|
|
4977933d81 | ||
|
|
953e467a85 | ||
|
|
131ab07c2b | ||
|
|
1d94667a15 | ||
|
|
4421fb7e19 | ||
|
|
131ff50333 | ||
|
|
bc8b5b3896 | ||
|
|
a2d12517e7 | ||
|
|
95d6647dce | ||
|
|
3454f51d2c | ||
|
|
6b65f6d9f4 | ||
|
|
c93e71698e | ||
|
|
f9d2ede83c | ||
|
|
53bf4573f0 | ||
|
|
86652738c0 | ||
|
|
294fb039fe | ||
|
|
f12826bac5 | ||
|
|
2c8afde6d9 | ||
|
|
1445202e0e | ||
|
|
1b940fd41e | ||
|
|
f1fc3bdfba | ||
|
|
7aa37b19a9 | ||
|
|
967a49dd66 | ||
|
|
25df23fed3 | ||
|
|
e162d5a99d | ||
|
|
918ca339b6 | ||
|
|
8bb8f0eda4 | ||
|
|
be2cc8f946 | ||
|
|
599ada8354 | ||
|
|
83b9cc5c0a | ||
|
|
af75afeb7a | ||
|
|
42e181112a | ||
|
|
801f78f8a8 | ||
|
|
e100040f28 | ||
|
|
b8a39a1b26 | ||
|
|
8f768633ad | ||
|
|
65ea6fd48a | ||
|
|
d2c7b356cc | ||
|
|
af58955140 | ||
|
|
ffc9a33933 | ||
|
|
fbab9874f6 | ||
|
|
017e7890f7 | ||
|
|
48644813d4 | ||
|
|
c81821ed28 | ||
|
|
42cfe97427 | ||
|
|
09a2c12ea0 | ||
|
|
a0f6f264f6 | ||
|
|
6f9cea5b58 | ||
|
|
dd4ac42491 | ||
|
|
01df6ed4a9 | ||
|
|
e71259006c | ||
|
|
0f161b500f | ||
|
|
e442139c39 | ||
|
|
8b0f871c06 | ||
|
|
61fab0340c | ||
|
|
525eacd035 | ||
|
|
cddddfd255 | ||
|
|
780e9f31fe | ||
|
|
c0b54aa58c | ||
|
|
c0c1cc1ba7 | ||
|
|
dededd1929 | ||
|
|
6cd8a8f895 | ||
|
|
d3ade0654e | ||
|
|
2dd7128db5 | ||
|
|
1f13a236bf | ||
|
|
ca1dbc3d3b | ||
|
|
74db8cbab3 | ||
|
|
62bc39e600 | ||
|
|
268be7f0b5 | ||
|
|
55bf0d23c2 | ||
|
|
f433aa3ad5 | ||
|
|
f587e0a459 | ||
|
|
76bfcc29c2 | ||
|
|
1d91a626f2 | ||
|
|
dbde936c3c | ||
|
|
cf679187b1 | ||
|
|
fd17ad236a | ||
|
|
dbb96c1885 | ||
|
|
4cd7e10ad3 | ||
|
|
3fd76d59ea | ||
|
|
f445a470df | ||
|
|
4e5299a9bf | ||
|
|
a6afef9f3f | ||
|
|
6a1fb8ea31 | ||
|
|
f2c66dc4c3 | ||
|
|
a91c8e15e2 | ||
|
|
9c7a842163 | ||
|
|
3dd6173a65 | ||
|
|
e9bc2b7b54 | ||
|
|
38947ab71b | ||
|
|
8a7801264a | ||
|
|
66edc180be | ||
|
|
17809992d7 | ||
|
|
c10033211b | ||
|
|
7d4ea1b6f0 | ||
|
|
0dfe823c32 | ||
|
|
bef275f62c | ||
|
|
edb4c57e3d | ||
|
|
c1b3face8f | ||
|
|
9d23c10475 | ||
|
|
78d509dba5 | ||
|
|
1a37135f98 | ||
|
|
dbd0581cb3 | ||
|
|
946530019a | ||
|
|
8f6f6d10e7 | ||
|
|
3a549e5c2f | ||
|
|
dc7015c5f2 | ||
|
|
356e9c6810 | ||
|
|
d0ddec469a | ||
|
|
87de3a2d06 | ||
|
|
eb8e1a2160 | ||
|
|
ce5f8cd46f | ||
|
|
b2f62d51b0 | ||
|
|
b3c68af40a | ||
|
|
348100ba42 | ||
|
|
426afc7377 | ||
|
|
b04caabf39 | ||
|
|
32cfdd52d3 | ||
|
|
015af03bdc | ||
|
|
99946ae0e6 | ||
|
|
a910bfb539 | ||
|
|
cb7cbec0d5 | ||
|
|
cb6614da42 | ||
|
|
6fae459847 | ||
|
|
bc7c54e5c8 | ||
|
|
ab835d8086 | ||
|
|
0f7050d3aa | ||
|
|
2cef101022 | ||
|
|
0a069f7de2 | ||
|
|
dcf9c280ee | ||
|
|
a2fd124997 | ||
|
|
63917f8cc2 | ||
|
|
e7ddb9e642 | ||
|
|
ec17082864 | ||
|
|
9ce6fbe1fa | ||
|
|
8db3d25844 | ||
|
|
f9ec1a0097 | ||
|
|
61c33969a2 | ||
|
|
85eb4cf0d6 | ||
|
|
47cc470bf6 | ||
|
|
60881499dc | ||
|
|
f19cf9274e | ||
|
|
ed9bca0e12 | ||
|
|
f15cdc03e3 | ||
|
|
b31fc6f66d | ||
|
|
8baef6daa3 | ||
|
|
ac700d4860 | ||
|
|
b2baa35c3d | ||
|
|
b50d3944ea | ||
|
|
f115a32073 | ||
|
|
57aefdf830 | ||
|
|
0d48ace15e | ||
|
|
362ee06b9f | ||
|
|
9004f090c5 | ||
|
|
6948120094 | ||
|
|
6585a925be | ||
|
|
e682b19eda | ||
|
|
8ee3178166 | ||
|
|
09a6e37154 | ||
|
|
1b8e745ffe | ||
|
|
6a1952d1f9 | ||
|
|
9ba7b96825 | ||
|
|
ef5d2dc043 | ||
|
|
1f0f852fda | ||
|
|
a166eb7ea1 | ||
|
|
02681d531e | ||
|
|
641d882ea6 | ||
|
|
1e80b3b0d7 | ||
|
|
ff98271a43 | ||
|
|
500ad7fb51 | ||
|
|
4f486333ed | ||
|
|
9a677b62ab | ||
|
|
68ced6ce46 | ||
|
|
1bd5360d3b | ||
|
|
a7aa1ac1cf | ||
|
|
ae23320417 | ||
|
|
b68a751f4e | ||
|
|
8391d05697 | ||
|
|
63a3214cc6 | ||
|
|
4382902894 | ||
|
|
103ef25f12 | ||
|
|
84a7a5d1cb | ||
|
|
ac095dbf3e | ||
|
|
a508bd4290 | ||
|
|
c1de753db6 | ||
|
|
621679245a | ||
|
|
58aea1b61c | ||
|
|
9b5ee1b31b | ||
|
|
383e804ec1 | ||
|
|
34f7986e19 | ||
|
|
c5fc47cc19 | ||
|
|
814ee67519 | ||
|
|
43761173ec | ||
|
|
d2b89e0e37 | ||
|
|
c4ad8f6ed4 | ||
|
|
4d289b16c2 | ||
|
|
e6c8765891 | ||
|
|
f89bad1e94 | ||
|
|
ade8751442 | ||
|
|
f97a2d68c8 | ||
|
|
899f85ce9c | ||
|
|
78a05777bc | ||
|
|
e436e33771 | ||
|
|
faa69dc71f | ||
|
|
d72590ede6 | ||
|
|
c378429ffb | ||
|
|
ebe1831e71 | ||
|
|
8cdc70a5b9 | ||
|
|
6244902931 | ||
|
|
1bd5b704c6 | ||
|
|
2117002c01 | ||
|
|
0bd519da49 | ||
|
|
2e724b095e | ||
|
|
1710b9171f | ||
|
|
599153d86b | ||
|
|
fb1a2a0a40 | ||
|
|
7e9b4c0924 | ||
|
|
96112f9d45 | ||
|
|
496845df60 | ||
|
|
8808a8cc9c | ||
|
|
ba4977cd84 | ||
|
|
e751977b72 | ||
|
|
5409fc2e19 | ||
|
|
a4d6240ab4 | ||
|
|
78a26c8743 | ||
|
|
a45e147d38 | ||
|
|
477a688c68 | ||
|
|
d129e33b51 | ||
|
|
17b54cb0c8 | ||
|
|
99df2d9dbf | ||
|
|
8297edd251 | ||
|
|
b43861b467 | ||
|
|
92773ada6d | ||
|
|
e3db1091a8 | ||
|
|
7abbe97ee9 | ||
|
|
c7606bb93b | ||
|
|
0d9594354a | ||
|
|
c18fa15db1 | ||
|
|
44912e6b1e | ||
|
|
5d56d29240 | ||
|
|
0439605e94 | ||
|
|
1485419414 | ||
|
|
3da152a150 | ||
|
|
4222605f87 | ||
|
|
1b196520f6 | ||
|
|
10223cfac3 | ||
|
|
8b0fc558cb | ||
|
|
8a6789ef61 | ||
|
|
57f019a6e0 | ||
|
|
32c77be2f3 | ||
|
|
ac79f3f345 | ||
|
|
d45c5767d8 | ||
|
|
2741e3c1d0 | ||
|
|
dc8895352a | ||
|
|
959a741c14 | ||
|
|
af95188f29 | ||
|
|
c7b4164122 | ||
|
|
4a908cf38d | ||
|
|
b2cf0209b1 | ||
|
|
2d6f7a7c93 | ||
|
|
660f14245b | ||
|
|
49cefc2e97 | ||
|
|
7942bdb728 | ||
|
|
758efebb3c | ||
|
|
efad72fef4 | ||
|
|
3b4cc90800 | ||
|
|
8a39af8f72 | ||
|
|
ecb1174a18 | ||
|
|
39c2274f1a | ||
|
|
570246a016 | ||
|
|
8e71dbd6c1 | ||
|
|
da52ae844f | ||
|
|
396aaae098 | ||
|
|
5855ae7460 | ||
|
|
2a83cefd5b | ||
|
|
dfc723bc19 | ||
|
|
23cb59427d | ||
|
|
1c858c34f7 | ||
|
|
9f7a4aa867 | ||
|
|
4285b5a89e | ||
|
|
a80696f98f | ||
|
|
af5da885a5 | ||
|
|
349062d89c | ||
|
|
5a9b3b3abb | ||
|
|
ea50609829 | ||
|
|
019043f55e | ||
|
|
6319a84ded | ||
|
|
8eeb446f74 | ||
|
|
1913199a45 | ||
|
|
4b26b8b430 | ||
|
|
9e0e9dbecc | ||
|
|
87cecddabb | ||
|
|
ddecdeb834 | ||
|
|
5cabf0bef0 | ||
|
|
0647c02561 | ||
|
|
491c58aef3 | ||
|
|
28737f7ab4 | ||
|
|
0ed89e93fa | ||
|
|
0663cb41ff | ||
|
|
b2678b4338 | ||
|
|
d68dbbc7bc | ||
|
|
0c274212c2 | ||
|
|
dbef4fd7d7 | ||
|
|
43245bbc11 | ||
|
|
971d5f8d12 | ||
|
|
6a21218c13 | ||
|
|
bb8f7d4e3f | ||
|
|
67362935f4 | ||
|
|
3b8c7c9126 | ||
|
|
5f20dd01e8 | ||
|
|
a5782dfb2d | ||
|
|
1c527ae34c | ||
|
|
f45f6cb32a | ||
|
|
00cd90c6b0 | ||
|
|
406e2eb8d0 | ||
|
|
3cf63362a4 | ||
|
|
e4b1f58595 | ||
|
|
4b1a0b4bc4 | ||
|
|
922edb1128 | ||
|
|
0672289cb9 | ||
|
|
ff547fc6cb | ||
|
|
fa78d548cc | ||
|
|
191d9dede5 | ||
|
|
6e0b9ddc74 | ||
|
|
eef4e11768 | ||
|
|
6fd21d988d | ||
|
|
2332490481 | ||
|
|
e2a91e6de5 | ||
|
|
b258027061 | ||
|
|
97068765e8 | ||
|
|
ec5cbb8117 | ||
|
|
ce99b17616 | ||
|
|
06aa2067d9 | ||
|
|
36886971e3 | ||
|
|
9861375f0c | ||
|
|
ed825b3773 | ||
|
|
a9913c8337 | ||
|
|
a97eb7b7cb | ||
|
|
715b828266 | ||
|
|
40af8d6ed5 | ||
|
|
059d80cc11 | ||
|
|
7364e06387 | ||
|
|
efc20c2110 | ||
|
|
19379db3b6 | ||
|
|
9cf8e8cbf3 | ||
|
|
7a32699573 | ||
|
|
320c41ffcf | ||
|
|
9c79d4d182 | ||
|
|
582cfe55b6 | ||
|
|
8db378b265 | ||
|
|
71a7564317 | ||
|
|
c14b035a46 | ||
|
|
cf2eaa0014 | ||
|
|
cb92d54808 | ||
|
|
97d430d5cd | ||
|
|
320b1700ff | ||
|
|
e06267ef1b | ||
|
|
501a23ad20 | ||
|
|
c1cc80b1d5 | ||
|
|
28080b0c22 | ||
|
|
be3a40e70b | ||
|
|
5d8ebf3ca1 | ||
|
|
443987f536 | ||
|
|
f6ce969d9f | ||
|
|
f620cdbaa1 | ||
|
|
090dec8549 | ||
|
|
3f2217646e | ||
|
|
611477e214 | ||
|
|
9bb5c314cd | ||
|
|
f31a31478b | ||
|
|
5fb30939be | ||
|
|
60b413a9cb | ||
|
|
3e9d784013 | ||
|
|
0452b77169 | ||
|
|
10b8c481f5 | ||
|
|
502f8fd76b | ||
|
|
2b2905b567 | ||
|
|
e7f067d70c | ||
|
|
d976da7559 | ||
|
|
84dbd66d10 | ||
|
|
6be3c24ee5 | ||
|
|
42f31aed69 | ||
|
|
ed017c42f1 | ||
|
|
4766467271 | ||
|
|
ea8591a85a | ||
|
|
7ab4c5391c | ||
|
|
0c5742b6f8 | ||
|
|
1d76f74b16 | ||
|
|
5e5d42b918 | ||
|
|
cd9afe946c | ||
|
|
1276ea9844 | ||
|
|
0755e4f8ff | ||
|
|
ccdbddd388 | ||
|
|
5b20b06bd9 | ||
|
|
dff7735af9 | ||
|
|
fb34fc5a85 | ||
|
|
43423c276f | ||
|
|
5ffc3a8f4c | ||
|
|
3c06924a02 | ||
|
|
a174a90f86 | ||
|
|
4f48d3258a | ||
|
|
d9c38b5c1f | ||
|
|
d3c567503b | ||
|
|
d7562d3836 | ||
|
|
220f0b0b40 | ||
|
|
48ff03112f | ||
|
|
ab3b633733 | ||
|
|
fa93cb7d0b | ||
|
|
153fbc3d7d | ||
|
|
307abc8db7 | ||
|
|
af61c9bae3 | ||
|
|
67b549a937 | ||
|
|
b6df447b55 | ||
|
|
2d063925a1 | ||
|
|
bba84f247c | ||
|
|
780b0dfe47 | ||
|
|
04d61afa23 | ||
|
|
663ebf7857 | ||
|
|
53414f12e6 | ||
|
|
15a3ef370a | ||
|
|
c14659c675 | ||
|
|
f7f281a256 | ||
|
|
9ba49eabb2 | ||
|
|
e7abf3f2ea | ||
|
|
83e1630fbc | ||
|
|
0277ba1aaa | ||
|
|
753c001e69 | ||
|
|
10c0b42d0d | ||
|
|
564e61c828 | ||
|
|
946c39a5df | ||
|
|
2948e84846 | ||
|
|
068fd8098c | ||
|
|
d7b0c5794e | ||
|
|
b007bba59f | ||
|
|
abf43ad01d | ||
|
|
922895de69 | ||
|
|
28f0bce9f2 | ||
|
|
0f82f216a2 | ||
|
|
7454b1399c | ||
|
|
4ebf46bd63 | ||
|
|
f1cce0ef5f | ||
|
|
8c9e873c10 | ||
|
|
c85439e7bb | ||
|
|
fd7f87b55e | ||
|
|
8be4128c5a | ||
|
|
806e37338c | ||
|
|
ec1095624a | ||
|
|
a23d69ebe8 | ||
|
|
0aff61ffc6 | ||
|
|
05aa540984 | ||
|
|
033e83e490 | ||
|
|
594485c38c | ||
|
|
d52e2d5a8d | ||
|
|
1e5d852e2f | ||
|
|
cc32d913a0 | ||
|
|
fc66066d4d | ||
|
|
6169338815 | ||
|
|
86ee8db778 | ||
|
|
6bc8cb1ff1 | ||
|
|
0fc49b1c37 | ||
|
|
9fb981e9a0 | ||
|
|
cba1b3cedd | ||
|
|
12c4512932 | ||
|
|
f2452f040d | ||
|
|
0dd1dbb568 | ||
|
|
fdcec5a219 | ||
|
|
bebab7ab0d | ||
|
|
fb771b6aa3 | ||
|
|
8156559475 | ||
|
|
9f5e51cd01 | ||
|
|
27daab2f1b | ||
|
|
c4d404b15f | ||
|
|
95fcdc36ee | ||
|
|
2fdaba53c1 | ||
|
|
5c89080469 | ||
|
|
d92f9df17c | ||
|
|
f551390420 | ||
|
|
8642b4d89f | ||
|
|
6fb70c307d | ||
|
|
d08346fbcf | ||
|
|
141d240a91 | ||
|
|
cf9ceb6bf9 | ||
|
|
7589ae0de5 | ||
|
|
f46e5b37e9 | ||
|
|
560acd5017 | ||
|
|
2267f278d2 | ||
|
|
0feeef585c | ||
|
|
6211966c55 | ||
|
|
92f591b4bd | ||
|
|
29ceb42b7b | ||
|
|
adaabe5993 | ||
|
|
6c392ee4a1 | ||
|
|
7699eda5ba | ||
|
|
d8b5fd5409 | ||
|
|
b37ffdbe85 | ||
|
|
481bcc732b | ||
|
|
ce175aee4c | ||
|
|
50896b373b | ||
|
|
1a40f936df | ||
|
|
1024ba9b0f | ||
|
|
1a7ac8b804 | ||
|
|
7bedb4a081 | ||
|
|
630215f56f | ||
|
|
6f0e5fd402 | ||
|
|
66ec43739a | ||
|
|
44f9d1ed78 | ||
|
|
c6d479b8ad | ||
|
|
80e2f4e342 | ||
|
|
4b388edca9 | ||
|
|
5362dade37 | ||
|
|
3d24265d50 | ||
|
|
7057fc2930 | ||
|
|
e661ee95ff | ||
|
|
333b89fa07 | ||
|
|
04e888548e | ||
|
|
403d9e1059 | ||
|
|
4ea02c59d8 | ||
|
|
de7ba7a55b | ||
|
|
23ba61e76f | ||
|
|
c0aa7e0314 | ||
|
|
9f44e597d6 | ||
|
|
60c5bef90f | ||
|
|
a38fcf1127 | ||
|
|
f22e237381 | ||
|
|
b6b9daa3c5 | ||
|
|
d958b0b9d6 | ||
|
|
7b2eaf63af | ||
|
|
aef20b536a | ||
|
|
1a34b1410f | ||
|
|
5af2f80bc5 | ||
|
|
4d241736f0 | ||
|
|
a47460b4c3 | ||
|
|
32be338f60 | ||
|
|
549655bff4 | ||
|
|
3e18cec691 | ||
|
|
658dd3486b | ||
|
|
018e9a12a3 | ||
|
|
2027a6ac12 | ||
|
|
26bec62daf | ||
|
|
7497e86902 | ||
|
|
e084f1c311 | ||
|
|
95950885cf | ||
|
|
9cd84aeea9 | ||
|
|
d324ec247e | ||
|
|
cbb0d6ce06 | ||
|
|
d36ab4cc3c | ||
|
|
1069a3c77e | ||
|
|
36da23c9c5 | ||
|
|
e756daa261 | ||
|
|
65ac336211 | ||
|
|
14fe987956 | ||
|
|
c5acf239f2 | ||
|
|
a02500b112 | ||
|
|
8fea85a85c | ||
|
|
6f42bfc640 | ||
|
|
6b8741dbd7 | ||
|
|
ce5d4b6ccc | ||
|
|
eb2e5f378c | ||
|
|
9251ea2b22 | ||
|
|
cb650d6100 | ||
|
|
3f3d9219ea | ||
|
|
11528b0def | ||
|
|
a5f0e713d6 | ||
|
|
de3a864bd6 | ||
|
|
af551b3c09 | ||
|
|
5d64d23b61 | ||
|
|
269a00f9ec | ||
|
|
29c482789f | ||
|
|
a0462fe1ee | ||
|
|
78a840f48d | ||
|
|
7371d82bdf | ||
|
|
4c35d9456a | ||
|
|
0704081d91 | ||
|
|
5898532605 | ||
|
|
603abf70dc | ||
|
|
0a3822f2e5 | ||
|
|
f76eb2b7f5 | ||
|
|
08bbf5f5ef | ||
|
|
4ea08116b8 | ||
|
|
78e03c6402 | ||
|
|
4ab89de343 | ||
|
|
6db460fb81 | ||
|
|
be859df51e | ||
|
|
3c533f2ba4 | ||
|
|
9cf037c90f | ||
|
|
9e0425e824 | ||
|
|
2960479095 | ||
|
|
baf1ff033a | ||
|
|
52dcbf087a | ||
|
|
d1c6331924 | ||
|
|
0af2a13349 | ||
|
|
7f0c92eb4d | ||
|
|
0f86255279 | ||
|
|
b2517c8a18 | ||
|
|
95d0c5e67b | ||
|
|
0885b2bf23 | ||
|
|
706337bb5a | ||
|
|
3f8a678c5a | ||
|
|
0f631ad49b | ||
|
|
5bc3b4f768 | ||
|
|
e8bd464ab2 | ||
|
|
ef1af547e2 | ||
|
|
f2dcad27bb | ||
|
|
01992006b2 | ||
|
|
487820fb4d | ||
|
|
d760d67598 | ||
|
|
3cb827ac56 | ||
|
|
15c5dfc12b | ||
|
|
524939dc5b | ||
|
|
51fdff208e | ||
|
|
65a309648b | ||
|
|
7d08eeb8dd | ||
|
|
9e0428ba0d | ||
|
|
15461a2460 | ||
|
|
88f21b5c57 | ||
|
|
bee3029764 | ||
|
|
150d6d1f56 | ||
|
|
d03e4ac100 | ||
|
|
8d8d9c63fe | ||
|
|
52147ce631 | ||
|
|
5af41df8a5 | ||
|
|
775ecd6dfe | ||
|
|
b139235c62 | ||
|
|
8f2c910600 | ||
|
|
7e67f01d4b | ||
|
|
ad7e800446 | ||
|
|
6326924de7 | ||
|
|
801a3a6ff5 | ||
|
|
a4e94a26ba | ||
|
|
44e6be7914 | ||
|
|
3aaf2ef2d4 | ||
|
|
8f902fde9c | ||
|
|
b6023c517e | ||
|
|
42d77e9191 | ||
|
|
312a7582df | ||
|
|
dc939eba78 | ||
|
|
f8bec51de2 | ||
|
|
0bf1320a32 | ||
|
|
81dbd504aa | ||
|
|
63dd7d9859 | ||
|
|
2063d34f3e | ||
|
|
83fdc2e5ad | ||
|
|
6ba7368ab0 | ||
|
|
c2805942a9 | ||
|
|
9892c8bf9a | ||
|
|
23e5877509 | ||
|
|
8cbfde6092 | ||
|
|
24087ff3cc | ||
|
|
6827001c1d | ||
|
|
16b0806d40 | ||
|
|
2129b1e27d | ||
|
|
a267762f59 | ||
|
|
65ca795030 | ||
|
|
e82b649ec0 | ||
|
|
275cdb1713 | ||
|
|
d3b86dcc90 | ||
|
|
c736b75075 | ||
|
|
b601331362 | ||
|
|
32d44a5b9e | ||
|
|
810784da1f | ||
|
|
d517b37f3f | ||
|
|
adeef0af01 | ||
|
|
97ddc1ed10 | ||
|
|
bf580648a1 | ||
|
|
ecc54fa0eb | ||
|
|
04d32ae3e6 | ||
|
|
a18b3ae88e | ||
|
|
e57801a5d1 | ||
|
|
da4390aede | ||
|
|
9e85667219 | ||
|
|
b80867d473 | ||
|
|
d742dcce59 | ||
|
|
7a7af3d5f9 | ||
|
|
e323b1d0ad | ||
|
|
3c18c7a713 | ||
|
|
7c16292cb7 | ||
|
|
d665e2e85b | ||
|
|
172a189c6f | ||
|
|
406fbab40e | ||
|
|
09dc217f8c | ||
|
|
9002837750 | ||
|
|
411d5b44ef | ||
|
|
360cc8044e | ||
|
|
ec2e9b5e79 | ||
|
|
881dba61e4 | ||
|
|
6412876f64 | ||
|
|
538d51cbfe | ||
|
|
3dd9ff3d84 | ||
|
|
7f386923b0 | ||
|
|
d2312b1fbd | ||
|
|
6655373ac3 | ||
|
|
d492af7bc0 | ||
|
|
230a7b7374 | ||
|
|
4204a752f7 | ||
|
|
0e88d5f97f | ||
|
|
a13e7f2435 | ||
|
|
be2108260e | ||
|
|
59b0a2b208 | ||
|
|
05a5a42a08 | ||
|
|
f0b0618484 | ||
|
|
4ecdbe4bd9 | ||
|
|
9e9f266e52 | ||
|
|
0ce67f37ac | ||
|
|
ddcd0a49ec | ||
|
|
63b8fac852 | ||
|
|
def8d7850b | ||
|
|
0442efc856 | ||
|
|
f928bbb53c | ||
|
|
1ab7500dbb | ||
|
|
c58d92d46b | ||
|
|
8276e912fd | ||
|
|
e0490d0df5 | ||
|
|
11db466a88 | ||
|
|
caaee0b666 | ||
|
|
f2f470f369 | ||
|
|
09bb36f58c | ||
|
|
21719df6fd | ||
|
|
39329809dd | ||
|
|
44797e2925 | ||
|
|
c8f373d119 | ||
|
|
8a22c63889 | ||
|
|
1a4434d314 | ||
|
|
165a13b13e | ||
|
|
43364b2d69 | ||
|
|
6eaecd20d5 | ||
|
|
c80bfeacf6 | ||
|
|
2a19cc1758 | ||
|
|
8f5189f606 | ||
|
|
49dde7c6f2 | ||
|
|
765a0d8896 | ||
|
|
19d8f2e258 | ||
|
|
e6aec96e05 | ||
|
|
a2d42c3242 | ||
|
|
52836aae87 | ||
|
|
bda566d6a7 | ||
|
|
63ed90b0fd | ||
|
|
0bb4d282e2 | ||
|
|
ae89a65dad | ||
|
|
e9fe9f5043 | ||
|
|
ce8dc5927c | ||
|
|
f6989cce38 | ||
|
|
6dbbf9aa80 | ||
|
|
fe6282e837 | ||
|
|
51210a869b | ||
|
|
658652a9ff | ||
|
|
aecd6e0878 | ||
|
|
1334a84861 | ||
|
|
6a410fc30e | ||
|
|
984a68c3a9 | ||
|
|
daf5aa8e8b | ||
|
|
98b2e0e426 | ||
|
|
9a1932eaf7 | ||
|
|
371d4be8ef | ||
|
|
d180031ef0 | ||
|
|
e09e953bbb | ||
|
|
2c640f7e52 | ||
|
|
2bacebb1fb | ||
|
|
df18b2a150 | ||
|
|
216ac4b1a4 | ||
|
|
898cded646 | ||
|
|
c09c87873e | ||
|
|
10b79fb41b | ||
|
|
ec0280be11 | ||
|
|
8e19d54e75 | ||
|
|
3c070e5e20 | ||
|
|
dde599f48f | ||
|
|
cc15ecfb3a | ||
|
|
7a7c54bd59 | ||
|
|
bea88ab122 | ||
|
|
926b3b9ee3 | ||
|
|
bc7775aef2 | ||
|
|
107669686c | ||
|
|
bb11b3ab66 | ||
|
|
516ba85abd | ||
|
|
098277b4f0 | ||
|
|
950a989744 | ||
|
|
fb8b893b10 | ||
|
|
9ca80debb8 | ||
|
|
080241b7d1 | ||
|
|
0d534720bb | ||
|
|
1dc4424a30 | ||
|
|
57f0cf30c0 | ||
|
|
8ef6bc1636 | ||
|
|
974b40c8af | ||
|
|
45e9e0be0b | ||
|
|
ec0918045d | ||
|
|
38bcecd2f3 | ||
|
|
aabbdba068 | ||
|
|
84c183da1f | ||
|
|
b363b98211 | ||
|
|
8defbeb248 | ||
|
|
f52d227d80 | ||
|
|
78cb45fb25 | ||
|
|
2d8026625b | ||
|
|
73afab464f | ||
|
|
8aa139b6be | ||
|
|
e5fe0eabdc | ||
|
|
0d3993fa25 | ||
|
|
ac421f68e2 | ||
|
|
b9d1f0db18 | ||
|
|
6aad4c7a39 | ||
|
|
4186ef204d | ||
|
|
ae7a094ee0 | ||
|
|
3a007f939a | ||
|
|
b8503b9255 | ||
|
|
b7bc76d3cc | ||
|
|
27d6c12972 | ||
|
|
b69d783e09 | ||
|
|
3b2ff6301c | ||
|
|
6c7043916e | ||
|
|
96a6e75b71 | ||
|
|
a91e4e7981 | ||
|
|
95d8f76ec3 | ||
|
|
66d4c2ddd9 | ||
|
|
8115ca739a | ||
|
|
ec4021bbf4 | ||
|
|
e431b07e04 | ||
|
|
d34a87404d | ||
|
|
f38770bf2a | ||
|
|
dc9998ccaf | ||
|
|
f1b3703389 | ||
|
|
b6a8d0ee7f | ||
|
|
2a4dff38d0 | ||
|
|
665c564dcf | ||
|
|
ed71413e04 | ||
|
|
4b5e49b00b | ||
|
|
f558ee788e | ||
|
|
ceb8ca680c | ||
|
|
79ebcbec4b | ||
|
|
2c7b650240 | ||
|
|
54459255d4 | ||
|
|
b4a078e2f6 | ||
|
|
ed13dd066b | ||
|
|
2b4a3b22bf | ||
|
|
8b891da628 | ||
|
|
5a2c8342eb | ||
|
|
50eb4bf53a | ||
|
|
3c10ddd46a | ||
|
|
0b7f9acc70 | ||
|
|
10fbaec247 | ||
|
|
007a734595 | ||
|
|
46716aada3 | ||
|
|
3bc66136b2 | ||
|
|
fae47e0dfc | ||
|
|
bd52e86486 | ||
|
|
b2f6ed7209 | ||
|
|
4b334fd2e2 | ||
|
|
a23a7006e3 | ||
|
|
f47171a17c | ||
|
|
4945dc3682 | ||
|
|
ada66b5313 | ||
|
|
96450e17a3 | ||
|
|
40a295e951 | ||
|
|
d6c6f95373 | ||
|
|
19b46be20d | ||
|
|
789e04ce90 | ||
|
|
dd4f0a600b | ||
|
|
6c7df4cb6b | ||
|
|
79e0a9f32a | ||
|
|
6c9bc63a1c | ||
|
|
28a821df7d | ||
|
|
27e39954d6 | ||
|
|
e730a5364b | ||
|
|
92b3ae41dd | ||
|
|
89a2566e01 | ||
|
|
1ac3e03171 | ||
|
|
b86d40091a | ||
|
|
91d22d150f | ||
|
|
1d29991268 | ||
|
|
6f0a2686dc | ||
|
|
f06caabb07 | ||
|
|
3c869802fb | ||
|
|
7b6bd90903 | ||
|
|
967bfa9c92 | ||
|
|
592affb984 | ||
|
|
96aaf6d53b | ||
|
|
1397dbdabc | ||
|
|
6118643232 | ||
|
|
71198a0b54 | ||
|
|
22cb80399f | ||
|
|
fa1fd8a576 | ||
|
|
6df7d31a5b | ||
|
|
ef049e92ef | ||
|
|
fe8b109ca5 | ||
|
|
8fd9b84a80 | ||
|
|
5cb53f52c3 | ||
|
|
d86653668e | ||
|
|
5084712a15 | ||
|
|
ece65cab18 | ||
|
|
1f6075506c | ||
|
|
51ade48e3d | ||
|
|
21c43737fe | ||
|
|
6c7bcf00e7 | ||
|
|
7a2142075c | ||
|
|
e8e9baa417 | ||
|
|
449d956966 | ||
|
|
90db01d038 | ||
|
|
38cea6dc71 | ||
|
|
64807dfb3b | ||
|
|
d943455e10 | ||
|
|
fd03ba7586 | ||
|
|
2c5a57e386 | ||
|
|
e8858150cb | ||
|
|
333f901187 | ||
|
|
7dd4d6c75e | ||
|
|
99f57cfda6 | ||
|
|
4d1eb94dfd | ||
|
|
22d584f302 | ||
|
|
72c41f104e | ||
|
|
8d3ac3ac1e | ||
|
|
299ae186f1 | ||
|
|
f4df2fb176 | ||
|
|
625fbef613 | ||
|
|
fbed0ac56b | ||
|
|
dc120f3962 | ||
|
|
4f053e5b83 | ||
|
|
c6241581a0 | ||
|
|
041ade66d5 | ||
|
|
067a2949ba | ||
|
|
55c754750e | ||
|
|
72b6c12856 | ||
|
|
15ea0af687 | ||
|
|
ee7e367981 | ||
|
|
8006589828 | ||
|
|
413264eaae | ||
|
|
7db8824da2 | ||
|
|
e1bc010bd1 | ||
|
|
bff02017da | ||
|
|
c0019bd8e5 | ||
|
|
e495ef2c48 | ||
|
|
78d62705cc | ||
|
|
2791bd0015 | ||
|
|
7cf66eb61f | ||
|
|
944c53bff1 | ||
|
|
c756c855ea | ||
|
|
58bb2826b2 | ||
|
|
b7bef87a4d | ||
|
|
0c1b206185 | ||
|
|
7d7e99a92c | ||
|
|
1ba8d7ef74 | ||
|
|
d99bd279e8 | ||
|
|
ee1fe3aa9f | ||
|
|
c4b1d79c5c | ||
|
|
a1a43cdfe0 | ||
|
|
27b62781cc | ||
|
|
0c5d7ff8f2 | ||
|
|
0e2b315ded | ||
|
|
3e74d1c544 | ||
|
|
da690acce5 | ||
|
|
0baa2b484d | ||
|
|
260d7298c3 | ||
|
|
d5cc2ad643 | ||
|
|
12706cd37f | ||
|
|
7167442d6e | ||
|
|
8547101c4b | ||
|
|
5d58a9e4c2 | ||
|
|
cd98a29a4b | ||
|
|
903714fd40 | ||
|
|
138c7acf22 | ||
|
|
03b2b8ae8f | ||
|
|
016b502d46 | ||
|
|
c5f6653564 | ||
|
|
cf9a4e209e | ||
|
|
040421942f | ||
|
|
4dfc596d38 | ||
|
|
fe83ef7635 | ||
|
|
db8b08131f | ||
|
|
32815e628d | ||
|
|
71bdc67a45 | ||
|
|
cb9f50ef63 | ||
|
|
12c754c92b | ||
|
|
e4b3d03da5 | ||
|
|
cc26b66e99 | ||
|
|
34d81fa522 | ||
|
|
49f1a5c2b3 | ||
|
|
326c45fa17 | ||
|
|
a2bb899a6b | ||
|
|
9fedb1674e | ||
|
|
7c91b01125 | ||
|
|
c202e9e106 | ||
|
|
645a8c9349 | ||
|
|
093fdcf3df | ||
|
|
7abda5e8c2 | ||
|
|
abf7c423bb | ||
|
|
55d5c07d00 | ||
|
|
0a9b272fe4 | ||
|
|
b9d6ba2aa0 | ||
|
|
a0c9f7823b | ||
|
|
4477a9c59a | ||
|
|
99a27fe241 | ||
|
|
fefa86e0cf | ||
|
|
098c4910de | ||
|
|
17b7148300 | ||
|
|
f4a2ef28e3 | ||
|
|
f0d013ee76 | ||
|
|
5ece6fec04 | ||
|
|
d88dbf3612 | ||
|
|
2a18efef82 | ||
|
|
fd846fbe77 | ||
|
|
ca7cc4744e | ||
|
|
491fa239bd | ||
|
|
66765dc123 | ||
|
|
.gitignore (vendored, 25 changed lines)
@@ -1,8 +1,31 @@
*.pyc
*~
tags
depend
ispc
ispc_test
ispc_ref
llvm/
objs
docs/doxygen
docs/ispc.html
docs/*.html
tests*/*cpp
tests*/*run
tests*/*.o
tests_ispcpp/*.h
tests_ispcpp/*pre*
logs/
notify_log.log
alloy_results_*
examples/*/*.png
examples/*/*.ppm
examples/*/objs/*
examples/*/ref
examples/*/test
*.swp
check_isa.exe
.vscode
configure
ispc.dSYM
LICENSE.txt (47 changed lines)
@@ -1,4 +1,4 @@
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2016, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
@@ -77,7 +77,7 @@ covered by the following license:
University of Illinois/NCSA
Open Source License

Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.
Copyright (c) 2003-2014 University of Illinois at Urbana-Champaign.
All rights reserved.

Developed by:
@@ -141,3 +141,46 @@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

---------------------------------------------------------------------------

The ptxtools use parts of the PTX parser code from GPU Ocelot project
(https://code.google.com/p/gpuocelot/), which is covered by the following
license:

Copyright 2011
GEORGIA TECH RESEARCH CORPORATION
ALL RIGHTS RESERVED

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
  * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimers.
  * Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimers in the
    documentation and/or other materials provided with the
    distribution.
  * Neither the name of GEORGIA TECH RESEARCH CORPORATION nor the
    names of its contributors may be used to endorse or promote
    products derived from this software without specific prior
    written permission.

THIS SOFTWARE IS PROVIDED BY GEORGIA TECH RESEARCH CORPORATION ''AS IS''
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GEORGIA TECH RESEARCH
CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

You agree that the Software will not be shipped, transferred, exported,
or re-exported directly into any country prohibited by the United States
Export Administration Act and the regulations thereunder nor will be
used for any purpose prohibited by the Act.
Makefile (338 changed lines)
@@ -1,42 +1,193 @@
#
# Copyright (c) 2010-2016, Intel Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived from
#     this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#
# ispc Makefile
#

define newline


endef

define WARNING_BODY
============================== !!! WARNING !!! =============================== \n
Location of LLVM files in your PATH is different than path in LLVM_HOME \n
variable (or LLVM_HOME is not set). Most likely this means that you are \n
using the default LLVM installation on your system, which is a very bad sign. \n
Note that ISPC uses the LLVM optimizer and is highly dependent on it. We recommend \n
using a *patched* version of LLVM 3.8. Patches are available in the \n
llvm_patches folder. You can build LLVM manually, or run our scripts, which \n
will do all the work for you. Do the following: \n
1. Create a folder, where LLVM will reside and set LLVM_HOME variable to its \n
   path. \n
2. Set ISPC_HOME variable to your ISPC location (probably current folder).
3. Run alloy.py tool to checkout and build LLVM: \n
   alloy.py -b --version=3.8 \n
4. Add $$LLVM_HOME/bin-3.8/bin path to your PATH. \n
==============================================================================
endef

# If you have your own special version of llvm and/or clang, change
# these variables to match.
LLVM_CONFIG=$(shell which llvm-config)
CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)

RIGHT_LLVM = $(WARNING_BODY)
ifdef LLVM_HOME
  ifeq ($(findstring $(LLVM_HOME), $(LLVM_CONFIG)), $(LLVM_HOME))
    RIGHT_LLVM = LLVM from $$LLVM_HOME is used.
  endif
endif

# Enable ARM by request
# To enable: make ARM_ENABLED=1
ARM_ENABLED=0

# Disable NVPTX by request
# To enable: make NVPTX_ENABLED=1
NVPTX_ENABLED=0

# Add llvm bin to the path so any scripts run will go to the right llvm-config
LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
export PATH:=$(LLVM_BIN):$(PATH)

ARCH_OS = $(shell uname)
ifeq ($(ARCH_OS), Darwin)
  ARCH_OS2 = "OSX"
else
  ARCH_OS2 = $(shell uname -o)
endif
ARCH_TYPE = $(shell arch)

DNDEBUG_FLAG=$(shell $(LLVM_CONFIG) --cxxflags | grep -o "\-DNDEBUG")
LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags) $(DNDEBUG_FLAG)
LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e 's/svn//' -e 's/\./_/' -e 's/\..*//')
LLVM_VERSION_DEF=-D$(LLVM_VERSION)

LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
# Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step.
# We check if it's available before adding it (to not break 3.2 and earlier).
ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1)
  LLVM_COMPONENTS+=option
endif
ifneq ($(ARM_ENABLED), 0)
  LLVM_COMPONENTS+=arm
endif
ifneq ($(NVPTX_ENABLED), 0)
  LLVM_COMPONENTS+=nvptx
endif
LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS))

CLANG=clang
CLANG_LIBS = -lclangFrontend -lclangDriver \
             -lclangSerialization -lclangParse -lclangSema \
             -lclangAnalysis -lclangAST -lclangLex -lclangBasic
             -lclangAnalysis -lclangAST -lclangBasic \
             -lclangEdit -lclangLex

ISPC_LIBS=$(CLANG_LIBS) \
          $(shell llvm-config --ldflags --libs) \
          -lpthread -ldl
ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
          -lpthread -ldl
ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
          -lpthread

LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)
ifeq ($(LLVM_VERSION),LLVM_3_4)
  ISPC_LIBS += -lcurses
endif

# There is no logical OR in GNU make.
# This 'ifeq' acts like if( !($(LLVM_VERSION) == LLVM_3_2 || $(LLVM_VERSION) == LLVM_3_3 || $(LLVM_VERSION) == LLVM_3_4))
ifeq (,$(filter $(LLVM_VERSION), LLVM_3_2 LLVM_3_3 LLVM_3_4))
  ISPC_LIBS += -lcurses -lz
  # This is here because llvm-config fails to report dependency on tinfo library in some case.
  # This is described in LLVM bug 16902.
  ifeq ($(ARCH_OS),Linux)
    ifneq ($(shell ldconfig -p |grep -c tinfo), 0)
      ISPC_LIBS += -ltinfo
    endif
  endif
endif

ifeq ($(ARCH_OS),Linux)
  ISPC_LIBS += -ldl
endif

ifeq ($(ARCH_OS2),Msys)
  ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
endif

# Define build time stamp and revision.
# For revision we use GIT or SVN info.
BUILD_DATE=$(shell date +%Y%m%d)
BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
GIT_REVISION:=$(shell git log --abbrev-commit --abbrev=16 2>/dev/null | head -1)
ifeq (${GIT_REVISION},)
  SVN_REVISION:=$(shell svn log -l 1 2>/dev/null | grep -o \^r[[:digit:]]\* )
  ifeq (${SVN_REVISION},)
    # Failed to get revision info
    BUILD_VERSION:="no_version_info"
  else
    # SVN revision info
    BUILD_VERSION:=$(SVN_REVISION)
  endif
else
  # GIT revision info
  BUILD_VERSION:=$(GIT_REVISION)
endif

CXX=g++
CPP=cpp
CXXFLAGS=-g3 $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
         -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
CXX=clang++
OPT=-O2
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
         $(LLVM_VERSION_DEF) \
         -Wall \
         -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" \
         -Wno-sign-compare -Wno-unused-function -Werror
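
The -DBUILD_DATE and -DBUILD_VERSION flags above pass the build stamp and revision into the compile as quoted string literals. As a minimal C++ sketch of how such macros can be consumed (the fallback values and the program itself are illustrative, not taken from the ispc sources):

    #include <cstdio>

    // Fallbacks so this sketch also compiles without the -D flags;
    // the real build always supplies both definitions.
    #ifndef BUILD_DATE
    #define BUILD_DATE "00000000"        // hypothetical placeholder
    #endif
    #ifndef BUILD_VERSION
    #define BUILD_VERSION "no_version_info"
    #endif

    int main() {
        // With -DBUILD_DATE="\"20160101\"" style flags, both macros
        // expand to string literals here.
        std::printf("built %s, revision %s\n", BUILD_DATE, BUILD_VERSION);
        return 0;
    }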

# if( !($(LLVM_VERSION) == LLVM_3_2 || $(LLVM_VERSION) == LLVM_3_3 || $(LLVM_VERSION) == LLVM_3_4))
ifeq (,$(filter $(LLVM_VERSION), LLVM_3_2 LLVM_3_3 LLVM_3_4))
  CXXFLAGS+=-std=c++11 -Wno-c99-extensions -Wno-deprecated-register -fno-rtti
endif
ifneq ($(ARM_ENABLED), 0)
  CXXFLAGS+=-DISPC_ARM_ENABLED
endif
ifneq ($(NVPTX_ENABLED), 0)
  CXXFLAGS+=-DISPC_NVPTX_ENABLED
endif

LDFLAGS=
ifeq ($(ARCH_OS),Linux)
  # try to link everything statically under Linux (including libstdc++) so
  # that the binaries we generate will be portable across distributions...
  ifeq ($(ARCH_TYPE),x86_64)
    LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
  else
    LDFLAGS=-L/usr/lib/gcc/i686-redhat-linux/4.6.0
  endif
  # LDFLAGS=-static
  # Linking everything statically isn't easy (too many things are required),
  # but linking libstdc++ and libgcc is necessary when building with relatively
  # new gcc, when going to distribute to old systems.
  # LDFLAGS=-static-libgcc -static-libstdc++
endif

LEX=flex
@@ -44,28 +195,44 @@ YACC=bison -d -v -t

###########################################################################

CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
        llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
        util.cpp
CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
        ispc.cpp llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp \
        type.cpp util.cpp
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
        opt.h stmt.h sym.h type.h util.h
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
        builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
TARGETS=avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
        sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
        generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 knl skx
ifneq ($(ARM_ENABLED), 0)
  TARGETS+=neon-32 neon-16 neon-8
endif
ifneq ($(NVPTX_ENABLED), 0)
  TARGETS+=nvptx
endif
# These files need to be compiled in two versions - 32 and 64 bits.
BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
# These are files to be compiled in single version.
BUILTINS_SRC_COMMON=builtins/dispatch.ll
BUILTINS_OBJS_32=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-32bit.o)))
BUILTINS_OBJS_64=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-64bit.o)))
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_COMMON:.ll=.o))) \
        $(BUILTINS_OBJS_32) $(BUILTINS_OBJS_64) \
        builtins-c-32.cpp builtins-c-64.cpp
BISON_SRC=parse.yy
FLEX_SRC=lex.ll

OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
        builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
        $(FLEX_SRC:.ll=.o))
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
        stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o stdlib_mask64_ispc.o \
        $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))

default: ispc ispc_test
default: ispc

.PHONY: dirs clean depend doxygen print_llvm_src
.PHONY: dirs clean depend doxygen print_llvm_src llvm_check
.PRECIOUS: objs/builtins-%.cpp

depend: $(CXX_SRC) $(HEADERS)
depend: llvm_check $(CXX_SRC) $(HEADERS)
	@echo Updating dependencies
	@gcc -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend
	@$(CXX) -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend

-include depend

@@ -73,11 +240,21 @@ dirs:
	@echo Creating objs/ directory
	@/bin/mkdir -p objs

print_llvm_src:
llvm_check:
	@llvm-config --version > /dev/null || \
	(echo; \
	echo "******************************************"; \
	echo "ERROR: llvm-config not found in your PATH"; \
	echo "******************************************"; \
	echo; exit 1)
	@echo -e '$(subst $(newline), ,$(RIGHT_LLVM))'

print_llvm_src: llvm_check
	@echo Using LLVM `llvm-config --version` from `llvm-config --libdir`
	@echo Using compiler to build: `$(CXX) --version | head -1`

clean:
	/bin/rm -rf objs ispc ispc_test
	/bin/rm -rf objs ispc

doxygen:
	/bin/rm -rf docs/doxygen
@@ -85,16 +262,42 @@ doxygen:

ispc: print_llvm_src dirs $(OBJS)
	@echo Creating ispc executable
	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
	@$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)

ispc_test: dirs ispc_test.cpp
	@echo Creating ispc_test executable
	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)
# Use clang as a default compiler, instead of gcc
# This is default now.
clang: ispc
clang: CXX=clang++

# Use gcc as a default compiler
gcc: ispc
gcc: CXX=g++

# Build ispc with address sanitizer instrumentation using clang compiler
# Note that this is not portable build
asan: clang
asan: OPT+=-fsanitize=address

# Do debug build, i.e. -O0 -g
debug: ispc
debug: OPT=-O0 -g

objs/%.o: %.cpp
	@echo Compiling $<
	@$(CXX) $(CXXFLAGS) -o $@ -c $<

objs/cbackend.o: cbackend.cpp
	@echo Compiling $<
	@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<

objs/opt.o: opt.cpp
	@echo Compiling $<
	@$(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $<

objs/%.o: objs/%.cpp
	@echo Compiling $<
	@$(CXX) $(CXXFLAGS) -o $@ -c $<

objs/parse.cc: parse.yy
	@echo Running bison on $<
	@$(YACC) -o $@ $<
@@ -111,34 +314,47 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
	@echo Compiling $<
	@$(CXX) $(CXXFLAGS) -o $@ -c $<

objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
	@echo Creating C++ source from builtin definitions file $<
	@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@

objs/builtins-%.o: objs/builtins-%.cpp
	@echo Compiling $<
	@$(CXX) $(CXXFLAGS) -o $@ -c $<

objs/builtins-c-32.cpp: builtins-c.c
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
	@echo Creating C++ source from builtins definition file $<
	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@
	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@

objs/builtins-c-32.o: objs/builtins-c-32.cpp
	@echo Compiling $<
	@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
	@echo Creating C++ source from builtins definition file $< \(32 bit version\)
	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@

objs/builtins-c-64.cpp: builtins-c.c
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
	@echo Creating C++ source from builtins definition file $< \(64 bit version\)
	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@

objs/builtins-c-32.cpp: builtins/builtins.c
	@echo Creating C++ source from builtins definition file $<
	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 32 > $@

objs/builtins-c-64.o: objs/builtins-c-64.cpp
	@echo Compiling $<
	@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-c-64.cpp: builtins/builtins.c
	@echo Creating C++ source from builtins definition file $<
	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@

objs/stdlib_ispc.cpp: stdlib.ispc
	@echo Creating C++ source from $<
	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@
objs/stdlib_mask1_ispc.cpp: stdlib.ispc
	@echo Creating C++ source from $< for mask1
	@$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
	python stdlib2cpp.py mask1 > $@

objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
	@echo Compiling $<
	@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/stdlib_mask8_ispc.cpp: stdlib.ispc
	@echo Creating C++ source from $< for mask8
	@$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
	python stdlib2cpp.py mask8 > $@

objs/stdlib_mask16_ispc.cpp: stdlib.ispc
	@echo Creating C++ source from $< for mask16
	@$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
	python stdlib2cpp.py mask16 > $@

objs/stdlib_mask32_ispc.cpp: stdlib.ispc
	@echo Creating C++ source from $< for mask32
	@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
	python stdlib2cpp.py mask32 > $@

objs/stdlib_mask64_ispc.cpp: stdlib.ispc
	@echo Creating C++ source from $< for mask64
	@$(CLANG) -E -x c -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
	python stdlib2cpp.py mask64 > $@
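
The five stdlib_mask* rules above run the same stdlib.ispc through the preprocessor once per mask width, selected with -DISPC_MASK_BITS. A hedged C++ sketch of that one-source, many-variants pattern (the MaskElt name and the driver are invented for illustration, not from the ispc sources):

    #include <cstdint>
    #include <cstdio>

    // Build one object file per variant, e.g.:
    //   c++ -DISPC_MASK_BITS=8  -c mask_variant.cpp -o mask8.o
    //   c++ -DISPC_MASK_BITS=64 -c mask_variant.cpp -o mask64.o
    #ifndef ISPC_MASK_BITS
    #define ISPC_MASK_BITS 32            // default width for this sketch
    #endif

    // The macro picks the element type, so the rest of the file is
    // written once and compiled N times.
    #if ISPC_MASK_BITS == 1 || ISPC_MASK_BITS == 8
    typedef uint8_t MaskElt;
    #elif ISPC_MASK_BITS == 16
    typedef uint16_t MaskElt;
    #elif ISPC_MASK_BITS == 32
    typedef uint32_t MaskElt;
    #else
    typedef uint64_t MaskElt;
    #endif

    int main() {
        std::printf("mask element: %zu bits\n", sizeof(MaskElt) * 8);
        return 0;
    }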

README.rst (new file, 90 lines)
@@ -0,0 +1,90 @@
==============================
Intel(r) SPMD Program Compiler
==============================

``ispc`` is a compiler for a variant of the C programming language, with
extensions for `single program, multiple data
<http://en.wikipedia.org/wiki/SPMD>`_ programming. Under the SPMD model,
the programmer writes a program that generally appears to be a regular
serial program, though the execution model is actually that a number of
*program instances* execute in parallel on the hardware.

Overview
--------

``ispc`` compiles a C-based SPMD programming language to run on the SIMD
units of CPUs; it frequently provides a 3x or more speedup on CPUs with
4-wide vector SSE units and 5x-6x on CPUs with 8-wide AVX vector units,
without any of the difficulty of writing intrinsics code. Parallelization
across multiple cores is also supported by ``ispc``, making it
possible to write programs that achieve performance improvement that scales
by both number of cores and vector unit size.

There are a few key principles in the design of ``ispc``:

* To build a small set of extensions to the C language that
  would deliver excellent performance to performance-oriented
  programmers who want to run SPMD programs on the CPU.

* To provide a thin abstraction layer between the programmer
  and the hardware--in particular, to have an execution and
  data model where the programmer can cleanly reason about the
  mapping of their source program to compiled assembly language
  and the underlying hardware.

* To make it possible to harness the computational power of SIMD
  vector units without the extremely low-programmer-productivity
  activity of directly writing intrinsics.

* To explore opportunities from close coupling between C/C++
  application code and SPMD ``ispc`` code running on the
  same processor--to have lightweight function calls between
  the two languages and to share data directly via pointers without
  copying or reformatting.

``ispc`` is an open source compiler with the BSD license. It uses the
remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
code generation and optimization and is `hosted on
github <http://github.com/ispc/ispc/>`_. It supports Windows, Mac, and
Linux, with both x86 and x86-64 targets. It currently supports the SSE2,
SSE4, AVX1, and AVX2 instruction sets.

Features
--------

``ispc`` provides a number of key features to developers:

* Familiarity as an extension of the C programming
  language: ``ispc`` supports familiar C syntax and
  programming idioms, while adding the ability to write SPMD
  programs.

* High-quality SIMD code generation: the performance
  of code generated by ``ispc`` is often close to that of
  hand-written intrinsics code.

* Ease of adoption with existing software
  systems: functions written in ``ispc`` directly
  interoperate with application functions written in C/C++ and
  with application data structures.

* Portability across over a decade of CPU
  generations: ``ispc`` has targets for SSE2, SSE4, AVX
  (and soon, AVX2).

* Portability across operating systems: Microsoft
  Windows, Mac OS X, and Linux are all supported
  by ``ispc``.

* Debugging with standard tools: ``ispc``
  programs can be debugged with standard debuggers (OS X and
  Linux only).

Additional Resources
--------------------

Prebuilt ``ispc`` binaries for Windows, OS X and Linux can be downloaded
from the `ispc downloads page <http://ispc.github.com/downloads.html>`_.
See also additional
`documentation <http://ispc.github.com/documentation.html>`_ and additional
`performance information <http://ispc.github.com/perf.html>`_.
README.txt (deleted, 22 lines)
@@ -1,22 +0,0 @@
==============================
Intel(r) SPMD Program Compiler
==============================

Welcome to the Intel(r) SPMD Program Compiler (ispc)!

ispc is a new compiler for "single program, multiple data" (SPMD)
programs. Under the SPMD model, the programmer writes a program that mostly
appears to be a regular serial program, though the execution model is
actually that a number of program instances execute in parallel on the
hardware. ispc compiles a C-based SPMD programming language to run on the
SIMD units of CPUs; it frequently provides a 3x or more speedup on CPUs
with 4-wide SSE units, without any of the difficulty of writing intrinsics
code.

ispc is an open source compiler under the BSD license; see the file
LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and
x86-64 targets. It currently supports the SSE2, SSE4, and AVX instruction
sets.

For more information and examples, as well as a wiki and the bug database,
see the ispc distribution site, http://ispc.github.com.

ast.cpp (468 changed lines)
@@ -1,5 +1,5 @@
/*
  Copyright (c) 2011, Intel Corporation
  Copyright (c) 2011-2015, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,18 +28,21 @@
  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** @file ast.cpp
    @brief
*/

    @brief General functionality related to abstract syntax trees and
           traversal of them.
*/

#include "ast.h"
#include "decl.h"
#include "expr.h"
#include "func.h"
#include "type.h"
#include "stmt.h"
#include "sym.h"
#include "util.h"

///////////////////////////////////////////////////////////////////////////
// ASTNode
@@ -52,8 +55,17 @@ ASTNode::~ASTNode() {
// AST

void
AST::AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code) {
    functions.push_back(new Function(ds, decl, code));
AST::AddFunction(Symbol *sym, Stmt *code) {
    if (sym == NULL)
        return;

    Function *f = new Function(sym, code);

    if (f->IsPolyFunction()) {
        FATAL("This is a good start, but implement me!");
    } else {
        functions.push_back(f);
    }
}


@@ -63,3 +75,443 @@ AST::GenerateIR() {
        functions[i]->GenerateIR();
}

///////////////////////////////////////////////////////////////////////////

ASTNode *
WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
        void *data) {
    if (node == NULL)
        return node;

    // Call the callback function
    if (preFunc != NULL) {
        if (preFunc(node, data) == false)
            // The function asked us to not continue recursively, so stop.
            return node;
    }

    ////////////////////////////////////////////////////////////////////////////
    // Handle Statements
    if (llvm::dyn_cast<Stmt>(node) != NULL) {
        ExprStmt *es;
        DeclStmt *ds;
        IfStmt *is;
        DoStmt *dos;
        ForStmt *fs;
        ForeachStmt *fes;
        ForeachActiveStmt *fas;
        ForeachUniqueStmt *fus;
        CaseStmt *cs;
        DefaultStmt *defs;
        SwitchStmt *ss;
        ReturnStmt *rs;
        LabeledStmt *ls;
        StmtList *sl;
        PrintStmt *ps;
        AssertStmt *as;
        DeleteStmt *dels;
        UnmaskedStmt *ums;

        if ((es = llvm::dyn_cast<ExprStmt>(node)) != NULL)
            es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
        else if ((ds = llvm::dyn_cast<DeclStmt>(node)) != NULL) {
            for (unsigned int i = 0; i < ds->vars.size(); ++i)
                ds->vars[i].init = (Expr *)WalkAST(ds->vars[i].init, preFunc,
                                                   postFunc, data);
        }
        else if ((is = llvm::dyn_cast<IfStmt>(node)) != NULL) {
            is->test = (Expr *)WalkAST(is->test, preFunc, postFunc, data);
            is->trueStmts = (Stmt *)WalkAST(is->trueStmts, preFunc,
                                            postFunc, data);
            is->falseStmts = (Stmt *)WalkAST(is->falseStmts, preFunc,
                                             postFunc, data);
        }
        else if ((dos = llvm::dyn_cast<DoStmt>(node)) != NULL) {
            dos->testExpr = (Expr *)WalkAST(dos->testExpr, preFunc,
                                            postFunc, data);
            dos->bodyStmts = (Stmt *)WalkAST(dos->bodyStmts, preFunc,
                                             postFunc, data);
        }
        else if ((fs = llvm::dyn_cast<ForStmt>(node)) != NULL) {
            fs->init = (Stmt *)WalkAST(fs->init, preFunc, postFunc, data);
            fs->test = (Expr *)WalkAST(fs->test, preFunc, postFunc, data);
            fs->step = (Stmt *)WalkAST(fs->step, preFunc, postFunc, data);
            fs->stmts = (Stmt *)WalkAST(fs->stmts, preFunc, postFunc, data);
        }
        else if ((fes = llvm::dyn_cast<ForeachStmt>(node)) != NULL) {
            for (unsigned int i = 0; i < fes->startExprs.size(); ++i)
                fes->startExprs[i] = (Expr *)WalkAST(fes->startExprs[i], preFunc,
                                                     postFunc, data);
            for (unsigned int i = 0; i < fes->endExprs.size(); ++i)
                fes->endExprs[i] = (Expr *)WalkAST(fes->endExprs[i], preFunc,
                                                   postFunc, data);
            fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
        }
        else if ((fas = llvm::dyn_cast<ForeachActiveStmt>(node)) != NULL) {
            fas->stmts = (Stmt *)WalkAST(fas->stmts, preFunc, postFunc, data);
        }
        else if ((fus = llvm::dyn_cast<ForeachUniqueStmt>(node)) != NULL) {
            fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data);
            fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data);
        }
        else if ((cs = llvm::dyn_cast<CaseStmt>(node)) != NULL)
            cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
        else if ((defs = llvm::dyn_cast<DefaultStmt>(node)) != NULL)
            defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
        else if ((ss = llvm::dyn_cast<SwitchStmt>(node)) != NULL) {
            ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
            ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
        }
        else if (llvm::dyn_cast<BreakStmt>(node) != NULL ||
                 llvm::dyn_cast<ContinueStmt>(node) != NULL ||
                 llvm::dyn_cast<GotoStmt>(node) != NULL) {
            // nothing
        }
        else if ((ls = llvm::dyn_cast<LabeledStmt>(node)) != NULL)
            ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
        else if ((rs = llvm::dyn_cast<ReturnStmt>(node)) != NULL)
            rs->expr = (Expr *)WalkAST(rs->expr, preFunc, postFunc, data);
        else if ((sl = llvm::dyn_cast<StmtList>(node)) != NULL) {
            std::vector<Stmt *> &sls = sl->stmts;
            for (unsigned int i = 0; i < sls.size(); ++i)
                sls[i] = (Stmt *)WalkAST(sls[i], preFunc, postFunc, data);
        }
        else if ((ps = llvm::dyn_cast<PrintStmt>(node)) != NULL)
            ps->values = (Expr *)WalkAST(ps->values, preFunc, postFunc, data);
        else if ((as = llvm::dyn_cast<AssertStmt>(node)) != NULL)
            as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
        else if ((dels = llvm::dyn_cast<DeleteStmt>(node)) != NULL)
            dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
        else if ((ums = llvm::dyn_cast<UnmaskedStmt>(node)) != NULL)
            ums->stmts = (Stmt *)WalkAST(ums->stmts, preFunc, postFunc, data);
        else
            FATAL("Unhandled statement type in WalkAST()");
    }
    else {
        ///////////////////////////////////////////////////////////////////////////
        // Handle expressions
        Assert(llvm::dyn_cast<Expr>(node) != NULL);
        UnaryExpr *ue;
        BinaryExpr *be;
        AssignExpr *ae;
        SelectExpr *se;
        ExprList *el;
        FunctionCallExpr *fce;
        IndexExpr *ie;
        MemberExpr *me;
        TypeCastExpr *tce;
        ReferenceExpr *re;
        PtrDerefExpr *ptrderef;
        RefDerefExpr *refderef;
        SizeOfExpr *soe;
        AddressOfExpr *aoe;
        NewExpr *newe;

        if ((ue = llvm::dyn_cast<UnaryExpr>(node)) != NULL)
            ue->expr = (Expr *)WalkAST(ue->expr, preFunc, postFunc, data);
        else if ((be = llvm::dyn_cast<BinaryExpr>(node)) != NULL) {
            be->arg0 = (Expr *)WalkAST(be->arg0, preFunc, postFunc, data);
            be->arg1 = (Expr *)WalkAST(be->arg1, preFunc, postFunc, data);
        }
        else if ((ae = llvm::dyn_cast<AssignExpr>(node)) != NULL) {
            ae->lvalue = (Expr *)WalkAST(ae->lvalue, preFunc, postFunc, data);
            ae->rvalue = (Expr *)WalkAST(ae->rvalue, preFunc, postFunc, data);
        }
        else if ((se = llvm::dyn_cast<SelectExpr>(node)) != NULL) {
            se->test = (Expr *)WalkAST(se->test, preFunc, postFunc, data);
            se->expr1 = (Expr *)WalkAST(se->expr1, preFunc, postFunc, data);
            se->expr2 = (Expr *)WalkAST(se->expr2, preFunc, postFunc, data);
        }
        else if ((el = llvm::dyn_cast<ExprList>(node)) != NULL) {
            for (unsigned int i = 0; i < el->exprs.size(); ++i)
                el->exprs[i] = (Expr *)WalkAST(el->exprs[i], preFunc,
                                               postFunc, data);
        }
        else if ((fce = llvm::dyn_cast<FunctionCallExpr>(node)) != NULL) {
            fce->func = (Expr *)WalkAST(fce->func, preFunc, postFunc, data);
            fce->args = (ExprList *)WalkAST(fce->args, preFunc, postFunc, data);
            for (int k = 0; k < 3; k++)
                fce->launchCountExpr[k] = (Expr *)WalkAST(fce->launchCountExpr[k], preFunc,
                                                          postFunc, data);
        }
        else if ((ie = llvm::dyn_cast<IndexExpr>(node)) != NULL) {
            ie->baseExpr = (Expr *)WalkAST(ie->baseExpr, preFunc, postFunc, data);
            ie->index = (Expr *)WalkAST(ie->index, preFunc, postFunc, data);
        }
        else if ((me = llvm::dyn_cast<MemberExpr>(node)) != NULL)
            me->expr = (Expr *)WalkAST(me->expr, preFunc, postFunc, data);
        else if ((tce = llvm::dyn_cast<TypeCastExpr>(node)) != NULL)
            tce->expr = (Expr *)WalkAST(tce->expr, preFunc, postFunc, data);
        else if ((re = llvm::dyn_cast<ReferenceExpr>(node)) != NULL)
            re->expr = (Expr *)WalkAST(re->expr, preFunc, postFunc, data);
        else if ((ptrderef = llvm::dyn_cast<PtrDerefExpr>(node)) != NULL)
            ptrderef->expr = (Expr *)WalkAST(ptrderef->expr, preFunc, postFunc,
                                             data);
        else if ((refderef = llvm::dyn_cast<RefDerefExpr>(node)) != NULL)
            refderef->expr = (Expr *)WalkAST(refderef->expr, preFunc, postFunc,
                                             data);
        else if ((soe = llvm::dyn_cast<SizeOfExpr>(node)) != NULL)
            soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
        else if ((aoe = llvm::dyn_cast<AddressOfExpr>(node)) != NULL)
            aoe->expr = (Expr *)WalkAST(aoe->expr, preFunc, postFunc, data);
        else if ((newe = llvm::dyn_cast<NewExpr>(node)) != NULL) {
            newe->countExpr = (Expr *)WalkAST(newe->countExpr, preFunc,
                                              postFunc, data);
            newe->initExpr = (Expr *)WalkAST(newe->initExpr, preFunc,
                                             postFunc, data);
        }
        else if (llvm::dyn_cast<SymbolExpr>(node) != NULL ||
                 llvm::dyn_cast<ConstExpr>(node) != NULL ||
                 llvm::dyn_cast<FunctionSymbolExpr>(node) != NULL ||
                 llvm::dyn_cast<SyncExpr>(node) != NULL ||
                 llvm::dyn_cast<NullPointerExpr>(node) != NULL) {
            // nothing to do
        }
        else
            FATAL("Unhandled expression type in WalkAST().");
    }

    // Call the callback function
    if (postFunc != NULL)
        return postFunc(node, data);
    else
        return node;
}


static ASTNode *
lOptimizeNode(ASTNode *node, void *) {
    return node->Optimize();
}


ASTNode *
Optimize(ASTNode *root) {
    return WalkAST(root, NULL, lOptimizeNode, NULL);
}


Expr *
Optimize(Expr *expr) {
    return (Expr *)Optimize((ASTNode *)expr);
}


Stmt *
Optimize(Stmt *stmt) {
    return (Stmt *)Optimize((ASTNode *)stmt);
}


static ASTNode *
lTypeCheckNode(ASTNode *node, void *) {
    return node->TypeCheck();
}


ASTNode *
TypeCheck(ASTNode *root) {
    return WalkAST(root, NULL, lTypeCheckNode, NULL);
}


Expr *
TypeCheck(Expr *expr) {
    return (Expr *)TypeCheck((ASTNode *)expr);
}


Stmt *
TypeCheck(Stmt *stmt) {
    return (Stmt *)TypeCheck((ASTNode *)stmt);
}


struct CostData {
    CostData() { cost = foreachDepth = 0; }

    int cost;
    int foreachDepth;
};


static bool
lCostCallbackPre(ASTNode *node, void *d) {
    CostData *data = (CostData *)d;
    if (llvm::dyn_cast<ForeachStmt>(node) != NULL)
        ++data->foreachDepth;
    if (data->foreachDepth == 0)
        data->cost += node->EstimateCost();
    return true;
}


static ASTNode *
lCostCallbackPost(ASTNode *node, void *d) {
    CostData *data = (CostData *)d;
    if (llvm::dyn_cast<ForeachStmt>(node) != NULL)
        --data->foreachDepth;
    return node;
}


int
EstimateCost(ASTNode *root) {
    CostData data;
    WalkAST(root, lCostCallbackPre, lCostCallbackPost, &data);
    return data.cost;
}


/** Given an AST node, check to see if it's safe if we happen to run the
    code for that node with the execution mask all off.
*/
static bool
lCheckAllOffSafety(ASTNode *node, void *data) {
    bool *okPtr = (bool *)data;

    FunctionCallExpr *fce;
    if ((fce = llvm::dyn_cast<FunctionCallExpr>(node)) != NULL) {
        if (fce->func == NULL)
            return false;

        const Type *type = fce->func->GetType();
        const PointerType *pt = CastType<PointerType>(type);
        if (pt != NULL)
            type = pt->GetBaseType();
        const FunctionType *ftype = CastType<FunctionType>(type);
        Assert(ftype != NULL);

        if (ftype->isSafe == false) {
            *okPtr = false;
            return false;
        }
    }

    if (llvm::dyn_cast<AssertStmt>(node) != NULL) {
        // While it's fine to run the assert for varying tests, it's not
        // desirable to check an assert on a uniform variable if all of the
        // lanes are off.
        *okPtr = false;
        return false;
    }

    if (llvm::dyn_cast<PrintStmt>(node) != NULL) {
        *okPtr = false;
        return false;
    }

    if (llvm::dyn_cast<NewExpr>(node) != NULL ||
        llvm::dyn_cast<DeleteStmt>(node) != NULL) {
        // We definitely don't want to run the uniform variants of these if
        // the mask is all off.  It's also worth skipping the overhead of
        // executing the varying versions of them in the all-off mask case.
        *okPtr = false;
        return false;
    }

    if (llvm::dyn_cast<ForeachStmt>(node) != NULL ||
        llvm::dyn_cast<ForeachActiveStmt>(node) != NULL ||
        llvm::dyn_cast<ForeachUniqueStmt>(node) != NULL ||
        llvm::dyn_cast<UnmaskedStmt>(node) != NULL) {
        // The various foreach statements also shouldn't be run with an
        // all-off mask.  Since they can re-establish an 'all on' mask,
        // this would be pretty unintuitive.  (More generally, it's
        // possibly a little strange to allow foreach in the presence of
        // any non-uniform control flow...)
        //
        // Similarly, the implementation of foreach_unique assumes as a
        // precondition that the mask won't be all off going into it, so
        // we'll enforce that here...
        *okPtr = false;
        return false;
    }

    if (llvm::dyn_cast<BinaryExpr>(node) != NULL) {
        BinaryExpr* binaryExpr = llvm::dyn_cast<BinaryExpr>(node);
        if (binaryExpr->op == BinaryExpr::Mod || binaryExpr->op == BinaryExpr::Div) {
            *okPtr = false;
            return false;
        }
    }
    IndexExpr *ie;
    if ((ie = llvm::dyn_cast<IndexExpr>(node)) != NULL && ie->baseExpr != NULL) {
        const Type *type = ie->baseExpr->GetType();
        if (type == NULL)
            return true;
        if (CastType<ReferenceType>(type) != NULL)
            type = type->GetReferenceTarget();

        ConstExpr *ce = llvm::dyn_cast<ConstExpr>(ie->index);
        if (ce == NULL) {
            // indexing with a variable... -> not safe
            *okPtr = false;
            return false;
        }

        const PointerType *pointerType = CastType<PointerType>(type);
        if (pointerType != NULL) {
            // pointer[index] -> can't be sure -> not safe
            *okPtr = false;
            return false;
        }

        const SequentialType *seqType = CastType<SequentialType>(type);
        Assert(seqType != NULL);
        int nElements = seqType->GetElementCount();
        if (nElements == 0) {
            // Unsized array, so we can't be sure -> not safe
            *okPtr = false;
            return false;
        }

        int32_t indices[ISPC_MAX_NVEC];
        int count = ce->GetValues(indices);
        for (int i = 0; i < count; ++i) {
            if (indices[i] < 0 || indices[i] >= nElements) {
                // Index is out of bounds -> not safe
                *okPtr = false;
                return false;
            }
        }

        // All indices are in-bounds
        return true;
    }

    MemberExpr *me;
    if ((me = llvm::dyn_cast<MemberExpr>(node)) != NULL &&
        me->dereferenceExpr) {
        *okPtr = false;
        return false;
    }

    if (llvm::dyn_cast<PtrDerefExpr>(node) != NULL) {
        *okPtr = false;
        return false;
    }

    /*
      Don't allow turning if/else to straight-line-code if we
      assign to a uniform.
    */
    AssignExpr *ae;
    if ((ae = llvm::dyn_cast<AssignExpr>(node)) != NULL) {
        if (ae->GetType()) {
            if (ae->GetType()->IsUniformType()) {
                *okPtr = false;
                return false;
            }
        }
    }

    return true;
}


bool
SafeToRunWithMaskAllOff(ASTNode *root) {
    bool safe = true;
    WalkAST(root, lCheckAllOffSafety, NULL, &safe);
    return safe;
}

ast.h (144 changed lines)
@@ -1,5 +1,5 @@
/*
  Copyright (c) 2011, Intel Corporation
  Copyright (c) 2011-2013, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,11 @@
  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** @file ast.h
    @brief
    @brief
*/

#ifndef ISPC_AST_H
@@ -48,15 +48,17 @@
    (Expr) and statements (Stmt) inherit from this class.
*/
class ASTNode {
    const unsigned char SubclassID;   // Subclass identifier (for isa/dyn_cast)
public:
    ASTNode(SourcePos p) : pos(p) { }
    ASTNode(SourcePos p, unsigned scid) : SubclassID(scid), pos(p) { }
    virtual ~ASTNode();

    /** The Optimize() method should perform any appropriate early-stage
        optimizations on the node (e.g. constant folding).  The caller
        should use the returned ASTNode * in place of the original node.
        This method may return NULL if an error is encountered during
        optimization. */
        optimizations on the node (e.g. constant folding).  This method
        will be called after the node's children have already been
        optimized, and the caller will store the returned ASTNode * in
        place of the original node.  This method should return NULL if an
        error is encountered during optimization. */
    virtual ASTNode *Optimize() = 0;

    /** Type checking should be performed by the node when this method is
@@ -65,22 +67,85 @@ public:
        pointer in place of the original ASTNode *. */
    virtual ASTNode *TypeCheck() = 0;

    /** Estimate the execution cost of the node (not including the cost of
        the children.  The value returned should be based on the COST_*
        enumerant values defined in ispc.h. */
    virtual int EstimateCost() const = 0;

    /** All AST nodes must track the file position where they are
        defined. */
    const SourcePos pos;
    SourcePos pos;

    /** An enumeration for keeping track of the concrete subclass of Value
        that is actually instantiated.*/
    enum ASTNodeTy {
        /* For classes inherited from Expr */
        AddressOfExprID,
        AssignExprID,
        BinaryExprID,
        ConstExprID,
        DerefExprID,
        PtrDerefExprID,
        RefDerefExprID,
        ExprListID,
        FunctionCallExprID,
        FunctionSymbolExprID,
        IndexExprID,
        StructMemberExprID,
        VectorMemberExprID,
        NewExprID,
        NullPointerExprID,
        ReferenceExprID,
        SelectExprID,
        SizeOfExprID,
        SymbolExprID,
        SyncExprID,
        TypeCastExprID,
        UnaryExprID,
        /* This is a convenience separator to shorten classof implementations */
        MaxExprID,
        /* For classes inherited from Stmt */
        AssertStmtID,
        BreakStmtID,
        CaseStmtID,
        ContinueStmtID,
        DeclStmtID,
        DefaultStmtID,
        DeleteStmtID,
        DoStmtID,
        ExprStmtID,
        ForeachActiveStmtID,
        ForeachStmtID,
        ForeachUniqueStmtID,
        ForStmtID,
        GotoStmtID,
        IfStmtID,
        LabeledStmtID,
        PrintStmtID,
        ReturnStmtID,
        StmtListID,
        SwitchStmtID,
        UnmaskedStmtID
    };

    /** Return an ID for the concrete type of this object.  This is used to
        implement the classof checks.  This should not be used for any
        other purpose, as the values may change as ISPC evolves */
    unsigned getValueID() const {
        return SubclassID;
    }

    static inline bool classof(ASTNode const*) { return true; }
};
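
The SubclassID / getValueID() / classof() machinery above is what makes llvm::isa<> and llvm::dyn_cast<> work on the ASTNode hierarchy without C++ RTTI (note the -fno-rtti flag in the Makefile). A minimal self-contained C++ sketch of the same pattern; the Node and Leaf classes and the dyn_cast_sketch helper are invented stand-ins, not ispc or LLVM code:

    #include <cstdio>

    struct Node {
        enum NodeTy { LeafID, PairID };
        explicit Node(unsigned scid) : SubclassID(scid) {}
        unsigned getValueID() const { return SubclassID; }
    private:
        const unsigned SubclassID;   // set once by the concrete subclass
    };

    struct Leaf : Node {
        Leaf() : Node(LeafID) {}
        // classof() answers "is this Node really a Leaf?" from the stored ID.
        static bool classof(const Node *n) { return n->getValueID() == LeafID; }
    };

    // Hand-rolled stand-in for llvm::dyn_cast<>, driven purely by classof().
    template <typename To, typename From>
    To *dyn_cast_sketch(From *f) {
        return (f != nullptr && To::classof(f)) ? static_cast<To *>(f) : nullptr;
    }

    int main() {
        Leaf leaf;
        Node *n = &leaf;
        if (dyn_cast_sketch<Leaf>(n) != nullptr)
            std::printf("n is a Leaf\n");
        return 0;
    }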

/** Simple representation of the abstract syntax trees for all of the
    functions declared in a compilation unit.
*/
class AST {
public:
    /** Add the AST for a function described by the given declaration
        information and source code. */
    void AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code);
    void AddFunction(Symbol *sym, Stmt *code);

    /** Generate LLVM IR for all of the functions into the current
        module. */
@@ -90,4 +155,57 @@ private:
    std::vector<Function *> functions;
};


/** Callback function type for preorder traversal visiting function for
    the AST walk.
*/
typedef bool (* ASTPreCallBackFunc)(ASTNode *node, void *data);

/** Callback function type for postorder traversal visiting function for
    the AST walk.
*/
typedef ASTNode * (* ASTPostCallBackFunc)(ASTNode *node, void *data);

/** Walk (some portion of) an AST, starting from the given root node.  At
    each node, if preFunc is non-NULL, call it, passing the given void
    *data pointer; if the call to preFunc returns false, then the
    children of the node aren't visited.  This function then makes
    recursive calls to WalkAST() to process the node's children; after
    doing so, it calls postFunc at the node.  The return value from the
    postFunc call is ignored. */
extern ASTNode *WalkAST(ASTNode *root, ASTPreCallBackFunc preFunc,
                        ASTPostCallBackFunc postFunc, void *data);
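
A short usage sketch of this traversal interface, mirroring the CostData pattern in ast.cpp above; the node-counting callback and CountNodes helper are invented for illustration and assume the declarations above are in scope:

    #include "ast.h"

    // Preorder callback: bump the counter and keep descending.
    static bool lCountPre(ASTNode *node, void *data) {
        int *count = (int *)data;
        ++(*count);
        return true;   // returning false would skip this node's children
    }

    // Count every node reachable from root; no postorder work is
    // needed, so postFunc is NULL.
    static int CountNodes(ASTNode *root) {
        int count = 0;
        WalkAST(root, lCountPre, NULL, &count);
        return count;
    }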
|
||||
/** Perform simple optimizations on the AST or portion thereof passed to
|
||||
this function, returning the resulting AST. */
|
||||
extern ASTNode *Optimize(ASTNode *root);
|
||||
|
||||
/** Convenience version of Optimize() for Expr *s that returns an Expr *
|
||||
(rather than an ASTNode *, which would require the caller to cast back
|
||||
to an Expr *). */
|
||||
extern Expr *Optimize(Expr *);
|
||||
|
||||
/** Convenience version of Optimize() for Expr *s that returns an Stmt *
|
||||
(rather than an ASTNode *, which would require the caller to cast back
|
||||
to a Stmt *). */
|
||||
extern Stmt *Optimize(Stmt *);
|
||||
|
||||
/** Perform type-checking on the given AST (or portion of one), returning a
|
||||
pointer to the root of the resulting AST. */
|
||||
extern ASTNode *TypeCheck(ASTNode *root);
|
||||
|
||||
/** Convenience version of TypeCheck() for Expr *s that returns an Expr *. */
|
||||
extern Expr *TypeCheck(Expr *);
|
||||
|
||||
/** Convenience version of TypeCheck() for Stmt *s that returns an Stmt *. */
|
||||
extern Stmt *TypeCheck(Stmt *);
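
A hypothetical call site, to show why the typed overloads matter: the two passes can be chained over an Expr without ever casting through ASTNode *:

static Expr *CheckAndOptimizeExpr(Expr *expr) {
    expr = TypeCheck(expr);   // still an Expr *, no cast needed
    expr = Optimize(expr);
    return expr;
}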

/** Returns an estimate of the execution cost of the tree starting at
    the given root. */
extern int EstimateCost(ASTNode *root);

/** Returns true if it would be safe to run the given code with an "all
    off" mask. */
extern bool SafeToRunWithMaskAllOff(ASTNode *root);

#endif // ISPC_AST_H

@@ -1,7 +1,6 @@
#!/usr/bin/python

import sys
import string
import re
import subprocess
import platform
@@ -10,30 +9,42 @@ import os
length=0

src=str(sys.argv[1])
if (len(sys.argv) > 2):
    runtime=str(sys.argv[2])

target = re.sub(".*builtins-", "", src)
target = re.sub("builtins/target-", "", src)
target = re.sub(r"builtins\\target-", "", target)
target = re.sub("builtins/", "", target)
target = re.sub(r"builtins\\", "", target)
target = re.sub("\.ll$", "", target)
target = re.sub("\.c$", "", target)
target = re.sub("-", "_", target)

llvm_as="llvm-as"
if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT") != -1:
if platform.system() == 'Windows' or platform.system().find("CYGWIN_NT") != -1:
    llvm_as = os.getenv("LLVM_INSTALL_DIR").replace("\\", "/") + "/bin/" + llvm_as

try:
    as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
except IOError:
    print >> sys.stderr, "Couldn't open " + src
    sys.stderr.write("Couldn't open " + src)
    sys.exit(1)

print "unsigned char builtins_bitcode_" + target + "[] = {"
for line in as_out.stdout.readlines():
    length = length + len(line)
    for c in line:
        print ord(c)
        print ", "
print " 0 };\n\n"
print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"
name = target
if (len(sys.argv) > 2):
    name += "_" + runtime;
width = 16;
sys.stdout.write("unsigned char builtins_bitcode_" + name + "[] = {\n")

data = as_out.stdout.read()
for i in range(0, len(data), 1):
    sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))

    if i%width == (width-1):
        sys.stdout.write("\n")

sys.stdout.write("0x00 };\n\n")
sys.stdout.write("int builtins_bitcode_" + name + "_length = " + str(len(data)) + ";\n")

as_out.wait()
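
For reference, a sketch of how the arrays the script emits might be consumed on the C++ side; the "sse2" target name and the wrapper function are illustrative assumptions, not code from the tree:

#include "llvm/Support/MemoryBuffer.h"

// Symbols matching the script's output for a hypothetical "sse2" target.
extern unsigned char builtins_bitcode_sse2[];
extern int builtins_bitcode_sse2_length;

llvm::MemoryBuffer *GetBuiltinsBitcodeBuffer() {
    llvm::StringRef bytes((const char *)builtins_bitcode_sse2,
                          builtins_bitcode_sse2_length);
    // The buffer can then be handed to LLVM's bitcode reader.
    return llvm::MemoryBuffer::getMemBuffer(bytes, "builtins", false);
}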


@@ -8,7 +8,6 @@ REM Both the LLVM binaries and python need to be in the path
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin

msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
msbuild ispc_test.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release

msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild

buildispc.bat (new file, 11 lines)
@@ -0,0 +1,11 @@
@echo off

REM If LLVM_INSTALL_DIR isn't set globally in your environment,
REM it can be set here:
REM set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
REM set LLVM_VERSION=LLVM_3_2

REM Both the LLVM binaries and python need to be in the path
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin

msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
@@ -1,123 +0,0 @@
;; Copyright (c) 2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;; This file defines various functions that are used when generating
;; the "dispatch" object/assembly file that has entrypoints for each
;; exported function in a module that dispatch to the best available
;; variant of that function that will run on the system's CPU.

;; Stores the best target ISA that the system on which we're actually
;; running supports. -1 represents "uninitialized", otherwise this value
;; should correspond to one of the enumerant values of Target::ISA from
;; ispc.h.

@__system_best_isa = internal global i32 -1

declare void @abort() noreturn

;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
;; following code... Specifically, __get_system_isa should return a value
;; corresponding to one of the Target::ISA enumerant values that gives the
;; most capable ISA that the current system can run.
;;
;; #ifdef _MSC_VER
;; extern void __stdcall __cpuid(int info[4], int infoType);
;; #else
;; static void __cpuid(int info[4], int infoType) {
;;     __asm__ __volatile__ ("cpuid"
;;                           : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
;;                           : "0" (infoType));
;; }
;; #endif
;;
;; int32_t __get_system_isa() {
;;     int info[4];
;;     __cpuid(info, 1);
;;     /* NOTE: the values returned below must be the same as the
;;        corresponding enumerant values in Target::ISA. */
;;     if ((info[2] & (1 << 28)) != 0)
;;         return 2; // AVX
;;     else if ((info[2] & (1 << 19)) != 0)
;;         return 1; // SSE4
;;     else if ((info[3] & (1 << 26)) != 0)
;;         return 0; // SSE2
;;     else
;;         abort();
;; }

%0 = type { i32, i32, i32, i32 }

define internal i32 @__get_system_isa() nounwind ssp {
    %1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
    %2 = extractvalue %0 %1, 2
    %3 = extractvalue %0 %1, 3
    %4 = and i32 %2, 268435456
    %5 = icmp eq i32 %4, 0
    br i1 %5, label %6, label %13

; <label>:6                                       ; preds = %0
    %7 = and i32 %2, 524288
    %8 = icmp eq i32 %7, 0
    br i1 %8, label %9, label %13

; <label>:9                                       ; preds = %6
    %10 = and i32 %3, 67108864
    %11 = icmp eq i32 %10, 0
    br i1 %11, label %12, label %13

; <label>:12                                      ; preds = %9
    tail call void @abort() noreturn nounwind
    unreachable

; <label>:13                                      ; preds = %9, %6, %0
    %.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
    ret i32 %.0
}


;; This function is called by each of the dispatch functions we generate;
;; it sets @__system_best_isa if it is unset.

define internal void @__set_system_isa() {
entry:
    %bi = load i32* @__system_best_isa
    %unset = icmp eq i32 %bi, -1
    br i1 %unset, label %set_system_isa, label %done

set_system_isa:
    %bival = call i32 @__get_system_isa()
    store i32 %bival, i32* @__system_best_isa
    ret void

done:
    ret void
}
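
In C terms, the dispatch pattern this file implements for each exported function looks roughly like the following sketch; foo and its per-ISA variants are hypothetical, and the ISA numbering mirrors the Target::ISA values in the comment above:

static int system_best_isa = -1;          /* mirrors @__system_best_isa */

extern int  __get_system_isa(void);       /* as defined above */
extern void foo_sse2(void *args);         /* hypothetical variants */
extern void foo_sse4(void *args);
extern void foo_avx(void *args);

void foo(void *args) {                    /* generated entrypoint */
    if (system_best_isa == -1)
        system_best_isa = __get_system_isa();
    switch (system_best_isa) {
    case 2:  foo_avx(args);  break;
    case 1:  foo_sse4(args); break;
    default: foo_sse2(args); break;       /* case 0: SSE2 */
    }
}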


builtins-sse.ll (417 lines)
@@ -1,417 +0,0 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;; This file declares implementations of various stdlib builtins that
;; only require SSE version 1 and 2 functionality; this file, in turn
;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide
;; those definitions for them.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

int64minmax(4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
    %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
    ; do one N-R iteration to improve precision
    ;   float iv = __rcp_v(v);
    ;   return iv * (2. - v * iv);
    %v_iv = fmul <4 x float> %0, %call
    %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
    %iv_mul = fmul <4 x float> %call, %two_minus
    ret <4 x float> %iv_mul
}

define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
    ; do the rcpss call
    %vecval = insertelement <4 x float> undef, float %0, i32 0
    %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
    %scall = extractelement <4 x float> %call, i32 0

    ; do one N-R iteration to improve precision, as above
    %v_iv = fmul float %0, %scall
    %two_minus = fsub float 2., %v_iv
    %iv_mul = fmul float %scall, %two_minus
    ret float %iv_mul
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt

declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
    ; float is = __rsqrt_v(v);
    %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
    ; Newton-Raphson iteration to improve precision
    ;   return 0.5 * is * (3. - (v * is) * is);
    %v_is = fmul <4 x float> %v, %is
    %v_is_is = fmul <4 x float> %v_is, %is
    %three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
    %is_mul = fmul <4 x float> %is, %three_sub
    %half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
    ret <4 x float> %half_scale
}

define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
    ; uniform float is = extract(__rsqrt_u(v), 0);
    %v = insertelement <4 x float> undef, float %0, i32 0
    %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
    %is = extractelement <4 x float> %vis, i32 0

    ; Newton-Raphson iteration to improve precision
    ;   return 0.5 * is * (3. - (v * is) * is);
    %v_is = fmul float %0, %is
    %v_is_is = fmul float %v_is, %is
    %three_sub = fsub float 3., %v_is_is
    %is_mul = fmul float %is, %three_sub
    %half_scale = fmul float 0.5, %is_mul
    ret float %half_scale
}
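
The refinement step is easiest to read in scalar form. A C sketch (approx_rcp and approx_rsqrt stand in for the roughly 12-bit rcpss/rsqrtss hardware estimates; one Newton-Raphson iteration roughly doubles that precision):

float refined_rcp(float v, float iv) {         /* iv = approx_rcp(v) */
    return iv * (2.0f - v * iv);               /* matches __rcp_* above */
}

float refined_rsqrt(float v, float is) {       /* is = approx_rsqrt(v) */
    return 0.5f * is * (3.0f - (v * is) * is); /* matches __rsqrt_* above */
}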


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt

declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
    %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
    ret <4 x float> %call
}

define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
    sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
    ret float %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode

declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

define internal void @__fastmath() nounwind alwaysinline {
    %ptr = alloca i32
    %ptr8 = bitcast i32 * %ptr to i8 *
    call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
    %oldval = load i32 *%ptr

    ; turn on DAZ (64)/FTZ (32768) -> 32832
    %update = or i32 %oldval, 32832
    store i32 %update, i32 *%ptr
    call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
    ret void
}
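
Equivalently, with the SSE control-register intrinsics (a sketch of what @__fastmath does; DAZ is MXCSR bit 6, FTZ is bit 15):

#include <xmmintrin.h>

void fastmath(void) {
    unsigned int csr = _mm_getcsr();
    _mm_setcsr(csr | 0x8040);   /* 32768 (FTZ) | 64 (DAZ) == 32832 */
}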

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff

declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone


define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
    %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
    ret <4 x float> %ret
}

define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
    %ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
    ret <4 x float> %ret
}

define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
    %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
    store <4 x float> %s, <4 x float> * %1
    ret void
}

define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
    %ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
    ret <4 x float> %ret
}

define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
    %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
    ret <4 x float> %ret
}

define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
    %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
    ret <4 x float> %ret
}

define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
    %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
    ret <4 x float> %ret
}

define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
    %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
    ret <4 x float> %ret
}

define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
    %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
    ret <4 x float> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
    %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
    ret <4 x float> %call
}

define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
    sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
    ret float %ret
}

define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
    %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
    ret <4 x float> %call
}

define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
    sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
    ret float %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone

define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
    unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
    ret <4 x double> %ret
}


define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
    sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
    ret double %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
    binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
    ret <4 x double> %ret
}


define internal double @__min_uniform_double(double, double) nounwind readnone {
    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
    ret double %ret
}


define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
    binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
    ret <4 x double> %ret
}


define internal double @__max_uniform_double(double, double) nounwind readnone {
    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
    ret double %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
    %floatmask = bitcast <4 x i32> %0 to <4 x float>
    %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
    ret i32 %v
}

define internal float @__reduce_min_float(<4 x float>) nounwind readnone {
    reduce4(float, @__min_varying_float, @__min_uniform_float)
}

define internal float @__reduce_max_float(<4 x float>) nounwind readnone {
    reduce4(float, @__max_varying_float, @__max_uniform_float)
}

define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
    %v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
                        <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %m1 = add <4 x i32> %v1, %v
    %m1a = extractelement <4 x i32> %m1, i32 0
    %m1b = extractelement <4 x i32> %m1, i32 1
    %sum = add i32 %m1a, %m1b
    ret i32 %sum
}
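
The same reduction, sketched with SSE2 intrinsics: fold the high pair of lanes onto the low pair, then add the two surviving lanes.

#include <emmintrin.h>

int reduce_add_int32(__m128i v) {
    /* lanes become {v2, v3, v2, v3}; adding yields {v0+v2, v1+v3, ...} */
    __m128i folded = _mm_add_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 3, 2)));
    int a = _mm_cvtsi128_si32(folded);
    int b = _mm_cvtsi128_si32(_mm_shuffle_epi32(folded, _MM_SHUFFLE(1, 1, 1, 1)));
    return a + b;
}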

define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
    reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}

define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
    reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}

define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
    %r = call i32 @__reduce_add_int32(<4 x i32> %v)
    ret i32 %r
}

define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
    reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
    reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}


define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
    %v0 = shufflevector <4 x double> %0, <4 x double> undef,
                        <2 x i32> <i32 0, i32 1>
    %v1 = shufflevector <4 x double> %0, <4 x double> undef,
                        <2 x i32> <i32 2, i32 3>
    %sum = fadd <2 x double> %v0, %v1
    %e0 = extractelement <2 x double> %sum, i32 0
    %e1 = extractelement <2 x double> %sum, i32 1
    %m = fadd double %e0, %e1
    ret double %m
}

define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
    reduce4(double, @__min_varying_double, @__min_uniform_double)
}

define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
    reduce4(double, @__max_varying_double, @__max_uniform_double)
}

define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
    %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
                        <2 x i32> <i32 0, i32 1>
    %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
                        <2 x i32> <i32 2, i32 3>
    %sum = add <2 x i64> %v0, %v1
    %e0 = extractelement <2 x i64> %sum, i32 0
    %e1 = extractelement <2 x i64> %sum, i32 1
    %m = add i64 %e0, %e1
    ret i64 %m
}

define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
    reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}

define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
    reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}

define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
    reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
    reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

reduce_equal(4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

masked_store_blend_8_16_by_4()

gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)

load_masked(4, i8, 8, 1)
load_masked(4, i16, 16, 2)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

; define these with the macros from stdlib.m4

gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)

gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)

builtins-sse2.ll (357 lines)
@@ -1,357 +0,0 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Define the standard library builtins for the SSE2 target

; Define some basics for a 4-wide target
stdlib_core(4)
packed_load_and_store(4)
scans(4)

; Include the various definitions of things that only require SSE1 and SSE2
include(`builtins-sse.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; There are not any rounding instructions in SSE2, so we have to emulate
;; the functionality with multiple instructions...

; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
;     unsigned int sign = signbits(x);
;     unsigned int ix = intbits(x);
;     ix ^= sign;
;     x = floatbits(ix);
;     x += 0x1.0p23f;
;     x -= 0x1.0p23f;
;     ix = intbits(x);
;     ix ^= sign;
;     x = floatbits(ix);
;     return x;
; }

define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
    %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
    %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
    %bitop.i = xor <4 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
    %int_to_float_bitcast.i.i40.i = bitcast <4 x i32> %bitop.i to <4 x float>
    %binop.i = fadd <4 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
    %binop21.i = fadd <4 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
    %float_to_int_bitcast.i.i.i = bitcast <4 x float> %binop21.i to <4 x i32>
    %bitop31.i = xor <4 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
    %int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop31.i to <4 x float>
    ret <4 x float> %int_to_float_bitcast.i.i.i
}

define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
    %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
    %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
    %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
    %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
    %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
    %binop21.i = fadd float %binop.i, -8.388608e+06
    %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
    %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
    %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
    ret float %int_to_float_bitcast.i.i.i
}

;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...

; export float Floor(float x) {
;     float y = Round(x);
;     unsigned int cmp = y > x ? 0xffffffff : 0;
;     float delta = -1.f;
;     unsigned int idelta = intbits(delta);
;     idelta &= cmp;
;     delta = floatbits(idelta);
;     return y + delta;
; }

define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
    %calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
    %bincmp.i = fcmp ogt <4 x float> %calltmp.i, %0
    %val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
    %bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
    %int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
    %binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
    ret <4 x float> %binop.i
}

define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
    %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
    %bincmp.i = fcmp ogt float %calltmp.i, %0
    %selectexpr.i = sext i1 %bincmp.i to i32
    %bitop.i = and i32 %selectexpr.i, -1082130432
    %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
    %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
    ret float %binop.i
}

;; And here is the code we compiled to get the __ceil* functions below
;
; export uniform float Ceil(uniform float x) {
;     uniform float y = Round(x);
;     uniform int yltx = y < x ? 0xffffffff : 0;
;     uniform float delta = 1.f;
;     uniform int idelta = intbits(delta);
;     idelta &= yltx;
;     delta = floatbits(idelta);
;     return y + delta;
; }

define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
    %calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
    %bincmp.i = fcmp olt <4 x float> %calltmp.i, %0
    %val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
    %bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
    %int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
    %binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
    ret <4 x float> %binop.i
}

define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
    %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
    %bincmp.i = fcmp olt float %calltmp.i, %0
    %selectexpr.i = sext i1 %bincmp.i to i32
    %bitop.i = and i32 %selectexpr.i, 1065353216
    %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
    %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
    ret float %binop.i
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare double @round(double)
declare double @floor(double)
declare double @ceil(double)

define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
    unary1to4(double, @round)
}

define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
    %r = call double @round(double %0)
    ret double %r
}

define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
    unary1to4(double, @floor)
}

define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
    %r = call double @floor(double %0)
    ret double %r
}

define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
    unary1to4(double, @ceil)
}

define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
    %r = call double @ceil(double %0)
    ret double %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max

; There is no blend instruction with SSE2, so we simulate it with bit
; operations on i32s. For these two vselect functions, for each
; vector element, if the mask is on, we return the corresponding value
; from %1, and otherwise return the value from %0.

define internal <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32>,
                                         <4 x i32> %mask) nounwind readnone alwaysinline {
    %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
    %cleared_old = and <4 x i32> %0, %notmask
    %masked_new = and <4 x i32> %1, %mask
    %new = or <4 x i32> %cleared_old, %masked_new
    ret <4 x i32> %new
}

define internal <4 x float> @__vselect_float(<4 x float>, <4 x float>,
                                             <4 x i32> %mask) nounwind readnone alwaysinline {
    %v0 = bitcast <4 x float> %0 to <4 x i32>
    %v1 = bitcast <4 x float> %1 to <4 x i32>
    %r = call <4 x i32> @__vselect_i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %mask)
    %rf = bitcast <4 x i32> %r to <4 x float>
    ret <4 x float> %rf
}
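
The select-by-mask trick is the scalar identity below, applied lane by lane; with an all-ones/all-zeros mask it needs only three logical ops (a sketch of the same logic in C):

static inline unsigned vselect(unsigned oldval, unsigned newval, unsigned mask) {
    return (oldval & ~mask) | (newval & mask);   /* mask on -> take newval */
}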


; To do vector integer min and max, we do the vector compare and then sign
; extend the i1 vector result to an i32 mask. The __vselect does the
; rest...

define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
    %c = icmp slt <4 x i32> %0, %1
    %mask = sext <4 x i1> %c to <4 x i32>
    %v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
    ret <4 x i32> %v
}

define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
    %c = icmp slt i32 %0, %1
    %r = select i1 %c, i32 %0, i32 %1
    ret i32 %r
}

define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
    %c = icmp sgt <4 x i32> %0, %1
    %mask = sext <4 x i1> %c to <4 x i32>
    %v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
    ret <4 x i32> %v
}

define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
    %c = icmp sgt i32 %0, %1
    %r = select i1 %c, i32 %0, i32 %1
    ret i32 %r
}

; The functions for unsigned ints are similar, just with unsigned
; comparison functions...

define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
    %c = icmp ult <4 x i32> %0, %1
    %mask = sext <4 x i1> %c to <4 x i32>
    %v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
    ret <4 x i32> %v
}

define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
    %c = icmp ult i32 %0, %1
    %r = select i1 %c, i32 %0, i32 %1
    ret i32 %r
}

define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
    %c = icmp ugt <4 x i32> %0, %1
    %mask = sext <4 x i1> %c to <4 x i32>
    %v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
    ret <4 x i32> %v
}

define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
    %c = icmp ugt i32 %0, %1
    %r = select i1 %c, i32 %0, i32 %1
    ret i32 %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)

define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
    %val = call i32 @llvm.ctpop.i32(i32 %0)
    ret i32 %val
}

define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
    %val = call i64 @llvm.ctpop.i64(i64 %0)
    ret i64 %val
}


define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
    %v1 = shufflevector <4 x float> %v, <4 x float> undef,
                        <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %m1 = fadd <4 x float> %v1, %v
    %m1a = extractelement <4 x float> %m1, i32 0
    %m1b = extractelement <4 x float> %m1, i32 1
    %sum = fadd float %m1a, %m1b
    ret float %sum
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
                                     <4 x i32> %mask) nounwind alwaysinline {
    %val = load <4 x i32> * %0, align 4
    %newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
    store <4 x i32> %newval, <4 x i32> * %0, align 4
    ret void
}

define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                     <4 x i32> %mask) nounwind alwaysinline {
    %oldValue = load <4 x i64>* %ptr, align 8

    ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
    ; are actually bitcast <2 x i64> values
    ;
    ; set up the first two 64-bit values
    %old01 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
                           <2 x i32> <i32 0, i32 1>
    %old01f = bitcast <2 x i64> %old01 to <4 x float>
    %new01 = shufflevector <4 x i64> %new, <4 x i64> undef,
                           <2 x i32> <i32 0, i32 1>
    %new01f = bitcast <2 x i64> %new01 to <4 x float>
    ; compute mask--note that the indices 0 and 1 are doubled-up
    %mask01 = shufflevector <4 x i32> %mask, <4 x i32> undef,
                            <4 x i32> <i32 0, i32 0, i32 1, i32 1>
    ; and blend the two values
    %result01f = call <4 x float> @__vselect_float(<4 x float> %old01f, <4 x float> %new01f, <4 x i32> %mask01)
    %result01 = bitcast <4 x float> %result01f to <2 x i64>

    ; and again
    %old23 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
                           <2 x i32> <i32 2, i32 3>
    %old23f = bitcast <2 x i64> %old23 to <4 x float>
    %new23 = shufflevector <4 x i64> %new, <4 x i64> undef,
                           <2 x i32> <i32 2, i32 3>
    %new23f = bitcast <2 x i64> %new23 to <4 x float>
    ; compute mask--note that the values 2 and 3 are doubled-up
    %mask23 = shufflevector <4 x i32> %mask, <4 x i32> undef,
                            <4 x i32> <i32 2, i32 2, i32 3, i32 3>
    ; and blend the two values
    %result23f = call <4 x float> @__vselect_float(<4 x float> %old23f, <4 x float> %new23f, <4 x i32> %mask23)
    %result23 = bitcast <4 x float> %result23f to <2 x i64>

    ; reconstruct the final <4 x i64> vector
    %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    store <4 x i64> %final, <4 x i64> * %ptr, align 8
    ret void
}
|
||||
|
||||
@@ -1,761 +0,0 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
;; This file defines the target for "double-pumped" SSE4, i.e. running
|
||||
;; with 8-wide vectors
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; standard 8-wide definitions from m4 macros
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul <8 x float> %0, %call
|
||||
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <8 x float> %call, %two_minus
|
||||
ret <8 x float> %iv_mul
|
||||
}
|
||||
|
||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float iv = extract(__rcp_u(v), 0);
|
||||
; return iv * (2. - v * iv);
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||
%scall = extractelement <4 x float> %call, i32 0
|
||||
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul float %0, %scall
|
||||
%two_minus = fsub float 2., %v_iv
|
||||
%iv_mul = fmul float %scall, %two_minus
|
||||
ret float %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <8 x float> %v, %is
|
||||
%v_is_is = fmul <8 x float> %v_is, %is
|
||||
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <8 x float> %is, %three_sub
|
||||
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <8 x float> %half_scale
|
||||
}
|
||||
|
||||
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||
%is = extractelement <4 x float> %vis, i32 0
|
||||
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul float %0, %is
|
||||
%v_is_is = fmul float %v_is, %is
|
||||
%three_sub = fsub float 3., %v_is_is
|
||||
%is_mul = fmul float %is, %three_sub
|
||||
%half_scale = fmul float 0.5, %is_mul
|
||||
ret float %half_scale
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fast math
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
|
||||
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
|
||||
define internal <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_sinf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_cosf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal void @__svml_sincos(<8 x float>, <8 x float> *,
|
||||
<8 x float> *) nounwind readnone alwaysinline {
|
||||
; call svml_sincosf4 two times with the two 4-wide sub-vectors
|
||||
%a = shufflevector <8 x float> %0, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%b = shufflevector <8 x float> %0, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
|
||||
%cospa = alloca <4 x float>
|
||||
%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
|
||||
|
||||
%cospb = alloca <4 x float>
|
||||
%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
|
||||
|
||||
%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 4, i32 5, i32 6, i32 7>
|
||||
store <8 x float> %sin, <8 x float> * %1
|
||||
|
||||
%cosa = load <4 x float> * %cospa
|
||||
%cosb = load <4 x float> * %cospb
|
||||
%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 4, i32 5, i32 6, i32 7>
|
||||
store <8 x float> %cos, <8 x float> * %2
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_tanf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_atanf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_atan2(<8 x float>,
|
||||
<8 x float>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_expf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
|
||||
unary4to8(ret, float, @__svml_logf4, %0)
|
||||
ret <8 x float> %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__svml_pow(<8 x float>,
|
||||
<8 x float>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, float, @__svml_powf4, %0, %1)
  ret <8 x float> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
  binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
  ret <8 x float> %call
}

define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
  ret float %ret
}

define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
  binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
  ret <8 x float> %call
}

define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
  ret float %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max

declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone

define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <8 x i32> %call
}

define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret i32 %ret
}

define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <8 x i32> %call
}

define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret i32 %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max

declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone

define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
                                                <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <8 x i32> %call
}

define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret i32 %ret
}

define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
                                                <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <8 x i32> %call
}

define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret i32 %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  ; first do two 4-wide movmsk calls
  %floatmask = bitcast <8 x i32> %0 to <8 x float>
  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone

  ; and shift the second result over by 4 before ORing it with the value
  ; of the first one
  %v1s = shl i32 %v1, 4
  %v = or i32 %v0, %v1s
  ret i32 %v
}
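
; Illustrative only: the same combining step written with the SSE intrinsics
; in C (a sketch for exposition; movmsk8 is not a function in this file).
;
;    #include <xmmintrin.h>
;    // each _mm_movemask_ps() packs the 4 lane sign bits into bits 0..3
;    static int movmsk8(__m128 lo, __m128 hi) {
;        return _mm_movemask_ps(lo) | (_mm_movemask_ps(hi) << 4);
;    }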

define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
  reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
}

define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
  reduce8by4(float, @llvm.x86.sse.max.ps, @__max_uniform_float)
}

; helper function for reduce_add_int32
define internal <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
                                            <4 x i32> %v1) nounwind readnone alwaysinline {
  %v = add <4 x i32> %v0, %v1
  ret <4 x i32> %v
}

; helper function for reduce_add_int32
define internal i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
  %v = add i32 %0, %1
  ret i32 %v
}

define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @__vec4_add_int32, @__add_int32)
}

define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pminsd, @__min_uniform_int32)
}

define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
}

define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
  %r = call i32 @__reduce_add_int32(<8 x i32> %v)
  ret i32 %r
}

define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
}

define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
}

define internal <4 x double> @__add_varying_double(<4 x double>,
                                                   <4 x double>) nounwind readnone alwaysinline {
  %r = fadd <4 x double> %0, %1
  ret <4 x double> %r
}

define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
  %r = fadd double %0, %1
  ret double %r
}

define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
  reduce8by4(double, @__add_varying_double, @__add_uniform_double)
}

define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
  reduce8(double, @__min_varying_double, @__min_uniform_double)
}

define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
  reduce8(double, @__max_varying_double, @__max_uniform_double)
}

define internal <4 x i64> @__add_varying_int64(<4 x i64>,
                                               <4 x i64>) nounwind readnone alwaysinline {
  %r = add <4 x i64> %0, %1
  ret <4 x i64> %r
}

define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %r = add i64 %0, %1
  ret i64 %r
}

define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
  reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
}

define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}

define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}

define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

reduce_equal(8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)

load_masked(8, i8, 8, 1)
load_masked(8, i16, 16, 2)
load_masked(8, i32, 32, 4)
load_masked(8, i64, 64, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)

gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float rounding

declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  round4to8(%0, 8)
}

define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  ; the roundss intrinsic is a total mess--docs say:
  ;
  ; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
  ;
  ; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
  ; on b0. The higher order 96 bits are copied directly from input parameter a. The
  ; return value is described by the following equations:
  ;
  ; r0 = RND(b0)
  ; r1 = a1
  ; r2 = a2
  ; r3 = a3
  ;
  ; It doesn't matter what we pass as a, since we only need the r0 value
  ; here. So we pass the same register for both.
  %xi = insertelement <4 x float> undef, float %0, i32 0
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}
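
; Illustrative only: the equivalent scalar rounding via the SSE4.1 intrinsic
; in C (a sketch; round_nearest is a name made up for exposition).
;
;    #include <smmintrin.h>
;    static float round_nearest(float x) {
;        __m128 v = _mm_set_ss(x);
;        // 8 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; only lane 0 of
;        // the second operand matters, so the same register is passed twice
;        v = _mm_round_ss(v, v, 8);
;        return _mm_cvtss_f32(v);
;    }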

define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round4to8(%0, 9)
}

define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}

define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round4to8(%0, 10)
}

define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
  round2to8double(%0, 8)
}

define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
  %xi = insertelement <2 x double> undef, double %0, i32 0
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}

define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round2to8double(%0, 9)
}

define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}

define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round2to8double(%0, 10)
}

define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.ctpop.i32(i32) nounwind readnone

define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %call = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %call
}

declare i64 @llvm.ctpop.i64(i64) nounwind readnone

define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  %call = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %call
}

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
  %a = shufflevector <8 x float> %0, <8 x float> undef,
         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b = shufflevector <8 x float> %0, <8 x float> undef,
         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ab = fadd <4 x float> %a, %b
  %hab = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %ab, <4 x float> %ab)
  %a_scalar = extractelement <4 x float> %hab, i32 0
  %b_scalar = extractelement <4 x float> %hab, i32 1
  %sum = fadd float %a_scalar, %b_scalar
  ret float %sum
}
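
; Illustrative only: the same reduction written with C intrinsics (a sketch;
; reduce_add8 is a name made up for exposition).
;
;    #include <pmmintrin.h>
;    static float reduce_add8(__m128 lo, __m128 hi) {
;        __m128 ab = _mm_add_ps(lo, hi);           // four pairwise partial sums
;        __m128 h  = _mm_hadd_ps(ab, ab);          // lane 0 = ab0+ab1, lane 1 = ab2+ab3
;        __m128 h1 = _mm_shuffle_ps(h, h, 1);      // bring lane 1 down to lane 0
;        return _mm_cvtss_f32(_mm_add_ss(h, h1));  // (ab0+ab1) + (ab2+ab3)
;    }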

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(8, i32, 32)
gen_masked_store(8, i64, 64)

masked_store_blend_8_16_by_8()

declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                             <4 x float>) nounwind readnone

define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                                     <8 x i32> %mask) nounwind alwaysinline {
  ; do two 4-wide blends with blendvps
  %mask_as_float = bitcast <8 x i32> %mask to <8 x float>
  %mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
              <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
              <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %oldValue = load <8 x i32>* %0, align 4
  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
  %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
  %old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
             <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %old_b = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
             <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %new_a = shufflevector <8 x float> %newAsFloat, <8 x float> undef,
             <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %new_b = shufflevector <8 x float> %newAsFloat, <8 x float> undef,
             <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %blend_a = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old_a, <4 x float> %new_a,
                                                       <4 x float> %mask_a)
  %blend_b = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old_b, <4 x float> %new_b,
                                                       <4 x float> %mask_b)
  %blend = shufflevector <4 x float> %blend_a, <4 x float> %blend_b,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
  ret void
}
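
; Illustrative only: one 4-wide half of the blend above, as a C sketch
; (masked_store4 is a name made up for exposition).
;
;    #include <smmintrin.h>
;    static void masked_store4(int *ptr, __m128i new_vals, __m128i mask) {
;        __m128 old = _mm_loadu_ps((float *)ptr);
;        // blendvps takes lanes from its second operand wherever the
;        // corresponding mask lane's sign bit is set
;        __m128 blended = _mm_blendv_ps(old, _mm_castsi128_ps(new_vals),
;                                       _mm_castsi128_ps(mask));
;        _mm_storeu_ps((float *)ptr, blended);
;    }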

define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
                                     <8 x i32> %mask) nounwind alwaysinline {
  ; implement this as 4 blends of <4 x i32>s, which are actually bitcast
  ; <2 x i64>s...

  %mask_as_float = bitcast <8 x i32> %mask to <8 x float>

  %old = load <8 x i64>* %ptr, align 8

  ; set up the first two 64-bit values
  %old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
  %old01f = bitcast <2 x i64> %old01 to <4 x float>
  %new01 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
  %new01f = bitcast <2 x i64> %new01 to <4 x float>
  ; compute mask--note that the values mask0 and mask1 are doubled-up
  %mask01 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
              <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  ; and blend the two values
  %result01f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old01f,
                                                         <4 x float> %new01f,
                                                         <4 x float> %mask01)
  %result01 = bitcast <4 x float> %result01f to <2 x i64>

  ; and again
  %old23 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
  %old23f = bitcast <2 x i64> %old23 to <4 x float>
  %new23 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
  %new23f = bitcast <2 x i64> %new23 to <4 x float>
  %mask23 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
              <4 x i32> <i32 2, i32 2, i32 3, i32 3>
  %result23f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old23f,
                                                         <4 x float> %new23f,
                                                         <4 x float> %mask23)
  %result23 = bitcast <4 x float> %result23f to <2 x i64>

  %old45 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
  %old45f = bitcast <2 x i64> %old45 to <4 x float>
  %new45 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
  %new45f = bitcast <2 x i64> %new45 to <4 x float>
  %mask45 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
              <4 x i32> <i32 4, i32 4, i32 5, i32 5>
  %result45f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old45f,
                                                         <4 x float> %new45f,
                                                         <4 x float> %mask45)
  %result45 = bitcast <4 x float> %result45f to <2 x i64>

  %old67 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
  %old67f = bitcast <2 x i64> %old67 to <4 x float>
  %new67 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
  %new67f = bitcast <2 x i64> %new67 to <4 x float>
  %mask67 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
              <4 x i32> <i32 6, i32 6, i32 7, i32 7>
  %result67f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old67f,
                                                         <4 x float> %new67f,
                                                         <4 x float> %mask67)
  %result67 = bitcast <4 x float> %result67f to <2 x i64>

  %final0123 = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                 <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %final4567 = shufflevector <2 x i64> %result45, <2 x i64> %result67,
                 <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %final = shufflevector <4 x i64> %final0123, <4 x i64> %final4567,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i64> %final, <8 x i64> * %ptr, align 8
  ret void
}
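
; Illustrative only: how a 32-bit mask lane is widened to cover a 64-bit
; element, matching the doubled-up shuffle indices above (widen_mask2 is a
; name made up for exposition).
;
;    #include <xmmintrin.h>
;    static __m128 widen_mask2(__m128 m) {
;        // duplicate lanes 0 and 1: result = { m0, m0, m1, m1 }, so each
;        // 32-bit mask value covers one full 64-bit element in a blendvps
;        return _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 0, 0));
;    }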

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone

define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
  unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
  ret <8 x double> %ret
}

define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.pd, %0)
  ret double %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision float min/max

declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
  ret <8 x double> %ret
}

define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.pd, %0, %1)
  ret double %ret
}

define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret <8 x double> %ret
}

define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret double %ret
}

builtins-sse4.ll (300 lines deleted)
@@ -1,300 +0,0 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Define common 4-wide stuff
stdlib_core(4)
packed_load_and_store(4)
scans(4)

; Define the stuff that can be done with base SSE1/SSE2 instructions
include(`builtins-sse.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
  ret <4 x float> %call
}

define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  ; the roundss intrinsic is a total mess--docs say:
  ;
  ; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
  ;
  ; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
  ; on b0. The higher order 96 bits are copied directly from input parameter a. The
  ; return value is described by the following equations:
  ;
  ; r0 = RND(b0)
  ; r1 = a1
  ; r2 = a2
  ; r3 = a3
  ;
  ; It doesn't matter what we pass as a, since we only need the r0 value
  ; here. So we pass the same register for both. Further, only the 0th
  ; element of the b parameter matters
  %xi = insertelement <4 x float> undef, float %0, i32 0
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}

define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
  ret <4 x float> %call
}

define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}

define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
  ret <4 x float> %call
}

define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
  round2to4double(%0, 8)
}

define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
  %xi = insertelement <2 x double> undef, double %0, i32 0
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}

define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round2to4double(%0, 9)
}

define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}

define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round2to4double(%0, 10)
}

define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max

declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone

define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}

define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret i32 %ret
}

define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}

define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret i32 %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max

declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone

define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}

define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret i32 %ret
}

define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}

define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret i32 %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.ctpop.i32(i32) nounwind readnone

define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %call = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %call
}

declare i64 @llvm.ctpop.i64(i64) nounwind readnone

define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  %call = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %call
}

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
  %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
  %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
  %scalar = extractelement <4 x float> %v2, i32 0
  ret float %scalar
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                             <4 x float>) nounwind readnone

define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
                                     <4 x i32> %mask) nounwind alwaysinline {
  %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
  %oldValue = load <4 x i32>* %0, align 4
  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
                                                     <4 x float> %newAsFloat,
                                                     <4 x float> %mask_as_float)
  %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
  ret void
}

define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                     <4 x i32> %i32mask) nounwind alwaysinline {
  %oldValue = load <4 x i64>* %ptr, align 8
  %mask = bitcast <4 x i32> %i32mask to <4 x float>

  ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
  ; are actually bitcast <2 x i64> values
  ;
  ; set up the first two 64-bit values
  %old01 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
             <2 x i32> <i32 0, i32 1>
  %old01f = bitcast <2 x i64> %old01 to <4 x float>
  %new01 = shufflevector <4 x i64> %new, <4 x i64> undef,
             <2 x i32> <i32 0, i32 1>
  %new01f = bitcast <2 x i64> %new01 to <4 x float>
  ; compute mask--note that the indices 0 and 1 are doubled-up
  %mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
              <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  ; and blend the two values
  %result01f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old01f,
                                                         <4 x float> %new01f,
                                                         <4 x float> %mask01)
  %result01 = bitcast <4 x float> %result01f to <2 x i64>

  ; and again
  %old23 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
             <2 x i32> <i32 2, i32 3>
  %old23f = bitcast <2 x i64> %old23 to <4 x float>
  %new23 = shufflevector <4 x i64> %new, <4 x i64> undef,
             <2 x i32> <i32 2, i32 3>
  %new23f = bitcast <2 x i64> %new23 to <4 x float>
  ; compute mask--note that the values 2 and 3 are doubled-up
  %mask23 = shufflevector <4 x float> %mask, <4 x float> undef,
              <4 x i32> <i32 2, i32 2, i32 3, i32 3>
  ; and blend the two values
  %result23f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old23f,
                                                         <4 x float> %new23f,
                                                         <4 x float> %mask23)
  %result23 = bitcast <4 x float> %result23f to <2 x i64>

  ; reconstruct the final <4 x i64> vector
  %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
             <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %final, <4 x i64> * %ptr, align 8
  ret void
}

builtins.cpp: 1380 lines changed (file diff suppressed because it is too large)

builtins.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2015, Intel Corporation
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,11 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 /** @file builtins.h
     @brief Declarations of functions related to builtins and the
            standard library
 */

@@ -56,6 +56,7 @@
 void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
                   bool includeStdlib);

 void AddBitcodeToModule(const unsigned char *bitcode, int length,
-                        llvm::Module *module, SymbolTable *symbolTable = NULL);
+                        llvm::Module *module, SymbolTable *symbolTable = NULL,
+                        bool warn = true);

 #endif // ISPC_STDLIB_H

builtins.m4: 2007 lines changed (file diff suppressed because it is too large)

builtins/__do_print_nvptx.cu (new file, 163 lines)
@@ -0,0 +1,163 @@
/*
  Copyright (c) 2014-2015, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.


  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
  IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <cstdio>

#define PRINT_BUF_SIZE 4096
#define uint64_t unsigned long long

static __device__ size_t d_strlen(const char *str)
{
    const char *s;

    for (s = str; *s; ++s)
        ;
    return (s - str);
}

static __device__ char* d_strncat(char *dest, const char *src, size_t n)
{
    size_t dest_len = d_strlen(dest);
    size_t i;

    for (i = 0 ; i < n && src[i] != '\0' ; i++)
        dest[dest_len + i] = src[i];
    dest[dest_len + i] = '\0';

    return dest;
}

#define APPEND(str)                                      \
    do {                                                 \
        int offset = bufp - &printString[0];             \
        *bufp = '\0';                                    \
        d_strncat(bufp, str, PRINT_BUF_SIZE-offset);     \
        bufp += d_strlen(str);                           \
        if (bufp >= &printString[PRINT_BUF_SIZE])        \
            goto done;                                   \
    } while (0) /* eat semicolon */


#define PRINT_SCALAR(fmt, type)                          \
    sprintf(tmpBuf, fmt, *((type *)ptr));                \
    APPEND(tmpBuf);                                      \
    break

#define PRINT_VECTOR(fmt, type)                          \
    *bufp++ = '[';                                       \
    if (bufp == &printString[PRINT_BUF_SIZE]) break;     \
    for (int i = 0; i < width; ++i) {                    \
        /* only print the value if the current lane is executing */ \
        type val0 = *((type*)ptr);                       \
        type val = val0;                                 \
        if (mask & (1ull<<i))                            \
            sprintf(tmpBuf, fmt, val);                   \
        else                                             \
            sprintf(tmpBuf, "(( * )) ");                 \
        APPEND(tmpBuf);                                  \
        *bufp++ = (i != width-1 ? ',' : ']');            \
    }                                                    \
    break

extern "C"
__device__ void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
                                 void **args) {
    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
    char *bufp = &printString[0];
    char tmpBuf[256];
    const char trueBuf[] = "true";
    const char falseBuf[] = "false";

    int argCount = 0;
    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
        // Format strings are just single percent signs.
        if (*format != '%') {
            *bufp++ = *format;
        }
        else {
            if (*types) {
                void *ptr = args[argCount++];
                // Based on the encoding in the types string, cast the
                // value appropriately and print it with a reasonable
                // printf() formatting string.
                switch (*types) {
                case 'b': {
                    const char *tmpBuf1 = *((bool *)ptr) ? trueBuf : falseBuf;
                    APPEND(tmpBuf1);
                    break;
                }
                case 'B': {
                    *bufp++ = '[';
                    if (bufp == &printString[PRINT_BUF_SIZE])
                        break;
                    for (int i = 0; i < width; ++i) {
                        bool val0 = *((bool*)ptr);
                        bool val = val0;
                        if (mask & (1ull << i)) {
                            const char *tmpBuf1 = val ? trueBuf : falseBuf;
                            APPEND(tmpBuf1);
                        }
                        else
                            APPEND("_________");
                        *bufp++ = (i != width-1) ? ',' : ']';
                    }
                    break;
                }
                case 'i': PRINT_SCALAR("%d", int);
                case 'I': PRINT_VECTOR("%d", int);
                case 'u': PRINT_SCALAR("%u", unsigned int);
                case 'U': PRINT_VECTOR("%u", unsigned int);
                case 'f': PRINT_SCALAR("%f", float);
                case 'F': PRINT_VECTOR("%f", float);
                case 'l': PRINT_SCALAR("%lld", long long);
                case 'L': PRINT_VECTOR("%lld", long long);
                case 'v': PRINT_SCALAR("%llu", unsigned long long);
                case 'V': PRINT_VECTOR("%llu", unsigned long long);
                case 'd': PRINT_SCALAR("%f", double);
                case 'D': PRINT_VECTOR("%f", double);
                case 'p': PRINT_SCALAR("%p", void *);
                case 'P': PRINT_VECTOR("%p", void *);
                default:
                    APPEND("UNKNOWN TYPE ");
                    *bufp++ = *types;
                }
                ++types;
            }
        }
        ++format;
    }

done:
    *bufp = '\n'; bufp++;
    *bufp = '\0';
}

builtins-c.c
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2013, Intel Corporation
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 /** @file builtins-c.c
@@ -50,6 +50,16 @@
    available to ispc programs at compile time automatically.
  */

+#ifdef _MSC_VER
+// We do want old school sprintf and don't want secure Microsoft extensions.
+// And we also don't want warnings about it, so the define.
+#define _CRT_SECURE_NO_WARNINGS
+#else
+// Some versions of glibc have a "fortification" feature, which expands sprintf
+// to __builtin___sprintf_chk(..., __builtin_object_size(...), ...).
+// We don't want this kind of expansion, as we don't support these intrinsics.
+#define _FORTIFY_SOURCE 0
+#endif

 #ifndef _MSC_VER
 #include <unistd.h>
@@ -59,22 +69,39 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
 #include <string.h>

 typedef int Bool;

-#define PRINT_SCALAR(fmt, type) \
-    printf(fmt, *((type *)ptr)); \
+#define PRINT_BUF_SIZE 4096
+
+#define APPEND(str) \
+    do { \
+        int offset = bufp - &printString[0]; \
+        *bufp = '\0'; \
+        strncat(bufp, str, PRINT_BUF_SIZE-offset); \
+        bufp += strlen(str); \
+        if (bufp >= &printString[PRINT_BUF_SIZE]) \
+            goto done; \
+    } while (0) /* eat semicolon */
+
+#define PRINT_SCALAR(fmt, type) \
+    sprintf(tmpBuf, fmt, *((type *)ptr)); \
+    APPEND(tmpBuf); \
     break

 #define PRINT_VECTOR(fmt, type) \
-    putchar('['); \
+    *bufp++ = '['; \
+    if (bufp == &printString[PRINT_BUF_SIZE]) break; \
     for (int i = 0; i < width; ++i) { \
         /* only print the value if the current lane is executing */ \
-        if (mask & (1<<i)) \
-            printf(fmt, ((type *)ptr)[i]); \
+        if (mask & (1ull<<i)) \
+            sprintf(tmpBuf, fmt, ((type *)ptr)[i]); \
         else \
-            printf("((" fmt "))", ((type *)ptr)[i]); \
-        putchar(i != width-1 ? ',' : ']'); \
+            sprintf(tmpBuf, "((" fmt "))", ((type *)ptr)[i]); \
+        APPEND(tmpBuf); \
+        *bufp++ = (i != width-1 ? ',' : ']'); \
     } \
     break

@@ -84,21 +111,23 @@
   @param format  Print format string
   @param types   Encoded types of the values being printed.
                  (See lEncodeType()).
   @param width   Vector width of the compilation target
   @param mask    Current lane mask when the print statement is called
   @param args    Array of pointers to the values to be printed
 */
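
/* Illustrative call (hypothetical values, not from the sources): for an ispc
   statement like print("x = %, v = %\n", xf, vi) on a 4-wide target with all
   lanes on, the compiler emits roughly
       __do_print("x = %, v = %\n", "fI", 4, 0xF, args);
   where args[0] points at the float xf and args[1] at the 4 int lanes of vi. */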
-void __do_print(const char *format, const char *types, int width, int mask,
+void __do_print(const char *format, const char *types, int width, uint64_t mask,
                 void **args) {
     if (mask == 0)
         return;
+    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
+    char *bufp = &printString[0];
+    char tmpBuf[256];

     int argCount = 0;
-    while (*format) {
+    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
         // Format strings are just single percent signs.
-        if (*format != '%')
-            putchar(*format);
+        if (*format != '%') {
+            *bufp++ = *format;
+        }
         else {
             if (*types) {
                 void *ptr = args[argCount++];
@@ -107,17 +136,22 @@
                 // printf() formatting string.
                 switch (*types) {
                 case 'b': {
-                    printf("%s", *((Bool *)ptr) ? "true" : "false");
+                    sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
+                    APPEND(tmpBuf);
                     break;
                 }
                 case 'B': {
-                    putchar('[');
+                    *bufp++ = '[';
+                    if (bufp == &printString[PRINT_BUF_SIZE])
+                        break;
                     for (int i = 0; i < width; ++i) {
-                        if (mask & (1<<i))
-                            printf("%s", ((Bool *)ptr)[i] ? "true" : "false");
+                        if (mask & (1ull << i)) {
+                            sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
+                            APPEND(tmpBuf);
+                        }
                         else
-                            printf("_________");
-                        putchar(i != width-1 ? ',' : ']');
+                            APPEND("_________");
+                        *bufp++ = (i != width-1) ? ',' : ']';
                     }
                     break;
                 }
@@ -133,21 +167,102 @@
                 case 'V': PRINT_VECTOR("%llu", unsigned long long);
                 case 'd': PRINT_SCALAR("%f", double);
                 case 'D': PRINT_VECTOR("%f", double);
                 case 'p': PRINT_SCALAR("%p", void *);
                 case 'P': PRINT_VECTOR("%p", void *);
                 default:
-                    printf("UNKNOWN TYPE ");
-                    putchar(*types);
+                    APPEND("UNKNOWN TYPE ");
+                    *bufp++ = *types;
                 }
                 ++types;
             }
         }
         ++format;
     }

+done:
+    *bufp = '\0';
+    fputs(printString, stdout);
+    fflush(stdout);
 }

+/* this is print for PTX target only */
+int __puts_nvptx(const char *);
+void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
+                      void **args) {
+#if 0
+    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
+    char *bufp = &printString[0];
+    char tmpBuf[256];
+
+    int argCount = 0;
+    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
+        // Format strings are just single percent signs.
+        if (*format != '%') {
+            *bufp++ = *format;
+        }
+        else {
+            if (*types) {
+                void *ptr = args[argCount++];
+                // Based on the encoding in the types string, cast the
+                // value appropriately and print it with a reasonable
+                // printf() formatting string.
+                switch (*types) {
+                case 'b': {
+                    sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
+                    APPEND(tmpBuf);
+                    break;
+                }
+                case 'B': {
+                    *bufp++ = '[';
+                    if (bufp == &printString[PRINT_BUF_SIZE])
+                        break;
+                    for (int i = 0; i < width; ++i) {
+                        if (mask & (1ull << i)) {
+                            sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
+                            APPEND(tmpBuf);
+                        }
+                        else
+                            APPEND("_________");
+                        *bufp++ = (i != width-1) ? ',' : ']';
+                    }
+                    break;
+                }
+                case 'i': PRINT_SCALAR("%d", int);
+                case 'I': PRINT_VECTOR("%d", int);
+                case 'u': PRINT_SCALAR("%u", unsigned int);
+                case 'U': PRINT_VECTOR("%u", unsigned int);
+                case 'f': PRINT_SCALAR("%f", float);
+                case 'F': PRINT_VECTOR("%f", float);
+                case 'l': PRINT_SCALAR("%lld", long long);
+                case 'L': PRINT_VECTOR("%lld", long long);
+                case 'v': PRINT_SCALAR("%llu", unsigned long long);
+                case 'V': PRINT_VECTOR("%llu", unsigned long long);
+                case 'd': PRINT_SCALAR("%f", double);
+                case 'D': PRINT_VECTOR("%f", double);
+                case 'p': PRINT_SCALAR("%p", void *);
+                case 'P': PRINT_VECTOR("%p", void *);
+                default:
+                    APPEND("UNKNOWN TYPE ");
+                    *bufp++ = *types;
+                }
+                ++types;
+            }
+        }
+        ++format;
+    }
+
+done:
+    *bufp = '\n'; bufp++;
+    *bufp = '\0';
+    __puts_nvptx(printString);
+#else
+    __puts_nvptx("---nvptx printing is not supported---\n");
+#endif
+}

 int __num_cores() {
-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(__MINGW32__)
     // This is quite a hack. Including all of windows.h to get this definition
     // pulls in a bunch of stuff that leads to undefined symbols at link time.
     // So we don't #include <windows.h> but instead have the equivalent declarations
builtins/dispatch.ll (new file, 260 lines)
@@ -0,0 +1,260 @@
;; Copyright (c) 2011-2016, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;; This file defines various functions that are used when generating the
;; "dispatch" object/assembly file that has entrypoints for each exported
;; function in a module; these dispatch to the best available variant of
;; that function that will run on the system's CPU.

;; Stores the best target ISA that the system on which we're actually
;; running supports.  -1 represents "uninitialized", otherwise this value
;; should correspond to one of the enumerant values of Target::ISA from
;; ispc.h.

@__system_best_isa = internal global i32 -1

;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
;; following code...  Specifically, __get_system_isa should return a value
;; corresponding to one of the Target::ISA enumerant values that gives the
;; most capable ISA that the current system can run.
;;
;;
;; #include <stdint.h>
;; #include <stdlib.h>
;;
;; static void __cpuid(int info[4], int infoType) {
;;     __asm__ __volatile__ ("cpuid"
;;                           : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
;;                           : "0" (infoType));
;; }
;;
;; // Save %ebx in case it's the PIC register.
;; static void __cpuid_count(int info[4], int level, int count) {
;;     __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
;;                           "cpuid\n\t"
;;                           "xchg{l}\t{%%}ebx, %1\n\t"
;;                           : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
;;                           : "0" (level), "2" (count));
;; }
;;
;; static int __os_has_avx_support() {
;;     // Check xgetbv; this uses a .byte sequence instead of the instruction
;;     // directly because older assemblers do not include support for xgetbv and
;;     // there is no easy way to conditionally compile based on the assembler used.
;;     int rEAX, rEDX;
;;     __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
;;     return (rEAX & 6) == 6;
;; }
;;
;; static int __os_has_avx512_support() {
;;     // Check if the OS saves the XMM, YMM and ZMM registers, i.e. it supports AVX2 and AVX512.
;;     // See section 2.1 of software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
;;     // Check xgetbv; this uses a .byte sequence instead of the instruction
;;     // directly because older assemblers do not include support for xgetbv and
;;     // there is no easy way to conditionally compile based on the assembler used.
;;     int rEAX, rEDX;
;;     __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
;;     return (rEAX & 0xE6) == 0xE6;
;; }
;;
;; int32_t __get_system_isa() {
;;     int info[4];
;;     __cpuid(info, 1);
;;
;;     // Call cpuid with eax=7, ecx=0
;;     int info2[4];
;;     __cpuid_count(info2, 7, 0);
;;
;;     // NOTE: the values returned below must be the same as the
;;     // corresponding enumerant values in Target::ISA.
;;     if ((info[2] & (1 << 27)) != 0 &&  // OSXSAVE
;;         (info2[1] & (1 << 5)) != 0 &&  // AVX2
;;         (info2[1] & (1 << 16)) != 0 && // AVX512 F
;;         __os_has_avx512_support()) {
;;         // We need to verify that AVX2 is also available,
;;         // as well as AVX512, because our targets are supposed
;;         // to use both.
;;
;;         if ((info2[1] & (1 << 17)) != 0 && // AVX512 DQ
;;             (info2[1] & (1 << 28)) != 0 && // AVX512 CDI
;;             (info2[1] & (1 << 30)) != 0 && // AVX512 BW
;;             (info2[1] & (1 << 31)) != 0) { // AVX512 VL
;;             return 6; // SKX
;;         }
;;         else if ((info2[1] & (1 << 26)) != 0 && // AVX512 PF
;;                  (info2[1] & (1 << 27)) != 0 && // AVX512 ER
;;                  (info2[1] & (1 << 28)) != 0) { // AVX512 CDI
;;             return 5; // KNL_AVX512
;;         }
;;         // If it's an unknown AVX512 target, fall through and use AVX2
;;         // or whatever is available in the machine.
;;     }
;;
;;     if ((info[2] & (1 << 27)) != 0 && // OSXSAVE
;;         (info[2] & (1 << 28)) != 0 &&
;;         __os_has_avx_support()) {
;;         if ((info[2] & (1 << 29)) != 0 &&  // F16C
;;             (info[2] & (1 << 30)) != 0) {  // RDRAND
;;             // So far, so good. AVX2?
;;             if ((info2[1] & (1 << 5)) != 0)
;;                 return 4;
;;             else
;;                 return 3;
;;         }
;;         // Regular AVX
;;         return 2;
;;     }
;;     else if ((info[2] & (1 << 19)) != 0)
;;         return 1; // SSE4
;;     else if ((info[3] & (1 << 26)) != 0)
;;         return 0; // SSE2
;;     else
;;         abort();
;; }
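
;; A generated dispatch stub would then use this value roughly as follows
;; (sketch only; foo_avx2/foo_sse4/foo_sse2 stand in for the per-ISA
;; variants of an exported function foo):
;;
;;     if (__system_best_isa < 0)
;;         __system_best_isa = __get_system_isa();
;;     switch (__system_best_isa) {
;;       case 4:  return foo_avx2(args);  // AVX2
;;       case 1:  return foo_sse4(args);  // SSE4
;;       /* ... other ISAs ... */
;;       default: return foo_sse2(args);  // baseline
;;     }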

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; LLVM has used different IR for loads/stores since version 3.7

define(`PTR_OP_ARGS',
  ifelse(LLVM_VERSION, LLVM_3_7,
           ``$1 , $1 *'',
         LLVM_VERSION, LLVM_3_8,
           ``$1 , $1 *'',
         LLVM_VERSION, LLVM_3_9,
           ``$1 , $1 *'',
         LLVM_VERSION, LLVM_4_0,
           ``$1 , $1 *'',
         LLVM_VERSION, LLVM_5_0,
           ``$1 , $1 *'',
         ``$1 *''
  )
)
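
;; For example (illustrative only), with LLVM 3.7 through 5.0 the m4 call
;;
;;     load PTR_OP_ARGS(`<4 x i32>') %ptr
;;
;; expands to the newer two-argument form
;;
;;     load <4 x i32>, <4 x i32>* %ptr
;;
;; while for other (older) LLVM versions it yields the legacy form
;;
;;     load <4 x i32>* %ptr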
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

define i32 @__get_system_isa() nounwind uwtable {
entry:
  %0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
  %asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2
  %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
  %1 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
  %asmresult4.i87 = extractvalue { i32, i32, i32, i32 } %1, 1
  %and = and i32 %asmresult5.i, 134217728
  %cmp = icmp eq i32 %and, 0
  br i1 %cmp, label %if.else65, label %land.lhs.true

land.lhs.true:                                    ; preds = %entry
  %2 = and i32 %asmresult4.i87, 65568
  %3 = icmp eq i32 %2, 65568
  br i1 %3, label %land.lhs.true9, label %if.end39

land.lhs.true9:                                   ; preds = %land.lhs.true
  %4 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind
  %asmresult.i90 = extractvalue { i32, i32 } %4, 0
  %and.i = and i32 %asmresult.i90, 230
  %cmp.i = icmp eq i32 %and.i, 230
  br i1 %cmp.i, label %if.then, label %if.end39

if.then:                                          ; preds = %land.lhs.true9
  %5 = and i32 %asmresult4.i87, -805175296
  %6 = icmp eq i32 %5, -805175296
  br i1 %6, label %return, label %if.else

if.else:                                          ; preds = %if.then
  %7 = and i32 %asmresult4.i87, 469762048
  %8 = icmp eq i32 %7, 469762048
  br i1 %8, label %return, label %if.end39

if.end39:                                         ; preds = %if.else, %land.lhs.true9, %land.lhs.true
  %9 = and i32 %asmresult5.i, 402653184
  %10 = icmp eq i32 %9, 402653184
  br i1 %10, label %land.lhs.true47, label %if.else65

land.lhs.true47:                                  ; preds = %if.end39
  %11 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind
  %asmresult.i91 = extractvalue { i32, i32 } %11, 0
  %and.i92 = and i32 %asmresult.i91, 6
  %cmp.i93 = icmp eq i32 %and.i92, 6
  br i1 %cmp.i93, label %if.then50, label %if.else65

if.then50:                                        ; preds = %land.lhs.true47
  %12 = and i32 %asmresult5.i, 1610612736
  %13 = icmp eq i32 %12, 1610612736
  br i1 %13, label %if.then58, label %return

if.then58:                                        ; preds = %if.then50
  %and60 = lshr i32 %asmresult4.i87, 5
  %14 = and i32 %and60, 1
  %15 = add i32 %14, 3
  br label %return

if.else65:                                        ; preds = %land.lhs.true47, %if.end39, %entry
  %and67 = and i32 %asmresult5.i, 524288
  %cmp68 = icmp eq i32 %and67, 0
  br i1 %cmp68, label %if.else70, label %return

if.else70:                                        ; preds = %if.else65
  %and72 = and i32 %asmresult6.i, 67108864
  %cmp73 = icmp eq i32 %and72, 0
  br i1 %cmp73, label %if.else75, label %return

if.else75:                                        ; preds = %if.else70
  tail call void @abort() noreturn nounwind
  unreachable

return:                                           ; preds = %if.else70, %if.else65, %if.then58, %if.then50, %if.else, %if.then
  %retval.0 = phi i32 [ 6, %if.then ], [ 5, %if.else ], [ %15, %if.then58 ], [ 2, %if.then50 ], [ 1, %if.else65 ], [ 0, %if.else70 ]
  ret i32 %retval.0
}

declare void @abort() noreturn nounwind

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This function is called by each of the dispatch functions we generate;
;; it sets @__system_best_isa if it is unset.

define void @__set_system_isa() {
entry:
  %bi = load PTR_OP_ARGS(`i32 ') @__system_best_isa
  %unset = icmp eq i32 %bi, -1
  br i1 %unset, label %set_system_isa, label %done

set_system_isa:
  %bival = call i32 @__get_system_isa()
  store i32 %bival, i32* @__system_best_isa
  ret void

done:
  ret void
}

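;; A minimal sketch (not in this file) of how a generated dispatch function
;; is expected to use these helpers; @foo, @foo_avx2, and @foo_sse2 are
;; hypothetical names, and the load is written in post-3.7 syntax:
;;
;; define void @foo() {
;;   call void @__set_system_isa()
;;   %isa = load i32, i32* @__system_best_isa
;;   %ge4 = icmp sge i32 %isa, 4
;;   br i1 %ge4, label %avx2, label %generic
;; avx2:
;;   tail call void @foo_avx2()
;;   ret void
;; generic:
;;   tail call void @foo_sse2()
;;   ret void
;; }
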
216  builtins/svml.m4  Normal file
@@ -0,0 +1,216 @@
;; Copyright (c) 2013-2015, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;     * Redistributions of source code must retain the above copyright
;;       notice, this list of conditions and the following disclaimer.
;;
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;
;;     * Neither the name of Intel Corporation nor the names of its
;;       contributors may be used to endorse or promote products derived from
;;       this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;; svml macro

;; svml_stubs : stubs for svml calls
;; $1 - type ("float" or "double")
;; $2 - svml internal function suffix ("f" for float, "d" for double)
;; $3 - vector width
define(`svml_stubs',`
declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline
declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind alwaysinline
declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline
declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
')

;; svml_declare : declaration of __svml_* intrinsics
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;;      float:  "f4" (sse)  "f8" (avx)  "f16" (avx512)
;;      double: "2" (sse)   "4" (avx)   "8" (avx512)
;; $3 - vector width
define(`svml_declare',`
declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone
declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone
')

;; svml_define : definition of __svml_* internal functions
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;;      float:  "f4" (sse)  "f8" (avx)  "f16" (avx512)
;;      double: "2" (sse)   "4" (avx)   "8" (avx512)
;; $3 - vector width
;; $4 - svml internal function suffix ("f" for float, "d" for double)
define(`svml_define',`
define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind alwaysinline {
  %s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0)
  store <$3 x $1> %s, <$3 x $1> * %1
  ret void
}

define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0)
  ret <$3 x $1> %ret
}

define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
  %ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1)
  ret <$3 x $1> %ret
}
')

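;; As a hypothetical instantiation (for illustration only, not in the file),
;; a 4-wide SSE float target would emit
;;     svml_declare(float,f4,4)
;;     svml_define(float,f4,4,f)
;; which makes @__svml_sinf a thin alwaysinline wrapper around the SVML
;; entry point @__svml_sinf4, and likewise for the other functions.
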
;; svml_define_x : definition of __svml_* internal functions operating on an
;; extended vector width
;; $1 - type ("float" or "double")
;; $2 - __svml_* intrinsic function suffix
;;      float:  "f4" (sse)  "f8" (avx)  "f16" (avx512)
;;      double: "2" (sse)   "4" (avx)   "8" (avx512)
;; $3 - vector width
;; $4 - svml internal function suffix ("f" for float, "d" for double)
;; $5 - extended width, must be at least twice the native vector width
;; contingent on the existence of the unary$3to$5 and binary$3to$5 macros

;; *todo*: in the sincos implementation, call __svml_sincos[f][2,4,8,16]
;; directly, e.g.
;; define void @__svml_sincosf(<8 x float>, <8 x float> *,
;;                             <8 x float> *) nounwind alwaysinline {
;;   ; call svml_sincosf4 two times with the two 4-wide sub-vectors
;;   %a = shufflevector <8 x float> %0, <8 x float> undef,
;;          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;;   %b = shufflevector <8 x float> %0, <8 x float> undef,
;;          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
;;
;;   %cospa = alloca <4 x float>
;;   %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
;;
;;   %cospb = alloca <4 x float>
;;   %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
;;
;;   %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
;;            <8 x i32> <i32 0, i32 1, i32 2, i32 3,
;;                       i32 4, i32 5, i32 6, i32 7>
;;   store <8 x float> %sin, <8 x float> * %1
;;
;;   %cosa = load <4 x float> * %cospa
;;   %cosb = load <4 x float> * %cospb
;;   %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
;;            <8 x i32> <i32 0, i32 1, i32 2, i32 3,
;;                       i32 4, i32 5, i32 6, i32 7>
;;   store <8 x float> %cos, <8 x float> * %2
;;
;;   ret void
;; }

define(`svml_define_x',`
define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_sin$2, %0)
  ret <$5 x $1> %ret
}

define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_asin$2, %0)
  ret <$5 x $1> %ret
}

define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_cos$2, %0)
  ret <$5 x $1> %ret
}

define void @__svml_sincos$4(<$5 x $1>, <$5 x $1> *, <$5 x $1> *) nounwind alwaysinline {
  %s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0)
  %c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0)
  store <$5 x $1> %s, <$5 x $1> * %1
  store <$5 x $1> %c, <$5 x $1> * %2
  ret void
}

define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_tan$2, %0)
  ret <$5 x $1> %ret
}

define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_atan$2, %0)
  ret <$5 x $1> %ret
}

define <$5 x $1> @__svml_atan2$4(<$5 x $1>, <$5 x $1>) nounwind readnone alwaysinline {
  binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1)
  ret <$5 x $1> %ret
}

define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_exp$2, %0)
  ret <$5 x $1> %ret
}

define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline {
  unary$3to$5(ret, $1, @__svml_log$2, %0)
  ret <$5 x $1> %ret
}

define <$5 x $1> @__svml_pow$4(<$5 x $1>, <$5 x $1>) nounwind readnone alwaysinline {
  binary$3to$5(ret, $1, @__svml_pow$2, %0, %1)
  ret <$5 x $1> %ret
}
')

293  builtins/target-avx-common.ll  Normal file
@@ -0,0 +1,293 @@
;; Copyright (c) 2010-2015, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;     * Redistributions of source code must retain the above copyright
;;       notice, this list of conditions and the following disclaimer.
;;
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;
;;     * Neither the name of Intel Corporation nor the names of its
;;       contributors may be used to endorse or promote products derived from
;;       this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; AVX target implementation.
;;
;; Please note that this file uses SSE intrinsics, but LLVM generates AVX
;; instructions from them, so it doesn't make sense to change this
;; implementation.

ctlztz()
define_prefetches()
define_shuffles()
aossoa()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

define float @__round_uniform_float(float) nounwind readonly alwaysinline {
  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  ; the roundss intrinsic is a total mess--docs say:
  ;
  ; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
  ;
  ; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
  ; on b0. The higher order 96 bits are copied directly from input parameter a. The
  ; return value is described by the following equations:
  ;
  ; r0 = RND(b0)
  ; r1 = a1
  ; r2 = a2
  ; r3 = a3
  ;
  ; It doesn't matter what we pass as a, since we only need the r0 value
  ; here. So we pass the same register for both. Further, only the 0th
  ; element of the b parameter matters.
  %xi = insertelement <4 x float> undef, float %0, i32 0
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}

define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for the round.ss intrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}

define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for the round.ss intrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
}

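; (For reference: in the round.ss/round.sd immediate, bits 1:0 select the
; rounding mode (00 nearest, 01 down, 10 up, 11 truncate) and bit 3
; (value 8) suppresses precision exceptions; that is how the constants
; 8, 9, and 10 above are formed.)
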
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

define double @__round_uniform_double(double) nounwind readonly alwaysinline {
  %xi = insertelement <2 x double> undef, double %0, i32 0
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}

define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for the round.ss intrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}

define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for the round.ss intrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  ; do the rcpss call
  ;   uniform float iv = extract(__rcp_u(v), 0);
  ;   return iv * (2. - v * iv);
  %vecval = insertelement <4 x float> undef, float %0, i32 0
  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
  %scall = extractelement <4 x float> %call, i32 0

  ; do one N-R iteration to improve precision, as above
  %v_iv = fmul float %0, %scall
  %two_minus = fsub float 2., %v_iv
  %iv_mul = fmul float %scall, %two_minus
  ret float %iv_mul
}

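; (Background note, not from the original file: one Newton-Raphson step for
; 1/v with estimate x is x' = x * (2 - v * x), which roughly doubles the
; number of accurate bits of the rcpss approximation.)
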
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt

declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  ; uniform float is = extract(__rsqrt_u(v), 0);
  %v = insertelement <4 x float> undef, float %0, i32 0
  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
  %is = extractelement <4 x float> %vis, i32 0

  ; Newton-Raphson iteration to improve precision
  ;   return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul float %0, %is
  %v_is_is = fmul float %v_is, %is
  %three_sub = fsub float 3., %v_is_is
  %is_mul = fmul float %is, %three_sub
  %half_scale = fmul float 0.5, %is_mul
  ret float %half_scale
}

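; (Background note, not from the original file: the Newton-Raphson step for
; 1/sqrt(v) with estimate x is x' = 0.5 * x * (3 - v * x * x); the code
; above is a direct transcription of that formula.)
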
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt

declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
  ret float %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone

define double @__sqrt_uniform_double(double) nounwind alwaysinline {
  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
  ret double %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode

declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

define void @__fastmath() nounwind alwaysinline {
  %ptr = alloca i32
  %ptr8 = bitcast i32 * %ptr to i8 *
  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
  %oldval = load PTR_OP_ARGS(`i32 ') %ptr

  ; turn on DAZ (64)/FTZ (32768) -> 32832
  %update = or i32 %oldval, 32832
  store i32 %update, i32 * %ptr
  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
  ret void
}

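; (For reference: in the MXCSR register, DAZ is bit 6 (1 << 6 = 64) and FTZ
; is bit 15 (1 << 15 = 32768); OR-ing in 32832 sets both, so denormals are
; treated as zero on input and flushed to zero on output.)
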
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
  %cmp = fcmp ogt float %1, %0
  %ret = select i1 %cmp, float %1, float %0
  ret float %ret
}

define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
  %cmp = fcmp ogt float %1, %0
  %ret = select i1 %cmp, float %0, float %1
  ret float %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
  %cmp = fcmp ogt double %1, %0
  %ret = select i1 %cmp, double %0, double %1
  ret double %ret
}

define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
  %cmp = fcmp ogt double %1, %0
  %ret = select i1 %cmp, double %1, double %0
  ret double %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %cmp = icmp sgt i32 %1, %0
  %ret = select i1 %cmp, i32 %0, i32 %1
  ret i32 %ret
}

define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %cmp = icmp sgt i32 %1, %0
  %ret = select i1 %cmp, i32 %1, i32 %0
  ret i32 %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %cmp = icmp ugt i32 %1, %0
  %ret = select i1 %cmp, i32 %0, i32 %1
  ret i32 %ret
}

define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %cmp = icmp ugt i32 %1, %0
  %ret = select i1 %cmp, i32 %1, i32 %0
  ret i32 %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal ops / reductions

declare i32 @llvm.ctpop.i32(i32) nounwind readnone

define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %call = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %call
}

declare i64 @llvm.ctpop.i64(i64) nounwind readnone

define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  %call = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define_avgs()
declare_nvptx()

@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2011, Intel Corporation
+;; Copyright (c) 2010-2015, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without
@@ -29,29 +29,27 @@
 ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; *** Untested *** AVX target implementation.
-;;
-;; The LLVM AVX code generator is incomplete, so the ispc AVX target
-;; hasn't yet been tested. There is therefore a higher-than-normal
-;; chance that there are bugs in the code in this file.
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Basic 16-wide definitions

-stdlib_core(16)
-packed_load_and_store(16)
-scans(16)
-int64minmax(16)
+define(`WIDTH',`16')
+define(`MASK',`i32')
+include(`util.m4')

-include(`builtins-avx-common.ll')
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+saturation_arithmetic()
+
+include(`target-avx-common.ll')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone

-define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
+define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
   ; float iv = __rcp_v(v);
   ; return iv * (2. - v * iv);
@@ -71,17 +69,17 @@ define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonl

 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

-define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
+define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
   ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
   round8to16(%0, 8)
 }

-define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
+define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
   ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
   round8to16(%0, 9)
 }

-define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
+define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
   ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
   round8to16(%0, 10)
 }

@@ -91,15 +89,15 @@ define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readon

 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

-define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
+define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
   round4to16double(%0, 8)
 }

-define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
+define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
   round4to16double(%0, 9)
 }

-define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
+define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
   round4to16double(%0, 10)
 }

@@ -109,7 +107,7 @@ define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind rea

 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone

-define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
+define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
   ; float is = __rsqrt_v(v);
   unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
   ; return 0.5 * is * (3. - (v * is) * is);

@@ -132,7 +130,7 @@ define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind re

 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone

-define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
+define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
   unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
   ret <16 x float> %call
 }

@@ -140,19 +138,14 @@ define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readon
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml

-; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
-; or, use the macro to call the 4-wide ones 4x with our 16-wide
-; vectors...
+include(`svml.m4')
+;; single precision
+svml_declare(float,f8,8)
+svml_define_x(float,f8,8,f,16)

-declare <16 x float> @__svml_sin(<16 x float>)
-declare <16 x float> @__svml_cos(<16 x float>)
-declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
-declare <16 x float> @__svml_tan(<16 x float>)
-declare <16 x float> @__svml_atan(<16 x float>)
-declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
-declare <16 x float> @__svml_exp(<16 x float>)
-declare <16 x float> @__svml_log(<16 x float>)
-declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
+;; double precision
+svml_declare(double,4,4)
+svml_define_x(double,4,4,d,16)
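;; (For reference: svml_define_x(float,f8,8,f,16) emits 16-wide wrappers
;; such as @__svml_sinf that split each <16 x float> into two <8 x float>
;; halves via the unary8to16/binary8to16 helpers and call the 8-wide SVML
;; entry points, e.g. @__svml_sinf8.)
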
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
@@ -160,52 +153,25 @@ declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone

-define internal <16 x float> @__max_varying_float(<16 x float>,
-                                                  <16 x float>) nounwind readonly alwaysinline {
+define <16 x float> @__max_varying_float(<16 x float>,
+                                         <16 x float>) nounwind readonly alwaysinline {
   binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
   ret <16 x float> %call
 }

-define internal <16 x float> @__min_varying_float(<16 x float>,
-                                                  <16 x float>) nounwind readonly alwaysinline {
+define <16 x float> @__min_varying_float(<16 x float>,
+                                         <16 x float>) nounwind readonly alwaysinline {
   binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
   ret <16 x float> %call
 }


-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
-
-define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
-  ret <16 x i32> %ret
-}
-
-define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-  ret <16 x i32> %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unsigned int min/max
-
-define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
-  ret <16 x i32> %ret
-}
-
-define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-  ret <16 x i32> %ret
-}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops

 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

-define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
   %floatmask = bitcast <16 x i32> %0 to <16 x float>
   %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -216,15 +182,63 @@ define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {

   %v1shift = shl i32 %v1, 8
   %v = or i32 %v1shift, %v0
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }

+define i1 @__any(<16 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <16 x i32> %0 to <16 x float>
+  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
+  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
+
+  %v1shift = shl i32 %v1, 8
+  %v = or i32 %v1shift, %v0
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<16 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <16 x i32> %0 to <16 x float>
+  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
+  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
+
+  %v1shift = shl i32 %v1, 8
+  %v = or i32 %v1shift, %v0
+  %cmp = icmp eq i32 %v, 65535
+  ret i1 %cmp
+}
+
+define i1 @__none(<16 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <16 x i32> %0 to <16 x float>
+  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
+  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
+
+  %v1shift = shl i32 %v1, 8
+  %v = or i32 %v1shift, %v0
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}

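;; (For reference: each llvm.x86.avx.movmsk.ps.256 call above collects the
;; sign bits of one 8-wide half, so (%v1 << 8) | %v0 packs all 16 mask lanes
;; into the low 16 bits; __all compares that against 0xFFFF = 65535 and
;; __none against 0.)
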
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal float ops

 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

-define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
+define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
   %va = shufflevector <16 x float> %0, <16 x float> undef,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %vb = shufflevector <16 x float> %0, <16 x float> undef,
@@ -239,12 +253,12 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always
 }


-define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
+define float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
   reduce16(float, @__min_varying_float, @__min_uniform_float)
 }

-define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
+define float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
   reduce16(float, @__max_varying_float, @__max_uniform_float)
 }

@@ -253,28 +267,55 @@ reduce_equal(16)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int32 ops

-define internal <16 x i32> @__add_varying_int32(<16 x i32>,
-                                                <16 x i32>) nounwind readnone alwaysinline {
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
+  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
+                                              <16 x i8> zeroinitializer)
+  %r0 = extractelement <2 x i64> %rv, i32 0
+  %r1 = extractelement <2 x i64> %rv, i32 1
+  %r = add i64 %r0, %r1
+  %r16 = trunc i64 %r to i16
+  ret i16 %r16
+}
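;; (For reference: psadbw against an all-zero vector sums the absolute
;; values of the 16 unsigned bytes into two 64-bit halves, so adding the
;; two extracted halves yields the horizontal sum of all 16 lanes.)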
+
+define internal <16 x i16> @__add_varying_i16(<16 x i16>,
+                                              <16 x i16>) nounwind readnone alwaysinline {
+  %r = add <16 x i16> %0, %1
+  ret <16 x i16> %r
+}
+
+define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
+  %r = add i16 %0, %1
+  ret i16 %r
+}
+
+define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
+  reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
+}
+
+define <16 x i32> @__add_varying_int32(<16 x i32>,
+                                       <16 x i32>) nounwind readnone alwaysinline {
   %s = add <16 x i32> %0, %1
   ret <16 x i32> %s
 }

-define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
   %s = add i32 %0, %1
   ret i32 %s
 }

-define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
   reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
 }

-define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
   reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
 }

-define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
   reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
 }

@@ -282,17 +323,12 @@ define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinli
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; horizontal uint32 ops

-define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
-  %r = call i32 @__reduce_add_int32(<16 x i32> %v)
-  ret i32 %r
-}
-
-define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
   reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
 }

-define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
   reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
 }

@@ -302,7 +338,7 @@ define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinl

 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

-define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
+define double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
   %va = shufflevector <16 x double> %0, <16 x double> undef,
         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %vb = shufflevector <16 x double> %0, <16 x double> undef,
@@ -322,12 +358,12 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw
   ret double %sum
 }

-define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
+define double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
   reduce16(double, @__min_varying_double, @__min_uniform_double)
 }

-define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
+define double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
   reduce16(double, @__max_varying_double, @__max_uniform_double)
 }

@@ -335,28 +371,28 @@ define internal double @__reduce_max_double(<16 x double>) nounwind readnone alw
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int64 ops

-define internal <16 x i64> @__add_varying_int64(<16 x i64>,
+define <16 x i64> @__add_varying_int64(<16 x i64>,
                                        <16 x i64>) nounwind readnone alwaysinline {
   %s = add <16 x i64> %0, %1
   ret <16 x i64> %s
 }

-define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
   %s = add i64 %0, %1
   ret i64 %s
 }

-define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
   reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
 }

-define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
   reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
 }

-define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
   reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
 }

@@ -364,17 +400,12 @@ define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinli
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; horizontal uint64 ops

-define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
-  %r = call i64 @__reduce_add_int64(<16 x i64> %v)
-  ret i64 %r
-}
-
-define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
   reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }

-define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
   reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }

@@ -382,27 +413,22 @@ define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinl
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-load_and_broadcast(16, i8, 8)
-load_and_broadcast(16, i16, 16)
-load_and_broadcast(16, i32, 32)
-load_and_broadcast(16, i64, 64)
-
 ; no masked load instruction for i8 and i16 types??
-load_masked(16, i8, 8, 1)
-load_masked(16, i16, 16, 2)
+masked_load(i8, 1)
+masked_load(i16, 2)

-declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
-declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x MfORi32> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x MdORi64> %mask)

-define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
-  %floatmask = bitcast <16 x i32> %mask to <16 x float>
-  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+  %floatmask = bitcast <16 x i32> %mask to <16 x MfORi32>
+  %mask0 = shufflevector <16 x MfORi32> %floatmask, <16 x MfORi32> undef,
            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %val0 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %mask0)
-  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+  %val0 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x MfORi32> %mask0)
+  %mask1 = shufflevector <16 x MfORi32> %floatmask, <16 x MfORi32> undef,
            <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %ptr1 = getelementptr i8 * %0, i32 32 ;; 8x4 bytes = 32
-  %val1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %ptr1, <8 x float> %mask1)
+  %ptr1 = getelementptr PTR_OP_ARGS(`i8') %0, i32 32 ;; 8x4 bytes = 32
+  %val1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %ptr1, <8 x MfORi32> %mask1)

   %retval = shufflevector <8 x float> %val0, <8 x float> %val1,
             <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -412,7 +438,7 @@ define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
 }

-define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
   ; double up masks, bitcast to doubles
   %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
            <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -422,18 +448,18 @@ define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
            <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
   %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
            <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
-  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
-  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
-  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
-  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x MdORi64>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x MdORi64>
+  %mask2d = bitcast <8 x i32> %mask2 to <4 x MdORi64>
+  %mask3d = bitcast <8 x i32> %mask3 to <4 x MdORi64>

-  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
-  %ptr1 = getelementptr i8 * %0, i32 32
-  %val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x double> %mask1d)
-  %ptr2 = getelementptr i8 * %0, i32 64
-  %val2d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr2, <4 x double> %mask2d)
-  %ptr3 = getelementptr i8 * %0, i32 96
-  %val3d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr3, <4 x double> %mask3d)
+  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x MdORi64> %mask0d)
+  %ptr1 = getelementptr PTR_OP_ARGS(`i8') %0, i32 32
+  %val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x MdORi64> %mask1d)
+  %ptr2 = getelementptr PTR_OP_ARGS(`i8') %0, i32 64
+  %val2d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr2, <4 x MdORi64> %mask2d)
+  %ptr3 = getelementptr PTR_OP_ARGS(`i8') %0, i32 96
+  %val3d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr3, <4 x MdORi64> %mask3d)

   %val01 = shufflevector <4 x double> %val0d, <4 x double> %val1d,
            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -446,6 +472,7 @@ define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
   ret <16 x i64> %val
 }
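;; (For reference: the shuffles above duplicate each 32-bit mask lane, e.g.
;; <i32 0, i32 0, i32 1, i32 1, ...>, so that one program instance's mask
;; covers both 32-bit halves of its 64-bit element once the pairs are
;; bitcast to the 64-bit mask type.)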
|
||||
masked_load_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
@@ -453,38 +480,38 @@ define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
|
||||
; FIXME: there is no AVX instruction for these, but we could be clever
|
||||
; by packing the bits down and setting the last 3/4 or half, respectively,
|
||||
; of the mask to zero... Not sure if this would be a win in the end
|
||||
gen_masked_store(16, i8, 8)
|
||||
gen_masked_store(16, i16, 16)
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
|
||||
; note that mask is the 2nd parameter, not the 3rd one!!
|
||||
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x MfORi32>, <8 x float>)
|
||||
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x MdORi64>, <4 x double>)
|
||||
|
||||
define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
%ptr = bitcast <16 x i32> * %0 to i8 *
|
||||
%val = bitcast <16 x i32> %1 to <16 x float>
|
||||
%mask = bitcast <16 x i32> %2 to <16 x float>
|
||||
%mask = bitcast <16 x i32> %2 to <16 x MfORi32>
|
||||
|
||||
%val0 = shufflevector <16 x float> %val, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%val1 = shufflevector <16 x float> %val, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
|
||||
%mask0 = shufflevector <16 x float> %mask, <16 x float> undef,
|
||||
%mask0 = shufflevector <16 x MfORi32> %mask, <16 x MfORi32> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%mask1 = shufflevector <16 x float> %mask, <16 x float> undef,
|
||||
%mask1 = shufflevector <16 x MfORi32> %mask, <16 x MfORi32> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
|
||||
call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x float> %mask0, <8 x float> %val0)
|
||||
%ptr1 = getelementptr i8 * %ptr, i32 32
|
||||
call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr1, <8 x float> %mask1, <8 x float> %val1)
|
||||
call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x MfORi32> %mask0, <8 x float> %val0)
|
||||
%ptr1 = getelementptr PTR_OP_ARGS(`i8') %ptr, i32 32
|
||||
call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr1, <8 x MfORi32> %mask1, <8 x float> %val1)
|
||||
|
||||
ret void
|
||||
}

define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
<16 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
<16 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast <16 x i64> * %0 to i8 *
%val = bitcast <16 x i64> %1 to <16 x double>

@@ -497,10 +524,10 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
<8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
%mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
<8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
%mask0d = bitcast <8 x i32> %mask0 to <4 x double>
%mask1d = bitcast <8 x i32> %mask1 to <4 x double>
%mask2d = bitcast <8 x i32> %mask2 to <4 x double>
%mask3d = bitcast <8 x i32> %mask3 to <4 x double>
%mask0d = bitcast <8 x i32> %mask0 to <4 x MdORi64>
%mask1d = bitcast <8 x i32> %mask1 to <4 x MdORi64>
%mask2d = bitcast <8 x i32> %mask2 to <4 x MdORi64>
%mask3d = bitcast <8 x i32> %mask3 to <4 x MdORi64>

%val0 = shufflevector <16 x double> %val, <16 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -511,27 +538,28 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
%val3 = shufflevector <16 x double> %val, <16 x double> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>

call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
%ptr1 = getelementptr i8 * %ptr, i32 32
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x double> %mask1d, <4 x double> %val1)
%ptr2 = getelementptr i8 * %ptr, i32 64
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr2, <4 x double> %mask2d, <4 x double> %val2)
%ptr3 = getelementptr i8 * %ptr, i32 96
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr3, <4 x double> %mask3d, <4 x double> %val3)
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x MdORi64> %mask0d, <4 x double> %val0)
%ptr1 = getelementptr PTR_OP_ARGS(`i8') %ptr, i32 32
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x MdORi64> %mask1d, <4 x double> %val1)
%ptr2 = getelementptr PTR_OP_ARGS(`i8') %ptr, i32 64
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr2, <4 x MdORi64> %mask2d, <4 x double> %val2)
%ptr3 = getelementptr PTR_OP_ARGS(`i8') %ptr, i32 96
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr3, <4 x MdORi64> %mask3d, <4 x double> %val3)

ret void
}

masked_store_float_double()

masked_store_blend_8_16_by_16()

declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone

define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
%maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
%oldValue = load <16 x i32>* %0, align 4
%oldValue = load PTR_OP_ARGS(`<16 x i32>') %0, align 4
%oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
%newAsFloat = bitcast <16 x i32> %1 to <16 x float>

@@ -566,9 +594,9 @@ define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
<4 x double>) nounwind readnone

define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
<16 x i32> %mask) nounwind alwaysinline {
%oldValue = load <16 x i64>* %ptr, align 8
define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
<16 x i32> %mask) nounwind alwaysinline {
%oldValue = load PTR_OP_ARGS(`<16 x i64>') %ptr, align 8
%old = bitcast <16 x i64> %oldValue to <16 x double>
%old0d = shufflevector <16 x double> %old, <16 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -625,24 +653,21 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
;; scatter

gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)

gen_scatter(16, i8)
gen_scatter(16, i16)
gen_scatter(16, i32)
gen_scatter(16, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone

define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
ret <16 x double> %ret
}
@@ -654,12 +679,21 @@ define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alw
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
ret <16 x double> %ret
}

define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
ret <16 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()
@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2015, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,34 +29,30 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; *** Untested *** AVX target implementation.
;;
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
;; hasn't yet been tested. There is therefore a higher-than-normal
;; chance that there are bugs in the code in this file.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Basic 8-wide definitions

stdlib_core(8)
packed_load_and_store(8)
scans(8)
int64minmax(8)
define(`WIDTH',`8')
define(`MASK',`i32')
include(`util.m4')

include(`builtins-avx-common.ll')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()

include(`target-avx-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone

define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);

%call = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %0)
; do one N-R iteration
%v_iv = fmul <8 x float> %0, %call
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.>, %v_iv
@@ -64,58 +60,15 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
ret <8 x float> %iv_mul
}
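
The `__rcp_varying_float` body documents its own math: start from the low-precision hardware estimate and apply one Newton-Raphson step, y1 = y0 * (2 - x*y0), which roughly doubles the number of accurate bits. A scalar C sketch of the same refinement (using the SSE `rcpss` estimate; illustrative, not ispc code):

```c
#include <immintrin.h>

/* One Newton-Raphson step on the ~12-bit hardware reciprocal estimate. */
static float rcp_refined(float x) {
    float y0 = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x)));  /* rough 1/x */
    return y0 * (2.0f - x * y0);                          /* refined   */
}
```
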

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 8)
ret <8 x float> %call
}

define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
ret <8 x float> %call
}

define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
ret <8 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
round4to8double(%0, 8)
}

define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
round4to8double(%0, 9)
}


define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
round4to8double(%0, 10)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt

declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone

define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <8 x float> %v, %is
%v_is_is = fmul <8 x float> %v_is, %is
@@ -132,27 +85,62 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read

declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone

define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
ret <8 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml
;; double precision sqrt

; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones twice with our 8-wide
; vectors...
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone

declare <8 x float> @__svml_sin(<8 x float>)
declare <8 x float> @__svml_cos(<8 x float>)
declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
declare <8 x float> @__svml_tan(<8 x float>)
declare <8 x float> @__svml_atan(<8 x float>)
declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
declare <8 x float> @__svml_exp(<8 x float>)
declare <8 x float> @__svml_log(<8 x float>)
declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
ret <8 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 8)
ret <8 x float> %call
}

define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
ret <8 x float> %call
}

define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
ret <8 x float> %call
}
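
The constants 8, 9 and 10 passed to `llvm.x86.avx.round.ps.256` are the `roundps` immediate: a 2-bit rounding mode OR'd with bit 3, which suppresses precision exceptions, exactly as the comments spell out. The same encoding written with the standard intrinsics' named constants:

```c
#include <immintrin.h>

/* imm = rounding mode | _MM_FROUND_NO_EXC (bit 3) */
__m256 round_nearest8(__m256 v) {                     /* 0 | 8 = 8  */
    return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
__m256 floor8(__m256 v) {                             /* 1 | 8 = 9  */
    return _mm256_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}
__m256 ceil8(__m256 v) {                              /* 2 | 8 = 10 */
    return _mm256_round_ps(v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}
```
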

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
round4to8double(%0, 8)
}

define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round4to8double(%0, 9)
}

define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round4to8double(%0, 10)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
@@ -160,63 +148,91 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone

define internal <8 x float> @__max_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
define <8 x float> @__max_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
ret <8 x float> %call
}

define internal <8 x float> @__min_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
define <8 x float> @__min_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
ret <8 x float> %call
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
;; double precision min/max

define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
ret <8 x double> %ret
}

define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
ret <8 x double> %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
;; svml

define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
include(`svml.m4')
;; single precision
svml_declare(float,f8,8)
svml_define(float,f8,8,f)

;; double precision
svml_declare(double,4,4)
svml_define_x(double,4,4,d,8)

define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
;; mask handling

declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}

define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}

define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}

define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
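
`__movmsk`, `__any`, `__all` and `__none` all lean on `vmovmskps`, which packs each 32-bit lane's sign bit into the low eight bits of a scalar register; hence the comparisons against 0 and 255 (all eight lanes set). Equivalent C, for reference (function names are illustrative):

```c
#include <immintrin.h>

/* Pack the per-lane sign bits of an 8 x i32 mask into an int. */
static int mask_bits(__m256i m) {
    return _mm256_movemask_ps(_mm256_castsi256_ps(m));
}
static int any8(__m256i m)  { return mask_bits(m) != 0;    }
static int all8(__m256i m)  { return mask_bits(m) == 0xFF; }
static int none8(__m256i m) { return mask_bits(m) == 0;    }
```
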

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal ops / reductions

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops

declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
%scalar1 = extractelement <8 x float> %v2, i32 0
@@ -225,71 +241,20 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
ret float %sum
}
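
`vhaddps` adds adjacent pairs within each 128-bit half, so two applications leave the sum of lanes 0..3 in element 0 and the sum of lanes 4..7 in element 4; the truncated hunk above extracts those two and adds them. The same reduction in intrinsics (a sketch, not ispc code):

```c
#include <immintrin.h>

static float reduce_add8(__m256 v) {
    __m256 h = _mm256_hadd_ps(v, v);      /* pairwise sums per half   */
    h = _mm256_hadd_ps(h, h);             /* quad sums per half       */
    __m128 lo = _mm256_castps256_ps128(h);
    __m128 hi = _mm256_extractf128_ps(h, 1);
    return _mm_cvtss_f32(lo) + _mm_cvtss_f32(hi);
}
```
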


define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__min_varying_float, @__min_uniform_float)
}


define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__max_varying_float, @__max_uniform_float)
}
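
`reduce8` is an m4 helper defined in util.m4, which is not shown in this diff; judging from its arguments it folds the eight lanes down with the varying (vector) op and finishes with the uniform (scalar) op. A plain-C stand-in for that kind of tree reduction, purely illustrative:

```c
/* Plausible shape of a reduce8(type, vop, sop) expansion: halve the
   vector with the pairwise op until one element is left. */
static int reduce8_scalar(const int v[8], int (*op)(int, int)) {
    int t4[4], t2[2];
    for (int i = 0; i < 4; ++i) t4[i] = op(v[i], v[i + 4]);
    for (int i = 0; i < 2; ++i) t2[i] = op(t4[i], t4[i + 2]);
    return op(t2[0], t2[1]);
}
```
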

reduce_equal(8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops

define internal <8 x i32> @__add_varying_int32(<8 x i32>,
<8 x i32>) nounwind readnone alwaysinline {
%s = add <8 x i32> %0, %1
ret <8 x i32> %s
}

define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
%s = add i32 %0, %1
ret i32 %s
}

define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
}


define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
}


define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops

define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}

define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}


define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops

declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
%v0 = shufflevector <8 x double> %0, <8 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
@@ -303,98 +268,160 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
ret double %sum
}

define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}


define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
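
`__reduce_add_int8` widens the eight bytes to sixteen with zero padding, then uses `psadbw` against an all-zero vector: a sum of absolute differences with zero is simply the byte sum of each 8-byte group. The same trick in C (a sketch; `byte_sum` is a made-up name):

```c
#include <immintrin.h>

/* Sum all 16 bytes of a vector via psadbw-against-zero: the two
   partial sums land in bits 0..15 and 64..79 of the result. */
static unsigned short byte_sum(__m128i bytes) {
    __m128i sad = _mm_sad_epu8(bytes, _mm_setzero_si128());
    return (unsigned short)(_mm_cvtsi128_si32(sad) +
                            _mm_extract_epi16(sad, 4));
}
```
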

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops

define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}

define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops

;; helper functions
define <8 x i32> @__add_varying_int32(<8 x i32>,
<8 x i32>) nounwind readnone alwaysinline {
%s = add <8 x i32> %0, %1
ret <8 x i32> %s
}

define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
%s = add i32 %0, %1
ret i32 %s
}

;; reduction functions
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
}

define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
}

define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}

define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops

define internal <8 x i64> @__add_varying_int64(<8 x i64>,
<8 x i64>) nounwind readnone alwaysinline {
;; helper functions
define <8 x i64> @__add_varying_int64(<8 x i64>,
<8 x i64>) nounwind readnone alwaysinline {
%s = add <8 x i64> %0, %1
ret <8 x i64> %s
}

define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
%s = add i64 %0, %1
ret i64 %s
}

define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
;; reduction functions
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
}


define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}


define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops

define internal i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
ret i64 %r
}

define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}


define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

reduce_equal(8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)

; no masked load instruction for i8 and i16 types??
load_masked(8, i8, 8, 1)
load_masked(8, i16, 16, 2)
masked_load(i8, 1)
masked_load(i16, 2)

declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x MfORi32> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x MdORi64> %mask)

define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <8 x i32> %mask to <8 x float>
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <8 x i32> %mask to <8 x MfORi32>
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x MfORi32> %floatmask)
%retval = bitcast <8 x float> %floatval to <8 x i32>
ret <8 x i32> %retval
}


define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
%mask1 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
%mask0d = bitcast <8 x i32> %mask0 to <4 x double>
%mask1d = bitcast <8 x i32> %mask1 to <4 x double>
%mask0d = bitcast <8 x i32> %mask0 to <4 x MdORi64>
%mask1d = bitcast <8 x i32> %mask1 to <4 x MdORi64>

%val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
%ptr1 = getelementptr i8 * %0, i32 32
%val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x double> %mask1d)
%val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x MdORi64> %mask0d)
%ptr1 = getelementptr PTR_OP_ARGS(`i8') %0, i32 32
%val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x MdORi64> %mask1d)

%vald = shufflevector <4 x double> %val0d, <4 x double> %val1d,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -402,31 +429,29 @@ define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
ret <8 x i64> %val
}
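
Since AVX1 has no maskload with 64-bit mask granularity driven by a 32-bit mask, `__masked_load_i64` duplicates each 32-bit mask lane so a sign bit covers a full 64-bit element, then issues two `maskload.pd` calls 32 bytes apart. A compact intrinsics sketch of the same idea (it cheats by using the AVX2 sign-extend where the AVX-only IR uses shuffles; names are illustrative):

```c
#include <immintrin.h>

static __m256i masked_load4_i64(const void *p, __m128i mask32) {
    __m256i mask64 = _mm256_cvtepi32_epi64(mask32);   /* widen mask lanes */
    __m256d v = _mm256_maskload_pd((const double *)p, mask64);
    return _mm256_castpd_si256(v);                    /* reinterpret i64  */
}
```
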

masked_load_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

; FIXME: there is no AVX instruction for these, but we could be clever
; by packing the bits down and setting the last 3/4 or half, respectively,
; of the mask to zero... Not sure if this would be a win in the end
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(i8)
gen_masked_store(i16)

; note that mask is the 2nd parameter, not the 3rd one!!
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x MfORi32>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x MdORi64>, <4 x double>)

define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
%ptr = bitcast <8 x i32> * %0 to i8 *
%val = bitcast <8 x i32> %1 to <8 x float>
%mask = bitcast <8 x i32> %2 to <8 x float>
call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x float> %mask, <8 x float> %val)
%mask = bitcast <8 x i32> %2 to <8 x MfORi32>
call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x MfORi32> %mask, <8 x float> %val)
ret void
}

define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
<8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast <8 x i64> * %0 to i8 *
%val = bitcast <8 x i64> %1 to <8 x double>

@@ -435,31 +460,34 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
%mask1 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>

%mask0d = bitcast <8 x i32> %mask0 to <4 x double>
%mask1d = bitcast <8 x i32> %mask1 to <4 x double>
%mask0d = bitcast <8 x i32> %mask0 to <4 x MdORi64>
%mask1d = bitcast <8 x i32> %mask1 to <4 x MdORi64>

%val0 = shufflevector <8 x double> %val, <8 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%val1 = shufflevector <8 x double> %val, <8 x double> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>

call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
%ptr1 = getelementptr i8 * %ptr, i32 32
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x double> %mask1d, <4 x double> %val1)
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x MdORi64> %mask0d, <4 x double> %val0)
%ptr1 = getelementptr PTR_OP_ARGS(`i8') %ptr, i32 32
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x MdORi64> %mask1d, <4 x double> %val1)
ret void
}

masked_store_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store blend

masked_store_blend_8_16_by_8()

declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone

define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
%oldValue = load <8 x i32>* %0, align 4
%oldValue = load PTR_OP_ARGS(`<8 x i32>') %0, align 4
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
%newAsFloat = bitcast <8 x i32> %1 to <8 x float>
%blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat,
@@ -471,9 +499,9 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
}
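
The blend variant cannot skip inactive lanes at the memory level; it reads the old vector, selects per lane with `vblendvps` (which keys off the mask element's sign bit), and stores the whole result back. A sketch in C (illustrative names; the IR uses an aligned load where this uses unaligned ones):

```c
#include <immintrin.h>

static void masked_store_blend8(int *ptr, __m256i newval, __m256i mask) {
    __m256 old = _mm256_castsi256_ps(_mm256_loadu_si256((const __m256i *)ptr));
    __m256 res = _mm256_blendv_ps(old, _mm256_castsi256_ps(newval),
                                  _mm256_castsi256_ps(mask));
    _mm256_storeu_si256((__m256i *)ptr, _mm256_castps_si256(res));
}
```
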


define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load PTR_OP_ARGS(`<8 x i64>') %ptr, align 8
%mask = bitcast <8 x i32> %i32mask to <8 x float>

; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
@@ -521,44 +549,21 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
;; reciprocals in double precision, if supported

gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)

gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone

define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
ret <8 x double> %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
ret <8 x double> %ret
}

define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
ret <8 x double> %ret
}
rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()
81  builtins/target-avx1-i64x4.ll  Normal file
@@ -0,0 +1,81 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

include(`target-avx1-i64x4base.ll')

rdrand_decls()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)

ret <4 x i32> %call
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}

define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
519  builtins/target-avx1-i64x4base.ll  Normal file
@@ -0,0 +1,519 @@
;; Copyright (c) 2013-2015, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Basic 4-wide definitions

define(`WIDTH',`4')
define(`MASK',`i64')
include(`util.m4')

stdlib_core()
packed_load_and_store()
scans()
int64minmax()
saturation_arithmetic()

include(`target-avx-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

;; sse intrinsic
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);

%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
; do one N-R iteration
%v_iv = fmul <4 x float> %0, %call
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <4 x float> %call, %two_minus
ret <4 x float> %iv_mul
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

;; sse intrinsic
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
ret <4 x float> %call
}

define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
ret <4 x float> %call
}

define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
ret <4 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

;; avx intrinsic
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8)
ret <4 x double> %call
}

define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9)
ret <4 x double> %call
}


define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10)
ret <4 x double> %call
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt

;; sse intrinsic
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <4 x float> %v, %is
%v_is_is = fmul <4 x float> %v_is, %is
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <4 x float> %is, %three_sub
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <4 x float> %half_scale
}
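
As with `rcp`, the comment spells out the math: one Newton-Raphson step on the `vrsqrtps` estimate, y1 = 0.5 * y0 * (3 - x*y0*y0). A scalar C equivalent of the refinement (illustrative only):

```c
#include <immintrin.h>

/* Refine the ~12-bit hardware 1/sqrt(x) estimate with one N-R step. */
static float rsqrt_refined(float x) {
    float y0 = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
    return 0.5f * y0 * (3.0f - x * y0 * y0);
}
```
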

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt

;; sse intrinsic
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
ret <4 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

;; avx intrinsic
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone

define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
%call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0)
ret <4 x double> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml

include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)

;; double precision
svml_declare(double,4,4)
svml_define(double,4,4,d)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

;; sse intrinsics
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}

define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops

;; avx intrinsic
declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone

define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%v64 = zext i32 %v to i64
ret i64 %v64
}

define i1 @__any(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}

define i1 @__all(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}

define i1 @__none(<4 x i64>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i64> %0 to <4 x double>
%v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops

;; sse intrinsic
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
%scalar = extractelement <4 x float> %v2, i32 0
ret float %scalar
}

define float @__reduce_min_float(<4 x float>) nounwind readnone {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}

define float @__reduce_max_float(<4 x float>) nounwind readnone {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}

reduce_equal(4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline
{
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops

define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}

define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops

define <4 x i32> @__add_varying_int32(<4 x i32>,
<4 x i32>) nounwind readnone alwaysinline {
%s = add <4 x i32> %0, %1
ret <4 x i32> %s
}

define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
%s = add i32 %0, %1
ret i32 %s
}

define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__add_varying_int32, @__add_uniform_int32)
}


define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}


define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}

define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops

declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <4 x double> <double 0.,double 0.,double 0.,double 0.>, <4 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
;; %v1 = <4 x double> <double 0., double 0., double 0., double 0.>
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
%final0 = extractelement <4 x double> %sum1, i32 0
%final1 = extractelement <4 x double> %sum1, i32 2
%sum = fadd double %final0, %final1

ret double %sum
}
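
`vhaddpd` adds adjacent pairs per 128-bit half, leaving d0+d1 in element 0 and d2+d3 in element 2, which are exactly the two elements extracted above (the second `hadd` and the zero operand do not change the final result). A one-`hadd` sketch of the same reduction:

```c
#include <immintrin.h>

static double reduce_add4(__m256d v) {
    __m256d h = _mm256_hadd_pd(v, v);        /* [d0+d1, d0+d1 | d2+d3, d2+d3] */
    __m128d lo = _mm256_castpd256_pd128(h);  /* element 0: d0+d1 */
    __m128d hi = _mm256_extractf128_pd(h, 1);/* element 2: d2+d3 */
    return _mm_cvtsd_f64(lo) + _mm_cvtsd_f64(hi);
}
```
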
|
||||
|
||||
define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
|
||||
reduce4(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
|
||||
define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
|
||||
reduce4(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int64 ops
|
||||
|
||||
define <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
%s = add <4 x i64> %0, %1
|
||||
ret <4 x i64> %s
|
||||
}
|
||||
|
||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
%s = add i64 %0, %1
|
||||
ret i64 %s
|
||||
}
|
||||
|
||||
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
|
||||
reduce4(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
|
||||
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
|
||||
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
|
||||
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
|
||||
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
|
||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

; there is no masked load instruction for i8 and i16 types, so use the
; generic masked_load() macro for those
masked_load(i8, 1)
masked_load(i16, 2)

;; avx intrinsics
declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x MfORi32> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x MdORi64> %mask)

define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline {
  %mask = trunc <4 x i64> %mask64 to <4 x i32>
  %floatmask = bitcast <4 x i32> %mask to <4 x MfORi32>
  %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x MfORi32> %floatmask)
  %retval = bitcast <4 x float> %floatval to <4 x i32>
  ret <4 x i32> %retval
}

define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline {
  %doublemask = bitcast <4 x i64> %mask to <4 x MdORi64>
  %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x MdORi64> %doublemask)
  %retval = bitcast <4 x double> %doubleval to <4 x i64>
  ret <4 x i64> %retval
}
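
;; AVX only provides masked loads for float and double vectors, so the
;; integer variants above go through the float/double maskload intrinsics
;; and bitcast the result back. MfORi32 and MdORi64 are m4 macros that
;; presumably expand to the mask element type (float/i32 or double/i64)
;; that the intrinsic expects for the LLVM version being targeted.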

masked_load_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

gen_masked_store(i8)
gen_masked_store(i16)

; note that the mask is the 2nd parameter, not the 3rd one
;; avx intrinsics
declare void @llvm.x86.avx.maskstore.ps(i8 *, <4 x MfORi32>, <4 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x MdORi64>, <4 x double>)

define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
                                <4 x i64>) nounwind alwaysinline {
  %mask32 = trunc <4 x i64> %2 to <4 x i32>

  %ptr = bitcast <4 x i32> * %0 to i8 *
  %val = bitcast <4 x i32> %1 to <4 x float>
  %mask = bitcast <4 x i32> %mask32 to <4 x MfORi32>
  call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x MfORi32> %mask, <4 x float> %val)
  ret void
}

define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
                                <4 x i64>) nounwind alwaysinline {
  %ptr = bitcast <4 x i64> * %0 to i8 *
  %val = bitcast <4 x i64> %1 to <4 x double>
  %mask = bitcast <4 x i64> %2 to <4 x MdORi64>
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x MdORi64> %mask, <4 x double> %val)
  ret void
}
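
;; As with the masked loads, AVX only has float/double maskstore
;; instructions, so the integer stores above bitcast the pointer, value,
;; and mask to the floating-point forms before calling them.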

masked_store_blend_8_16_by_4_mask64()

;; sse intrinsic
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                             <4 x float>) nounwind readnone

define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i64>) nounwind alwaysinline {
  %mask = trunc <4 x i64> %2 to <4 x i32>
  %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
  %oldValue = load PTR_OP_ARGS(` <4 x i32>') %0, align 4
  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
                                                     <4 x float> %newAsFloat,
                                                     <4 x float> %mask_as_float)
  %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
  ret void
}
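
;; The blend-based masked stores do an unconditional read-modify-write:
;; load the old contents, blendv them against the new values (blendv
;; selects per element based on the sign bit of the mask operand), and
;; store the merged vector back.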

;; avx intrinsic
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
                                                 <4 x double>) nounwind readnone

define void @__masked_store_blend_i64(<4 x i64>* nocapture, <4 x i64>,
                                      <4 x i64>) nounwind alwaysinline {
  %mask_as_double = bitcast <4 x i64> %2 to <4 x double>
  %oldValue = load PTR_OP_ARGS(` <4 x i64>') %0, align 4
  %oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double>
  %newAsDouble = bitcast <4 x i64> %1 to <4 x double>
  %blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble,
                                                         <4 x double> %newAsDouble,
                                                         <4 x double> %mask_as_double)
  %blendAsInt = bitcast <4 x double> %blend to <4 x i64>
  store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4
  ret void
}

masked_store_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
  %call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1)
  ret <4 x double> %call
}

define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
  %call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1)
  ret <4 x double> %call
}

rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()

builtins/target-avx1-x2.ll (new file, 81 lines)
@@ -0,0 +1,81 @@
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;     * Redistributions of source code must retain the above copyright
;;       notice, this list of conditions and the following disclaimer.
;;
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;
;;     * Neither the name of Intel Corporation nor the names of its
;;       contributors may be used to endorse or promote products derived from
;;       this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

include(`target-avx-x2.ll')

rdrand_decls()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <16 x i32> %ret
}
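
;; binary4to16(result, type, intrinsic, a, b) is a shared m4 helper; as
;; a sketch, it splits the two 16-wide operands into four 4-wide pieces,
;; applies the 4-wide SSE intrinsic to each pair of pieces, and
;; reassembles the partial results into %result.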

define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <16 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <16 x i32> %ret
}

define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <16 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')
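
;; The ifelse() above emits these declarations only when NO_HALF_DECLARES
;; is not defined as `1', so targets that provide their own half
;; conversion definitions can suppress the generic declares.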

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

builtins/target-avx1.ll (new file, 82 lines)
@@ -0,0 +1,82 @@
;; Copyright (c) 2010-2013, Intel Corporation
;; All rights reserved.
;; (BSD 3-clause license text, identical to the header above)

include(`target-avx.ll')

rdrand_decls()
saturation_arithmetic()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <8 x i32> %ret
}

define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <8 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <8 x i32> %ret
}

define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <8 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

builtins/target-avx11-i64x4.ll (new file, 119 lines)
@@ -0,0 +1,119 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;; (BSD 3-clause license text, identical to the header above)

include(`target-avx1-i64x4base.ll')

rdrand_definition()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %m
}

define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %m
}

define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

define(`expand_4to8', `
  %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
')
define(`extract_4from8', `
  %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
')
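
;; expand_4to8/extract_4from8 bridge the 4-wide program width and the
;; 8-wide F16C intrinsics: the first widens a 4-element vector to 8
;; elements (repeating the payload in the upper lanes), and the second
;; takes the low 4 elements back out of an 8-wide result.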

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone {
  expand_4to8(i16, v4, v)
  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
  extract_4from8(float, r, ret)
  ret <4 x float> %ret
}

define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone {
  expand_4to8(float, v4, v)
  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
  extract_4from8(i16, r, ret)
  ret <4 x i16> %ret
}

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %v1 = bitcast i16 %v to <1 x i16>
  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
  %r = extractelement <8 x float> %rv, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %v1 = bitcast float %v to <1 x float>
  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  ; round to nearest even
  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
}
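
;; There is no scalar form of the F16C conversion instructions, so the
;; uniform variants place the value in lane 0 of an 8-wide vector (the
;; remaining lanes are undef), run the vector intrinsic, and extract
;; lane 0 of the result.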

builtins/target-avx11-x2.ll (new file, 125 lines)
@@ -0,0 +1,125 @@
;; Copyright (c) 2012-2013, Intel Corporation
;; All rights reserved.
;; (BSD 3-clause license text, identical to the header above)

include(`target-avx-x2.ll')

rdrand_definition()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <16 x i32> %ret
}

define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <16 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <16 x i32> %ret
}

define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <16 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
  %r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
  %r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
  %r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %r
}

define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
  %r_0 = shufflevector <16 x float> %v, <16 x float> undef,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
  %r_1 = shufflevector <16 x float> %v, <16 x float> undef,
           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
  %r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %r
}

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %v1 = bitcast i16 %v to <1 x i16>
  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
  %r = extractelement <8 x float> %rv, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %v1 = bitcast float %v to <1 x float>
  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  ; round to nearest even
  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
}

builtins/target-avx11.ll (new file, 110 lines)
@@ -0,0 +1,110 @@
;; Copyright (c) 2012-2013, Intel Corporation
;; All rights reserved.
;; (BSD 3-clause license text, identical to the header above)

include(`target-avx.ll')

rdrand_definition()
saturation_arithmetic()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <8 x i32> %ret
}

define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <8 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <8 x i32> %ret
}

define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <8 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
  ret <8 x float> %r
}

define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
  ret <8 x i16> %r
}

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %v1 = bitcast i16 %v to <1 x i16>
  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
  %r = extractelement <8 x float> %rv, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %v1 = bitcast float %v to <1 x float>
  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  ; round to nearest even
  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
}

builtins/target-avx2-i64x4.ll (new file, 342 lines)
@@ -0,0 +1,342 @@
;; Copyright (c) 2013, Intel Corporation
;; All rights reserved.
;; (BSD 3-clause license text, identical to the header above)

define(`HAVE_GATHER', `1')

include(`target-avx1-i64x4base.ll')

rdrand_definition()
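
;; Defining HAVE_GATHER before the include presumably signals to the
;; shared base target that native gather instructions are available, so
;; it can skip its generic gather emulation in favor of the AVX2
;; definitions later in this file.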

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

;; declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
;; declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readonly

define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %m = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %m
}

define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %m = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

;; declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readonly
;; declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readonly

define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %m = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %m
}

define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %m = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

define(`expand_4to8', `
  %$3 = shufflevector <4 x $1> %$2, <4 x $1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
')
define(`extract_4from8', `
  %$3 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
')

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

define <4 x float> @__half_to_float_varying(<4 x i16> %v4) nounwind readnone {
  expand_4to8(i16, v4, v)
  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
  extract_4from8(float, r, ret)
  ret <4 x float> %ret
}

define <4 x i16> @__float_to_half_varying(<4 x float> %v4) nounwind readnone {
  expand_4to8(float, v4, v)
  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
  extract_4from8(i16, r, ret)
  ret <4 x i16> %ret
}

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %v1 = bitcast i16 %v to <1 x i16>
  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
  %r = extractelement <8 x float> %rv, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %v1 = bitcast float %v to <1 x float>
  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  ; round to nearest even
  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

declare void @llvm.trap() noreturn nounwind

gen_gather(i8)
gen_gather(i16)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers

declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %target, i8 * %ptr,
                      <4 x i32> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
                      <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind

define <4 x i32> @__gather_base_offsets32_i32(i8 * %ptr,
                                              i32 %scale, <4 x i32> %offsets,
                                              <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>

  %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * %ptr,
                      <4 x i32> %offsets, <4 x i32> %vecmask, i8 %scale8)
  ret <4 x i32> %v
}
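
;; The AVX2 gather intrinsics take a pass-through vector (undef here,
;; since inactive lanes are ignored), a base pointer, per-lane indices,
;; a per-lane mask whose sign bit enables the load for that lane, and an
;; immediate scale applied to the indices.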

define <4 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
                                              i32 %scale, <4 x i64> %offsets,
                                              <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>

  %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
                      <4 x i64> %offsets, <4 x i32> %vecmask, i8 %scale8)

  ret <4 x i32> %v
}

define <4 x i32> @__gather32_i32(<4 x i32> %ptrs,
                                 <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>

  %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8 * null,
                      <4 x i32> %ptrs, <4 x i32> %vecmask, i8 1)

  ret <4 x i32> %v
}

define <4 x i32> @__gather64_i32(<4 x i64> %ptrs,
                                 <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>

  %v = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
                      <4 x i64> %ptrs, <4 x i32> %vecmask, i8 1)

  ret <4 x i32> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers

declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %target, i8 * %ptr,
                      <4 x i32> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
                      <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind

define <4 x float> @__gather_base_offsets32_float(i8 * %ptr,
                                                  i32 %scale, <4 x i32> %offsets,
                                                  <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
  %mask = bitcast <4 x i32> %vecmask to <4 x float>

  %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * %ptr,
                      <4 x i32> %offsets, <4 x float> %mask, i8 %scale8)

  ret <4 x float> %v
}

define <4 x float> @__gather_base_offsets64_float(i8 * %ptr,
                                                  i32 %scale, <4 x i64> %offsets,
                                                  <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
  %mask = bitcast <4 x i32> %vecmask to <4 x float>

  %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
                      <4 x i64> %offsets, <4 x float> %mask, i8 %scale8)

  ret <4 x float> %v
}

define <4 x float> @__gather32_float(<4 x i32> %ptrs,
                                     <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
  %mask = bitcast <4 x i32> %vecmask to <4 x float>

  %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8 * null,
                      <4 x i32> %ptrs, <4 x float> %mask, i8 1)

  ret <4 x float> %v
}

define <4 x float> @__gather64_float(<4 x i64> %ptrs,
                                     <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %vecmask = trunc <4 x i64> %vecmask64 to <4 x i32>
  %mask = bitcast <4 x i32> %vecmask to <4 x float>

  %v = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
                      <4 x i64> %ptrs, <4 x float> %mask, i8 1)

  ret <4 x float> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers

declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
                      <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
                      <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind

define <4 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
                                              i32 %scale, <4 x i32> %offsets,
                                              <4 x i64> %vecmask) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8

  %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
                      <4 x i32> %offsets, <4 x i64> %vecmask, i8 %scale8)

  ret <4 x i64> %v
}

define <4 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
                                              i32 %scale, <4 x i64> %offsets,
                                              <4 x i64> %vecmask) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8

  %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
                      <4 x i64> %offsets, <4 x i64> %vecmask, i8 %scale8)

  ret <4 x i64> %v
}

define <4 x i64> @__gather32_i64(<4 x i32> %ptrs,
                                 <4 x i64> %vecmask) nounwind readonly alwaysinline {
  %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
                      <4 x i32> %ptrs, <4 x i64> %vecmask, i8 1)
  ret <4 x i64> %v
}

define <4 x i64> @__gather64_i64(<4 x i64> %ptrs,
                                 <4 x i64> %vecmask) nounwind readonly alwaysinline {
  %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
                      <4 x i64> %ptrs, <4 x i64> %vecmask, i8 1)
  ret <4 x i64> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers

declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
                      <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
                      <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind

define <4 x double> @__gather_base_offsets32_double(i8 * %ptr,
                                                    i32 %scale, <4 x i32> %offsets,
                                                    <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double>

  %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
                      <4 x i32> %offsets, <4 x double> %vecmask, i8 %scale8)
  ret <4 x double> %v
}

define <4 x double> @__gather_base_offsets64_double(i8 * %ptr,
                                                    i32 %scale, <4 x i64> %offsets,
                                                    <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double>

  %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
                      <4 x i64> %offsets, <4 x double> %vecmask, i8 %scale8)

  ret <4 x double> %v
}

define <4 x double> @__gather32_double(<4 x i32> %ptrs,
                                       <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double>

  %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
                      <4 x i32> %ptrs, <4 x double> %vecmask, i8 1)

  ret <4 x double> %v
}

define <4 x double> @__gather64_double(<4 x i64> %ptrs,
                                       <4 x i64> %vecmask64) nounwind readonly alwaysinline {
  %vecmask = bitcast <4 x i64> %vecmask64 to <4 x double>

  %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
                      <4 x i64> %ptrs, <4 x double> %vecmask, i8 1)

  ret <4 x double> %v
}

builtins/target-avx2-x2.ll (new file, 538 lines)
@@ -0,0 +1,538 @@
;; Copyright (c) 2010-2013, Intel Corporation
;; All rights reserved.
;; (BSD 3-clause license text, identical to the header above)

define(`HAVE_GATHER', `1')

include(`target-avx-x2.ll')

rdrand_definition()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly

define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
  ret <16 x i32> %m
}

define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
  ret <16 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly

define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
  ret <16 x i32> %m
}

define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
  ret <16 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
  %r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
  %r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
  %r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %r
}

define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
  %r_0 = shufflevector <16 x float> %v, <16 x float> undef,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
  %r_1 = shufflevector <16 x float> %v, <16 x float> undef,
           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
  %r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %r
}

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %v1 = bitcast i16 %v to <1 x i16>
  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
  %r = extractelement <8 x float> %rv, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %v1 = bitcast float %v to <1 x float>
  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  ; round to nearest even
  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

declare void @llvm.trap() noreturn nounwind

; $1: type
; $2: var base name
define(`extract_4s', `
  %$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
')

; $1: type
; $2: var base name
define(`extract_8s', `
  %$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef,
            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef,
            <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')

; $1: element type
; $2: ret name
; $3: v1
; $4: v2
define(`assemble_8s', `
  %$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')

; $1: element type
; $2: ret name
; $3: v1
; $4: v2
; $5: v3
; $6: v4
define(`assemble_4s', `
  %$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4,
            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6,
            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  assemble_8s($1, $2, $2_1, $2_2)
')
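
;; extract_4s/extract_8s split a 16-wide vector into four 4-wide or two
;; 8-wide pieces so it can be fed to the narrower AVX2 gather intrinsics;
;; assemble_4s/assemble_8s invert that, stitching the partial results
;; back into a single 16-wide vector.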
|
||||
|
||||
|
||||
gen_gather(i8)
|
||||
gen_gather(i16)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int32 gathers
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
|
||||
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
|
||||
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
|
||||
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
|
||||
|
||||
define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
extract_8s(i32, offsets)
|
||||
extract_8s(i32, vecmask)
|
||||
|
||||
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
|
||||
<8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8)
|
||||
|
||||
assemble_8s(i32, v, v1, v2)
|
||||
|
||||
ret <16 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
|
||||
i32 %scale, <16 x i64> %offsets,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
%scale8 = trunc i32 %scale to i8
|
||||
|
||||
extract_4s(i32, vecmask)
|
||||
extract_4s(i64, offsets)
|
||||
|
||||
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
|
||||
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
|
||||
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8)
|
||||
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
|
||||
<4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8)
|
||||
|
||||
assemble_4s(i32, v, v1, v2, v3, v4)
|
||||
|
||||
ret <16 x i32> %v
|
||||
}
|
||||
|
||||
|
||||
define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
|
||||
<16 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
extract_8s(i32, ptrs)
|
||||
extract_8s(i32, vecmask)
|
||||
|
||||
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
|
||||
<8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1)
|
||||
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
|
||||
<8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1)
|
||||
|
||||
assemble_8s(i32, v, v1, v2)
|
||||
|
||||
ret <16 x i32> %v
|
||||
}

define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
                                  <16 x i32> %vecmask) nounwind readonly alwaysinline {
  extract_4s(i64, ptrs)
  extract_4s(i32, vecmask)

  %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
                       <4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
  %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
                       <4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
  %v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
                       <4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1)
  %v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
                       <4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1)

  assemble_4s(i32, v, v1, v2, v3, v4)

  ret <16 x i32> %v
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers

declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
                    <8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
                    <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind

define <16 x float> @__gather_base_offsets32_float(i8 * %ptr,
                                  i32 %scale, <16 x i32> %offsets,
                                  <16 x i32> %vecmask) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %mask = bitcast <16 x i32> %vecmask to <16 x float>
  extract_8s(i32, offsets)
  extract_8s(float, mask)

  %v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
                       <8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8)
  %v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
                       <8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8)

  assemble_8s(float, v, v1, v2)

  ret <16 x float> %v
}


define <16 x float> @__gather_base_offsets64_float(i8 * %ptr,
                                  i32 %scale, <16 x i64> %offsets,
                                  <16 x i32> %vecmask) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %mask = bitcast <16 x i32> %vecmask to <16 x float>
  extract_4s(i64, offsets)
  extract_4s(float, mask)

  %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
                       <4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
  %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
                       <4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
  %v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
                       <4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8)
  %v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
                       <4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8)

  assemble_4s(float, v, v1, v2, v3, v4)

  ret <16 x float> %v
}


define <16 x float> @__gather32_float(<16 x i32> %ptrs,
                                  <16 x i32> %vecmask) nounwind readonly alwaysinline {
  %mask = bitcast <16 x i32> %vecmask to <16 x float>
  extract_8s(float, mask)
  extract_8s(i32, ptrs)

  %v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
                       <8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1)
  %v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
                       <8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1)

  assemble_8s(float, v, v1, v2)

  ret <16 x float> %v
}


define <16 x float> @__gather64_float(<16 x i64> %ptrs,
                                  <16 x i32> %vecmask) nounwind readonly alwaysinline {
  %mask = bitcast <16 x i32> %vecmask to <16 x float>
  extract_4s(i64, ptrs)
  extract_4s(float, mask)

  %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
                       <4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
  %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
                       <4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
  %v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
                       <4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1)
  %v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
                       <4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1)

  assemble_4s(float, v, v1, v2, v3, v4)

  ret <16 x float> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers

declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
                    <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
                    <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind

define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
                                  i32 %scale, <16 x i32> %offsets,
                                  <16 x i32> %mask32) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask = sext <16 x i32> %mask32 to <16 x i64>
  extract_4s(i32, offsets)
  extract_4s(i64, vecmask)

  %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
  %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
  %v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
  %v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)

  assemble_4s(i64, v, v1, v2, v3, v4)

  ret <16 x i64> %v
}


define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
                                  i32 %scale, <16 x i64> %offsets,
                                  <16 x i32> %mask32) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask = sext <16 x i32> %mask32 to <16 x i64>
  extract_4s(i64, offsets)
  extract_4s(i64, vecmask)

  %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
  %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
  %v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
  %v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)

  assemble_4s(i64, v, v1, v2, v3, v4)

  ret <16 x i64> %v
}


define <16 x i64> @__gather32_i64(<16 x i32> %ptrs,
                                  <16 x i32> %mask32) nounwind readonly alwaysinline {
  %vecmask = sext <16 x i32> %mask32 to <16 x i64>
  extract_4s(i32, ptrs)
  extract_4s(i64, vecmask)

  %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
                       <4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
  %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
                       <4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
  %v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
                       <4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
  %v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
                       <4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1)

  assemble_4s(i64, v, v1, v2, v3, v4)

  ret <16 x i64> %v
}

define <16 x i64> @__gather64_i64(<16 x i64> %ptrs,
                                  <16 x i32> %mask32) nounwind readonly alwaysinline {
  %vecmask = sext <16 x i32> %mask32 to <16 x i64>
  extract_4s(i64, ptrs)
  extract_4s(i64, vecmask)

  %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
                       <4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
  %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
                       <4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
  %v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
                       <4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
  %v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
                       <4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1)

  assemble_4s(i64, v, v1, v2, v3, v4)

  ret <16 x i64> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers

declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
                    <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
                    <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind

define <16 x double> @__gather_base_offsets32_double(i8 * %ptr,
                                  i32 %scale, <16 x i32> %offsets,
                                  <16 x i32> %mask32) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
  %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
  extract_4s(i32, offsets)
  extract_4s(double, vecmask)

  %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
  %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
  %v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
  %v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i32> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)

  assemble_4s(double, v, v1, v2, v3, v4)

  ret <16 x double> %v
}


define <16 x double> @__gather_base_offsets64_double(i8 * %ptr,
                                  i32 %scale, <16 x i64> %offsets,
                                  <16 x i32> %mask32) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
  %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
  extract_4s(i64, offsets)
  extract_4s(double, vecmask)

  %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
  %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
  %v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
  %v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)

  assemble_4s(double, v, v1, v2, v3, v4)

  ret <16 x double> %v
}


define <16 x double> @__gather32_double(<16 x i32> %ptrs,
                                  <16 x i32> %mask32) nounwind readonly alwaysinline {
  %vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
  %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
  extract_4s(i32, ptrs)
  extract_4s(double, vecmask)

  %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
                       <4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
  %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
                       <4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
  %v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
                       <4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1)
  %v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
                       <4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1)

  assemble_4s(double, v, v1, v2, v3, v4)

  ret <16 x double> %v
}


define <16 x double> @__gather64_double(<16 x i64> %ptrs,
                                  <16 x i32> %mask32) nounwind readonly alwaysinline {
  %vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
  %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
  extract_4s(i64, ptrs)
  extract_4s(double, vecmask)

  %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
                       <4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
  %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
                       <4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
  %v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
                       <4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1)
  %v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
                       <4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1)

  assemble_4s(double, v, v1, v2, v3, v4)

  ret <16 x double> %v
}

builtins/target-avx2.ll: 409 lines (normal file)
@@ -0,0 +1,409 @@
;; Copyright (c) 2010-2013, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`HAVE_GATHER', `1')

include(`target-avx.ll')

rdrand_definition()
saturation_arithmetic()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
  ret <8 x float> %r
}

define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
  ret <8 x i16> %r
}

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %v1 = bitcast i16 %v to <1 x i16>
  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
  %r = extractelement <8 x float> %rv, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %v1 = bitcast float %v to <1 x float>
  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  ; round to nearest even
  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
}
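
;; Note: F16C only provides the packed VCVTPH2PS/VCVTPS2PH forms, so the
;; uniform (scalar) conversions above broadcast the value into lane 0 of an
;; 8-wide vector, convert, and extract element 0 of the result.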

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

declare void @llvm.trap() noreturn nounwind

define(`extract_4s', `
  %$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
')
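
;; For reference, a use such as extract_4s(i32, vecmask) expands to the two
;; shufflevector halves below; the %name_1/%name_2 convention is what the
;; gather implementations in this file rely on:
;;   %vecmask_1 = shufflevector <8 x i32> %vecmask, <8 x i32> undef,
;;                              <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;;   %vecmask_2 = shufflevector <8 x i32> %vecmask, <8 x i32> undef,
;;                              <4 x i32> <i32 4, i32 5, i32 6, i32 7>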

gen_gather(i8)
gen_gather(i16)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers

declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
                    <8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
                    <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind

define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr,
                                  i32 %scale, <8 x i32> %offsets,
                                  <8 x i32> %vecmask) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8

  %v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
                      <8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale8)

  ret <8 x i32> %v
}
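
;; Note: the gather intrinsics take the scale operand as an i8 immediate
;; (the SIB-style scale the instruction encodes), so after inlining the
;; truncated %scale is expected to fold to a constant 1, 2, 4, or 8.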

define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
                                  i32 %scale, <8 x i64> %offsets,
                                  <8 x i32> %vecmask) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  extract_4s(i32, vecmask)
  extract_4s(i64, offsets)

  %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
                       <4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
  %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
                       <4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)

  %v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %v
}
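
;; With 64-bit offsets, VPGATHERQD consumes a <4 x i64> index vector but
;; produces only a <4 x i32> result, so the 8-wide gather above runs as two
;; 4-wide halves that are stitched back together with a shufflevector.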

define <8 x i32> @__gather32_i32(<8 x i32> %ptrs,
                                 <8 x i32> %vecmask) nounwind readonly alwaysinline {
  %v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
                      <8 x i32> %ptrs, <8 x i32> %vecmask, i8 1)
  ret <8 x i32> %v
}


define <8 x i32> @__gather64_i32(<8 x i64> %ptrs,
                                 <8 x i32> %vecmask) nounwind readonly alwaysinline {
  extract_4s(i64, ptrs)
  extract_4s(i32, vecmask)

  %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
                       <4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
  %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
                       <4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)

  %v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %v
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers

declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
                    <8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
                    <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind

define <8 x float> @__gather_base_offsets32_float(i8 * %ptr,
                                  i32 %scale, <8 x i32> %offsets,
                                  <8 x i32> %vecmask) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %mask = bitcast <8 x i32> %vecmask to <8 x float>

  %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
                      <8 x i32> %offsets, <8 x float> %mask, i8 %scale8)

  ret <8 x float> %v
}


define <8 x float> @__gather_base_offsets64_float(i8 * %ptr,
                                  i32 %scale, <8 x i64> %offsets,
                                  <8 x i32> %vecmask) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %mask = bitcast <8 x i32> %vecmask to <8 x float>
  extract_4s(i64, offsets)
  extract_4s(float, mask)

  %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
                       <4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
  %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
                       <4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)

  %v = shufflevector <4 x float> %v1, <4 x float> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v
}


define <8 x float> @__gather32_float(<8 x i32> %ptrs,
                                 <8 x i32> %vecmask) nounwind readonly alwaysinline {
  %mask = bitcast <8 x i32> %vecmask to <8 x float>

  %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
                      <8 x i32> %ptrs, <8 x float> %mask, i8 1)

  ret <8 x float> %v
}


define <8 x float> @__gather64_float(<8 x i64> %ptrs,
                                 <8 x i32> %vecmask) nounwind readonly alwaysinline {
  %mask = bitcast <8 x i32> %vecmask to <8 x float>
  extract_4s(i64, ptrs)
  extract_4s(float, mask)

  %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
                       <4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
  %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
                       <4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)

  %v = shufflevector <4 x float> %v1, <4 x float> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers

declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
                    <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
                    <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind

define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
                                  i32 %scale, <8 x i32> %offsets,
                                  <8 x i32> %mask32) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask = sext <8 x i32> %mask32 to <8 x i64>
  extract_4s(i32, offsets)
  extract_4s(i64, vecmask)

  %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
  %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)

  %v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %v
}
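
;; The AVX2 gather mask must match the element width of the data being
;; loaded: each lane is enabled by the sign bit of its mask element.
;; Sign-extending the 32-bit mask to <8 x i64> preserves that bit for the
;; 64-bit element gathers above and below.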

define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
                                  i32 %scale, <8 x i64> %offsets,
                                  <8 x i32> %mask32) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask = sext <8 x i32> %mask32 to <8 x i64>
  extract_4s(i64, offsets)
  extract_4s(i64, vecmask)

  %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
  %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
                       <4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)

  %v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %v
}


define <8 x i64> @__gather32_i64(<8 x i32> %ptrs,
                                 <8 x i32> %mask32) nounwind readonly alwaysinline {
  %vecmask = sext <8 x i32> %mask32 to <8 x i64>

  extract_4s(i32, ptrs)
  extract_4s(i64, vecmask)

  %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
                       <4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
  %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
                       <4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
  %v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %v
}


define <8 x i64> @__gather64_i64(<8 x i64> %ptrs,
                                 <8 x i32> %mask32) nounwind readonly alwaysinline {
  %vecmask = sext <8 x i32> %mask32 to <8 x i64>
  extract_4s(i64, ptrs)
  extract_4s(i64, vecmask)

  %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
                       <4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
  %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
                       <4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)

  %v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers

declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
                    <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
                    <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind

define <8 x double> @__gather_base_offsets32_double(i8 * %ptr,
                                  i32 %scale, <8 x i32> %offsets,
                                  <8 x i32> %mask32) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
  %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
  extract_4s(i32, offsets)
  extract_4s(double, vecmask)

  %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
  %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)

  %v = shufflevector <4 x double> %v1, <4 x double> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %v
}
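
;; For the pd gathers the mask operand is typed <4 x double>, so the
;; sign-extended 64-bit mask is bitcast to double lanes; the bit pattern,
;; and with it the per-lane sign bit the instruction tests, is unchanged.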

define <8 x double> @__gather_base_offsets64_double(i8 * %ptr,
                                  i32 %scale, <8 x i64> %offsets,
                                  <8 x i32> %mask32) nounwind readonly alwaysinline {
  %scale8 = trunc i32 %scale to i8
  %vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
  %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
  extract_4s(i64, offsets)
  extract_4s(double, vecmask)

  %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
  %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
                       <4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)

  %v = shufflevector <4 x double> %v1, <4 x double> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %v
}

define <8 x double> @__gather32_double(<8 x i32> %ptrs,
                                  <8 x i32> %mask32) nounwind readonly alwaysinline {
  %vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
  %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
  extract_4s(i32, ptrs)
  extract_4s(double, vecmask)

  %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
                       <4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
  %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
                       <4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)

  %v = shufflevector <4 x double> %v1, <4 x double> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %v
}

define <8 x double> @__gather64_double(<8 x i64> %ptrs,
                                  <8 x i32> %mask32) nounwind readonly alwaysinline {
  %vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
  %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
  extract_4s(i64, ptrs)
  extract_4s(double, vecmask)

  %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
                       <4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
  %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
                       <4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)

  %v = shufflevector <4 x double> %v1, <4 x double> %v2,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

  ret <8 x double> %v
}

builtins/target-avx512-common.ll: 1310 lines (normal file)
(file diff suppressed because it is too large)

builtins/target-generic-1.ll: 1036 lines (normal file)
(file diff suppressed because it is too large)

builtins/target-generic-16.ll: 34 lines (normal file)
@@ -0,0 +1,34 @@
;; Copyright (c) 2010-2014, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`16')
include(`target-generic-common.ll')
saturation_arithmetic_novec()

builtins/target-generic-32.ll: 34 lines (normal file)
@@ -0,0 +1,34 @@
;; Copyright (c) 2010-2014, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`32')
include(`target-generic-common.ll')
saturation_arithmetic_novec()

builtins/target-generic-4.ll: 34 lines (normal file)
@@ -0,0 +1,34 @@
;; Copyright (c) 2010-2014, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`4')
include(`target-generic-common.ll')
saturation_arithmetic_novec()

builtins/target-generic-64.ll: 34 lines (normal file)
@@ -0,0 +1,34 @@
;; Copyright (c) 2010-2014, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`64')
include(`target-generic-common.ll')
saturation_arithmetic_novec()

builtins/target-generic-8.ll: 34 lines (normal file)
@@ -0,0 +1,34 @@
;; Copyright (c) 2010-2014, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`8')
include(`target-generic-common.ll')
saturation_arithmetic_novec()

builtins/target-generic-common.ll: 394 lines (normal file)
@@ -0,0 +1,394 @@
;; Copyright (c) 2010-2015, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128";

define(`MASK',`i1')
define(`HAVE_GATHER',`1')
define(`HAVE_SCATTER',`1')

include(`util.m4')

stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; broadcast/rotate/shuffle

declare <WIDTH x float> @__smear_float(float) nounwind readnone
declare <WIDTH x double> @__smear_double(double) nounwind readnone
declare <WIDTH x i8> @__smear_i8(i8) nounwind readnone
declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone

declare <WIDTH x float> @__setzero_float() nounwind readnone
declare <WIDTH x double> @__setzero_double() nounwind readnone
declare <WIDTH x i8> @__setzero_i8() nounwind readnone
declare <WIDTH x i16> @__setzero_i16() nounwind readnone
declare <WIDTH x i32> @__setzero_i32() nounwind readnone
declare <WIDTH x i64> @__setzero_i64() nounwind readnone

declare <WIDTH x float> @__undef_float() nounwind readnone
declare <WIDTH x double> @__undef_double() nounwind readnone
declare <WIDTH x i8> @__undef_i8() nounwind readnone
declare <WIDTH x i16> @__undef_i16() nounwind readnone
declare <WIDTH x i32> @__undef_i32() nounwind readnone
declare <WIDTH x i64> @__undef_i64() nounwind readnone

declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
declare <WIDTH x i16> @__broadcast_i16(<WIDTH x i16>, i32) nounwind readnone
declare <WIDTH x i32> @__broadcast_i32(<WIDTH x i32>, i32) nounwind readnone
declare <WIDTH x i64> @__broadcast_i64(<WIDTH x i64>, i32) nounwind readnone

declare <WIDTH x i8> @__rotate_i8(<WIDTH x i8>, i32) nounwind readnone
declare <WIDTH x i16> @__rotate_i16(<WIDTH x i16>, i32) nounwind readnone
declare <WIDTH x float> @__rotate_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone

declare <WIDTH x i8> @__shift_i8(<WIDTH x i8>, i32) nounwind readnone
declare <WIDTH x i16> @__shift_i16(<WIDTH x i16>, i32) nounwind readnone
declare <WIDTH x float> @__shift_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x i32> @__shift_i32(<WIDTH x i32>, i32) nounwind readnone
declare <WIDTH x double> @__shift_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i64> @__shift_i64(<WIDTH x i64>, i32) nounwind readnone

declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
                                    <WIDTH x i32>) nounwind readnone
declare <WIDTH x i16> @__shuffle_i16(<WIDTH x i16>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i16> @__shuffle2_i16(<WIDTH x i16>, <WIDTH x i16>,
                                      <WIDTH x i32>) nounwind readnone
declare <WIDTH x float> @__shuffle_float(<WIDTH x float>,
                                         <WIDTH x i32>) nounwind readnone
declare <WIDTH x float> @__shuffle2_float(<WIDTH x float>, <WIDTH x float>,
                                          <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__shuffle_i32(<WIDTH x i32>,
                                     <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__shuffle2_i32(<WIDTH x i32>, <WIDTH x i32>,
                                      <WIDTH x i32>) nounwind readnone
declare <WIDTH x double> @__shuffle_double(<WIDTH x double>,
                                           <WIDTH x i32>) nounwind readnone
declare <WIDTH x double> @__shuffle2_double(<WIDTH x double>,
                                            <WIDTH x double>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__shuffle_i64(<WIDTH x i64>,
                                     <WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__shuffle2_i64(<WIDTH x i64>, <WIDTH x i64>,
                                      <WIDTH x i32>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; aos/soa

declare void @__soa_to_aos3_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
                                  <WIDTH x float> %v2, float * noalias %p) nounwind
declare void @__aos_to_soa3_float(float * noalias %p, <WIDTH x float> * %out0,
                                  <WIDTH x float> * %out1, <WIDTH x float> * %out2) nounwind
declare void @__soa_to_aos4_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
                                  <WIDTH x float> %v2, <WIDTH x float> %v3,
                                  float * noalias %p) nounwind
declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias %out0,
                                  <WIDTH x float> * noalias %out1,
                                  <WIDTH x float> * noalias %out2,
                                  <WIDTH x float> * noalias %out3) nounwind

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math

declare void @__fastmath() nounwind

;; round/floor/ceil

declare float @__round_uniform_float(float) nounwind readnone
declare float @__floor_uniform_float(float) nounwind readnone
declare float @__ceil_uniform_float(float) nounwind readnone

declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone

declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone

;; min/max

declare float @__max_uniform_float(float, float) nounwind readnone
declare float @__min_uniform_float(float, float) nounwind readnone
declare i32 @__min_uniform_int32(i32, i32) nounwind readnone
declare i32 @__max_uniform_int32(i32, i32) nounwind readnone
declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone
declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone
declare i64 @__min_uniform_int64(i64, i64) nounwind readnone
declare i64 @__max_uniform_int64(i64, i64) nounwind readnone
declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone
declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone
declare double @__min_uniform_double(double, double) nounwind readnone
declare double @__max_uniform_double(double, double) nounwind readnone

declare <WIDTH x float> @__max_varying_float(<WIDTH x float>,
                                             <WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__min_varying_float(<WIDTH x float>,
                                             <WIDTH x float>) nounwind readnone
declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x double> @__min_varying_double(<WIDTH x double>,
                                               <WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
                                               <WIDTH x double>) nounwind readnone

;; sqrt/rsqrt/rcp

declare float @__rsqrt_uniform_float(float) nounwind readnone
declare float @__rcp_uniform_float(float) nounwind readnone
declare float @__sqrt_uniform_float(float) nounwind readnone
declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone

declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone

declare double @__sqrt_uniform_double(double) nounwind readnone
declare <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone

;; bit ops

declare i32 @__popcnt_int32(i32) nounwind readnone
declare i64 @__popcnt_int64(i64) nounwind readnone

declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone
declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
declare i64 @__count_leading_zeros_i64(i64) nounwind readnone

; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones twice with our 8-wide
; vectors...

;; svml

include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)
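
;; A minimal sketch of the second option in the FIXME above: call a 4-wide
;; SVML entry point twice on the halves of a wider vector. The entry-point
;; name __svml_sinf4 is illustrative of the 4-wide interface, not something
;; this file declares or relies on:
;;   declare <4 x float> @__svml_sinf4(<4 x float>)
;;   define <8 x float> @__svml_sin8(<8 x float> %x) {
;;     %lo = shufflevector <8 x float> %x, <8 x float> undef,
;;                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;;     %hi = shufflevector <8 x float> %x, <8 x float> undef,
;;                         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
;;     %rlo = call <4 x float> @__svml_sinf4(<4 x float> %lo)
;;     %rhi = call <4 x float> @__svml_sinf4(<4 x float> %hi)
;;     %r = shufflevector <4 x float> %rlo, <4 x float> %rhi,
;;            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;;     ret <8 x float> %r
;;   }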

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions

declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
declare i1 @__any(<WIDTH x i1>) nounwind readnone
declare i1 @__all(<WIDTH x i1>) nounwind readnone
declare i1 @__none(<WIDTH x i1>) nounwind readnone

declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone

declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone

declare i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone

declare double @__reduce_add_double(<WIDTH x double>) nounwind readnone
declare double @__reduce_min_double(<WIDTH x double>) nounwind readnone
declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone

declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x float> @__masked_load_float(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x double> @__masked_load_double(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly

declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
                                <WIDTH x i1>) nounwind
declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
                                 <WIDTH x i1>) nounwind
declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
                                 <WIDTH x i1>) nounwind
declare void @__masked_store_float(<WIDTH x float>* nocapture, <WIDTH x float>,
                                   <WIDTH x i1>) nounwind
declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
                                 <WIDTH x i1> %mask) nounwind
declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
                                    <WIDTH x i1> %mask) nounwind


define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
                                     <WIDTH x i1>) nounwind alwaysinline {
  %v = load PTR_OP_ARGS(`<WIDTH x i8> ') %0
  %v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
  store <WIDTH x i8> %v1, <WIDTH x i8> * %0
  ret void
}
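
;; The blend stores emulate a masked store without masked-store hardware:
;; load the current destination vector, select the new value in lanes where
;; the mask is set (keeping the old value elsewhere), and store the whole
;; vector back. Note the read-modify-write is not atomic, so this is unsafe
;; if another thread may concurrently write the masked-off lanes.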
|
||||
|
||||
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load PTR_OP_ARGS(`<WIDTH x i16> ') %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
|
||||
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load PTR_OP_ARGS(`<WIDTH x i32> ') %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
|
||||
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
|
||||
<WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load PTR_OP_ARGS(`<WIDTH x float> ') %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x float> %1, <WIDTH x float> %v
|
||||
store <WIDTH x float> %v1, <WIDTH x float> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
|
||||
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load PTR_OP_ARGS(`<WIDTH x i64> ') %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
|
||||
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
|
||||
<WIDTH x double>, <WIDTH x i1>) nounwind alwaysinline {
|
||||
%v = load PTR_OP_ARGS(`<WIDTH x double> ') %0
|
||||
%v1 = select <WIDTH x i1> %2, <WIDTH x double> %1, <WIDTH x double> %v
|
||||
store <WIDTH x double> %v1, <WIDTH x double> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
                                                 <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
                                                 <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
                                    <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
                                    <WIDTH x i1>) nounwind readonly

declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
                                          <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
                                          <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
                             <WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
                             <WIDTH x i1>) nounwind
')

gather_scatter(i8)
gather_scatter(i16)
gather_scatter(i32)
gather_scatter(float)
gather_scatter(i64)
gather_scatter(double)

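;; packed_load_active/packed_store_active move values for just the active
;; lanes to/from consecutive memory locations and return the number of
;; active lanes processed.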
declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
                                  <WIDTH x i1>) nounwind
declare i32 @__packed_store_active(i32 * nocapture, <WIDTH x i32> %vals,
                                   <WIDTH x i1>) nounwind
declare i32 @__packed_store_active2(i32 * nocapture, <WIDTH x i32> %vals,
                                    <WIDTH x i1>) nounwind

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetch

declare void @__prefetch_read_uniform_1(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind

declare void @__prefetch_read_varying_1(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_1_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_2(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_2_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_3(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_3_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_nt(<WIDTH x i64> %addr, <WIDTH x MASK> %mask) nounwind
declare void @__prefetch_read_varying_nt_native(i8 * %base, i32 %scale, <WIDTH x i32> %offsets, <WIDTH x MASK> %mask) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define_avgs()
declare_nvptx()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()
74  builtins/target-knl.ll  Normal file
@@ -0,0 +1,74 @@
;; Copyright (c) 2015-2016, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;     * Redistributions of source code must retain the above copyright
;;       notice, this list of conditions and the following disclaimer.
;;
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;
;;     * Neither the name of Intel Corporation nor the names of its
;;       contributors may be used to endorse or promote products derived from
;;       this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`16')

ifelse(LLVM_VERSION, LLVM_3_7,
         `include(`target-avx512-common.ll')',
       LLVM_VERSION, LLVM_3_8,
         `include(`target-avx512-common.ll')',
       LLVM_VERSION, LLVM_3_9,
         `include(`target-avx512-common.ll')',
       LLVM_VERSION, LLVM_4_0,
         `include(`target-avx512-common.ll')',
       LLVM_VERSION, LLVM_5_0,
         `include(`target-avx512-common.ll')'
      )

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp, rsqrt

define(`rcp_rsqrt_varying_float_knl',`
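;; Note: per the AVX-512 masked-intrinsic convention, the extra operands are
;; a passthrough vector (undef here), a lane mask (i16 -1 enables all 16
;; lanes), and a rounding/SAE control word (8 should be the "suppress all
;; exceptions" encoding).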
declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8)
  ret <16 x float> %res
}
')

ifelse(LLVM_VERSION, LLVM_3_7,
       rcp_rsqrt_varying_float_knl(),
       LLVM_VERSION, LLVM_3_8,
       rcp_rsqrt_varying_float_knl(),
       LLVM_VERSION, LLVM_3_9,
       rcp_rsqrt_varying_float_knl(),
       LLVM_VERSION, LLVM_4_0,
       rcp_rsqrt_varying_float_knl(),
       LLVM_VERSION, LLVM_5_0,
       rcp_rsqrt_varying_float_knl()
      )

;;saturation_arithmetic_novec()
527  builtins/target-neon-16.ll  Normal file
@@ -0,0 +1,527 @@
;;
;; target-neon-16.ll
;;
;;  Copyright(c) 2013-2015 Google, Inc.
;;
;;  All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;     * Redistributions of source code must retain the above copyright
;;       notice, this list of conditions and the following disclaimer.
;;
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;
;;     * Neither the name of Matt Pharr nor the names of its
;;       contributors may be used to endorse or promote products derived from
;;       this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`8')
define(`MASK',`i16')

include(`util.m4')
include(`target-neon-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone alwaysinline {
  unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v)
  ret <8 x float> %r
}

define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone alwaysinline {
  unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v)
  ret <8 x i16> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math

;; round/floor/ceil

;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these.  Is there a better approach for NEON?

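;; The trick below: adding 2^23 (8.388608e+06) to |x| and subtracting it
;; again yields x rounded to the nearest integer, because floats in
;; [2^23, 2^24) have no fraction bits.  The sign bit is split off first and
;; restored by the final xor.  floor/ceil then nudge lanes that rounded the
;; wrong way by -1.0 / +1.0 (bit patterns -1082130432 and 1065353216).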
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
  %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32>
  %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i,
      <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
       i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
  %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float>
  %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i,
      <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
       float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
  %binop21.i = fadd <8 x float> %binop.i,
      <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
       float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
  %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32>
  %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
  %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float>
  ret <8 x float> %int_to_float_bitcast.i.i.i
}

define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
  %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
  %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0
  %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
  %bitop.i = and <8 x i32> %val_to_boolvec32.i,
      <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
       i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
  %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
  %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
  ret <8 x float> %binop.i
}

define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
  %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
  %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0
  %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
  %bitop.i = and <8 x i32> %val_to_boolvec32.i,
      <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
       i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
  %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
  %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
  ret <8 x float> %binop.i
}

;; FIXME: rounding doubles and double vectors needs to be implemented
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max

declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone

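;; binary4to8/unary4to8 (from util.m4) apply a 4-wide NEON intrinsic to the
;; low and high halves of an 8-wide vector and reassemble the result.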
define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
                                            <WIDTH x float>) nounwind readnone alwaysinline {
  binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
  ret <WIDTH x float> %r
}

define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
                                            <WIDTH x float>) nounwind readnone alwaysinline {
  binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1)
  ret <WIDTH x float> %r
}

declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1)
  ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1)
  ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1)
  ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1)
  ret <WIDTH x i32> %r
}

;; sqrt/rsqrt/rcp

declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

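;; vrecpe produces a rough reciprocal estimate; vrecps(d, x) computes
;; (2 - d*x), so multiplying the estimate by it is one Newton-Raphson
;; refinement step.  Two steps give near full float precision.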
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d)
  binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0)
  %x1 = fmul <WIDTH x float> %x0, %x0_nr
  binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1)
  %x2 = fmul <WIDTH x float> %x1, %x1_nr
  ret <WIDTH x float> %x2
}

declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

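;; vrsqrte produces a rough 1/sqrt estimate; vrsqrts(d, x*x) computes
;; (3 - d*x*x)/2, so x * vrsqrts(d, x*x) is one Newton-Raphson step.
;; Again, two steps are applied.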
define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d)
  %x0_2 = fmul <WIDTH x float> %x0, %x0
  binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2)
  %x1 = fmul <WIDTH x float> %x0, %x0_nr
  %x1_2 = fmul <WIDTH x float> %x1, %x1
  binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2)
  %x2 = fmul <WIDTH x float> %x1, %x1_nr
  ret <WIDTH x float> %x2
}

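;; The uniform variants broadcast the scalar into lane 0 of a vector,
;; reuse the varying implementation, and extract lane 0 of the result.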
define float @__rsqrt_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs)
  %r = extractelement <8 x float> %vr, i32 0
  ret float %r
}

define float @__rcp_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs)
  %r = extractelement <8 x float> %vr, i32 0
  ret float %r
}

declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline {
  unary4to8(result, float, @llvm.sqrt.v4f32, %0)
  ;; this returns nan for v=0, which is undesirable..
  ;; %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
  ;; %result = fmul <4 x float> %rsqrt, %0
  ret <8 x float> %result
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone alwaysinline {
  unary4to8(r, double, @llvm.sqrt.v4f64, %0)
  ret <WIDTH x double> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions

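;; __movmsk packs the vector mask into a bitmask: each lane is ANDed with
;; its bit value (1 << lane), then pairwise long adds (vpaddlu) collapse
;; the disjoint bits into a single integer.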
define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone alwaysinline {
  %and_mask = and <WIDTH x i16> %0,
      <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
  %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask)
  %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4)
  %va = extractelement <2 x i64> %v2, i32 0
  %vb = extractelement <2 x i64> %v2, i32 1
  %v = or i64 %va, %vb
  ret i64 %v
}

define i1 @__any(<WIDTH x MASK>) nounwind readnone alwaysinline {
  v8tov4(MASK, %0, %v0123, %v4567)
  %vor = or <4 x MASK> %v0123, %v4567
  %v0 = extractelement <4 x MASK> %vor, i32 0
  %v1 = extractelement <4 x MASK> %vor, i32 1
  %v2 = extractelement <4 x MASK> %vor, i32 2
  %v3 = extractelement <4 x MASK> %vor, i32 3
  %v01 = or MASK %v0, %v1
  %v23 = or MASK %v2, %v3
  %v = or MASK %v01, %v23
  %cmp = icmp ne MASK %v, 0
  ret i1 %cmp
}

define i1 @__all(<WIDTH x MASK>) nounwind readnone alwaysinline {
  v8tov4(MASK, %0, %v0123, %v4567)
  %vand = and <4 x MASK> %v0123, %v4567
  %v0 = extractelement <4 x MASK> %vand, i32 0
  %v1 = extractelement <4 x MASK> %vand, i32 1
  %v2 = extractelement <4 x MASK> %vand, i32 2
  %v3 = extractelement <4 x MASK> %vand, i32 3
  %v01 = and MASK %v0, %v1
  %v23 = and MASK %v2, %v3
  %v = and MASK %v01, %v23
  %cmp = icmp ne MASK %v, 0
  ret i1 %cmp
}

define i1 @__none(<WIDTH x MASK>) nounwind readnone alwaysinline {
  %any = call i1 @__any(<WIDTH x MASK> %0)
  %none = icmp eq i1 %any, 0
  ret i1 %none
}

;; $1: scalar type
;; $2: vector/vector reduce function (2 x <WIDTH x vec> -> <WIDTH x vec>)
;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>)
;; $4: scalar reduce function

define(`neon_reduce', `
  v8tov4($1, %0, %v0123, %v4567)
  %v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8)
  %vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef,
      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  v4tov2($1, %vfirst_4, %v0, %v1)
  %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1)
  %vh0 = extractelement <2 x $1> %vh, i32 0
  %vh1 = extractelement <2 x $1> %vh, i32 1
  %r = call $1 $4($1 %vh0, $1 %vh1)
  ret $1 %r
')

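;; The reductions below instantiate neon_reduce with a full-width op, a
;; 2-wide pairwise NEON op, and a scalar op for the final two elements.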
declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @add_f32(float, float) nounwind readnone alwaysinline {
  %r = fadd float %0, %1
  ret float %r
}

define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone alwaysinline {
  %r = fadd <WIDTH x float> %0, %1
  ret <WIDTH x float> %r
}

define float @__reduce_add_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
}

declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @min_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp olt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
}

define float @__reduce_min_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
}

declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @max_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp ugt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
}

define float @__reduce_max_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}

declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone

define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone alwaysinline {
  %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0)
  %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16)
  %a0 = extractelement <2 x i32> %a32, i32 0
  %a1 = extractelement <2 x i32> %a32, i32 1
  %r = add i32 %a0, %a1
  %r16 = trunc i32 %r to i16
  ret i16 %r16
}

declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<WIDTH x i16>)

define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone alwaysinline {
  %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<WIDTH x i16> %0)
  %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1)
  %aa = extractelement <2 x i64> %a2, i32 0
  %ab = extractelement <2 x i64> %a2, i32 1
  %r = add i64 %aa, %ab
  ret i64 %r
}

declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  v8tov4(i32, %0, %va, %vb)
  %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
  %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
  %psum = add <2 x i64> %pa, %pb
  %a0 = extractelement <2 x i64> %psum, i32 0
  %a1 = extractelement <2 x i64> %psum, i32 1
  %r = add i64 %a0, %a1
  ret i64 %r
}

declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @min_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp slt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
}

declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @max_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp sgt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
}

declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @min_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ult i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
}

declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @max_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ugt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
}

define double @__reduce_add_double(<WIDTH x double>) nounwind readnone alwaysinline {
  v8tov2(double, %0, %v0, %v1, %v2, %v3)
  %v01 = fadd <2 x double> %v0, %v1
  %v23 = fadd <2 x double> %v2, %v3
  %sum = fadd <2 x double> %v01, %v23
  %e0 = extractelement <2 x double> %sum, i32 0
  %e1 = extractelement <2 x double> %sum, i32 1
  %m = fadd double %e0, %e1
  ret double %m
}

define double @__reduce_min_double(<WIDTH x double>) nounwind readnone alwaysinline {
  reduce8(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<WIDTH x double>) nounwind readnone alwaysinline {
  reduce8(double, @__max_varying_double, @__max_uniform_double)
}

define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  v8tov2(i64, %0, %v0, %v1, %v2, %v3)
  %v01 = add <2 x i64> %v0, %v1
  %v23 = add <2 x i64> %v2, %v3
  %sum = add <2 x i64> %v01, %v23
  %e0 = extractelement <2 x i64> %sum, i32 0
  %e1 = extractelement <2 x i64> %sum, i32 1
  %m = add i64 %e0, %e1
  ret i64 %m
}

define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16

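;; __avg_up maps to NEON's rounding halving add ((a + b + 1) >> 1, vrhadd);
;; __avg_down maps to the truncating halving add ((a + b) >> 1, vhadd).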
declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone alwaysinline {
  %r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
  ret <8 x i8> %r
}

declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone alwaysinline {
  %r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
  ret <8 x i8> %r
}

declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone alwaysinline {
  %r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
  ret <8 x i8> %r
}

declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone

define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone alwaysinline {
  %r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
  ret <8 x i8> %r
}

declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone alwaysinline {
  %r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
  ret <8 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone alwaysinline {
  %r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
  ret <8 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone alwaysinline {
  %r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
  ret <8 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone alwaysinline {
  %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
  ret <8 x i16> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()
saturation_arithmetic()
497  builtins/target-neon-32.ll  Normal file
@@ -0,0 +1,497 @@
;;
;; target-neon-32.ll
;;
;;  Copyright(c) 2012-2013 Matt Pharr
;;  Copyright(c) 2013, 2015 Google, Inc.
;;
;;  All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;     * Redistributions of source code must retain the above copyright
;;       notice, this list of conditions and the following disclaimer.
;;
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;
;;     * Neither the name of Matt Pharr nor the names of its
;;       contributors may be used to endorse or promote products derived from
;;       this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`4')
define(`MASK',`i32')

include(`util.m4')
include(`target-neon-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone alwaysinline {
  %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v)
  ret <4 x float> %r
}

define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone alwaysinline {
  %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v)
  ret <4 x i16> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math

;; round/floor/ceil

;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these.  Is there a better approach for NEON?

define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
  %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %bitop.i = xor <4 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
  %int_to_float_bitcast.i.i40.i = bitcast <4 x i32> %bitop.i to <4 x float>
  %binop.i = fadd <4 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
  %binop21.i = fadd <4 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
  %float_to_int_bitcast.i.i.i = bitcast <4 x float> %binop21.i to <4 x i32>
  %bitop31.i = xor <4 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
  %int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop31.i to <4 x float>
  ret <4 x float> %int_to_float_bitcast.i.i.i
}

define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
  %bincmp.i = fcmp ogt <4 x float> %calltmp.i, %0
  %val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
  %bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
  %int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
  %binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
  ret <4 x float> %binop.i
}

define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
  %bincmp.i = fcmp olt <4 x float> %calltmp.i, %0
  %val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
  %bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
  %int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
  %binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
  ret <4 x float> %binop.i
}

;; FIXME: rounding doubles and double vectors needs to be implemented
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max

declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
                                            <WIDTH x float>) nounwind readnone alwaysinline {
  %r = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %0, <4 x float> %1)
  ret <WIDTH x float> %r
}

define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
                                            <WIDTH x float>) nounwind readnone alwaysinline {
  %r = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %0, <4 x float> %1)
  ret <WIDTH x float> %r
}

declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  %r = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %r
}

define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  %r = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %r
}

define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  %r = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %r
}

define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  %r = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %r
}

;; sqrt/rsqrt/rcp

declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  %x0 = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %d)
  %x0_nr = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %d, <4 x float> %x0)
  %x1 = fmul <4 x float> %x0, %x0_nr
  %x1_nr = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %d, <4 x float> %x1)
  %x2 = fmul <4 x float> %x1, %x1_nr
  ret <4 x float> %x2
}

declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  %x0 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %d)
  %x0_2 = fmul <4 x float> %x0, %x0
  %x0_nr = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %d, <4 x float> %x0_2)
  %x1 = fmul <4 x float> %x0, %x0_nr
  %x1_2 = fmul <4 x float> %x1, %x1
  %x1_nr = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %d, <4 x float> %x1_2)
  %x2 = fmul <4 x float> %x1, %x1_nr
  ret <4 x float> %x2
}

define float @__rsqrt_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %vr = call <4 x float> @__rsqrt_varying_float(<4 x float> %vs)
  %r = extractelement <4 x float> %vr, i32 0
  ret float %r
}

define float @__rcp_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %vr = call <4 x float> @__rcp_varying_float(<4 x float> %vs)
  %r = extractelement <4 x float> %vr, i32 0
  ret float %r
}

declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline {
  %result = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %0)
  ;; this returns nan for v=0, which is undesirable..
  ;; %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
  ;; %result = fmul <4 x float> %rsqrt, %0
  ret <4 x float> %result
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone alwaysinline {
  %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %0)
  ret <4 x double> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions

define i64 @__movmsk(<4 x MASK>) nounwind readnone alwaysinline {
  %and_mask = and <4 x MASK> %0, <MASK 1, MASK 2, MASK 4, MASK 8>
  %v01 = shufflevector <4 x i32> %and_mask, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %v23 = shufflevector <4 x i32> %and_mask, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vor = or <2 x i32> %v01, %v23
  %v0 = extractelement <2 x i32> %vor, i32 0
  %v1 = extractelement <2 x i32> %vor, i32 1
  %v = or i32 %v0, %v1
  %mask64 = zext i32 %v to i64
  ret i64 %mask64
}

define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
  %v01 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %v23 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vor = or <2 x i32> %v01, %v23
  %v0 = extractelement <2 x i32> %vor, i32 0
  %v1 = extractelement <2 x i32> %vor, i32 1
  %v = or i32 %v0, %v1
  %cmp = icmp ne i32 %v, 0
  ret i1 %cmp
}

define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
  %v01 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %v23 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vand = and <2 x i32> %v01, %v23
  %v0 = extractelement <2 x i32> %vand, i32 0
  %v1 = extractelement <2 x i32> %vand, i32 1
  %v = and i32 %v0, %v1
  %cmp = icmp ne i32 %v, 0
  ret i1 %cmp
}

define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
  %any = call i1 @__any(<4 x i32> %0)
  %none = icmp eq i1 %any, 0
  ret i1 %none
}

;; $1: scalar type
;; $2: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>)
;; $3: scalar reduce function

define(`neon_reduce', `
  %v0 = shufflevector <4 x $1> %0, <4 x $1> undef, <2 x i32> <i32 0, i32 1>
  %v1 = shufflevector <4 x $1> %0, <4 x $1> undef, <2 x i32> <i32 2, i32 3>
  %vh = call <2 x $1> $2(<2 x $1> %v0, <2 x $1> %v1)
  %vh0 = extractelement <2 x $1> %vh, i32 0
  %vh1 = extractelement <2 x $1> %vh, i32 1
  %r = call $1 $3($1 %vh0, $1 %vh1)
  ret $1 %r
')

declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @add_f32(float, float) nounwind readnone alwaysinline {
  %r = fadd float %0, %1
  ret float %r
}

define float @__reduce_add_float(<4 x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
}

declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @min_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp olt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
}

define float @__reduce_min_float(<4 x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
}

declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @max_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp ugt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
}

define float @__reduce_max_float(<4 x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}

declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone alwaysinline {
  %v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8)
  %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16)
  %a0 = extractelement <2 x i32> %a32, i32 0
  %a1 = extractelement <2 x i32> %a32, i32 1
  %r = add i32 %a0, %a1
  %r16 = trunc i32 %r to i16
  ret i16 %r16
}

declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone

define i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone alwaysinline {
  %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0)
  %a0 = extractelement <2 x i32> %a32, i32 0
  %a1 = extractelement <2 x i32> %a32, i32 1
  %r = add i32 %a0, %a1
  ret i32 %r
}

declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0)
  %a0 = extractelement <2 x i64> %a64, i32 0
  %a1 = extractelement <2 x i64> %a64, i32 1
  %r = add i64 %a0, %a1
  ret i64 %r
}

declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @min_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp slt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
}

declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @max_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp sgt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
}

declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @min_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ult i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
}

declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @max_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ugt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
}

define double @__reduce_add_double(<4 x double>) nounwind readnone alwaysinline {
  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
          <2 x i32> <i32 0, i32 1>
  %v1 = shufflevector <4 x double> %0, <4 x double> undef,
          <2 x i32> <i32 2, i32 3>
  %sum = fadd <2 x double> %v0, %v1
  %e0 = extractelement <2 x double> %sum, i32 0
  %e1 = extractelement <2 x double> %sum, i32 1
  %m = fadd double %e0, %e1
  ret double %m
}

define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
  reduce4(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
  reduce4(double, @__max_varying_double, @__max_uniform_double)
}

define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
  %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
          <2 x i32> <i32 0, i32 1>
  %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
          <2 x i32> <i32 2, i32 3>
  %sum = add <2 x i64> %v0, %v1
  %e0 = extractelement <2 x i64> %sum, i32 0
  %e1 = extractelement <2 x i64> %sum, i32 1
  %m = add i64 %e0, %e1
  ret i64 %m
}

define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16

declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone alwaysinline {
  %r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
  ret <4 x i8> %r
}

declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone alwaysinline {
  %r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
  ret <4 x i8> %r
}

declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone alwaysinline {
  %r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
  ret <4 x i8> %r
}

declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone

define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone alwaysinline {
  %r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
  ret <4 x i8> %r
}

declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone alwaysinline {
  %r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
  ret <4 x i16> %r
}

declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone alwaysinline {
  %r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
  ret <4 x i16> %r
}

declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone alwaysinline {
  %r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
  ret <4 x i16> %r
}

declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone alwaysinline {
  %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
  ret <4 x i16> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()
saturation_arithmetic()
593  builtins/target-neon-8.ll  Normal file
@@ -0,0 +1,593 @@
;;
;; target-neon-8.ll
;;
;;  Copyright(c) 2013-2015 Google, Inc.
;;
;;  All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;     * Redistributions of source code must retain the above copyright
;;       notice, this list of conditions and the following disclaimer.
;;
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;
;;     * Neither the name of Matt Pharr nor the names of its
;;       contributors may be used to endorse or promote products derived from
;;       this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`16')
define(`MASK',`i8')

include(`util.m4')
include(`target-neon-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone alwaysinline {
|
||||
unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v)
|
||||
ret <16 x float> %r
|
||||
}
|
||||
|
||||
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone alwaysinline {
|
||||
unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v)
|
||||
ret <16 x i16> %r
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; math
|
||||
|
||||
;; round/floor/ceil
|
||||
|
||||
;; FIXME: grabbed these from the sse2 target, which does not have native
|
||||
;; instructions for these. Is there a better approach for NEON?
|
||||
|
||||
define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
%float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32>
|
||||
%bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i,
|
||||
<i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
|
||||
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
|
||||
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648,
|
||||
i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
|
||||
%bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float>
|
||||
%binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i,
|
||||
<float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
|
||||
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
|
||||
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06,
|
||||
float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
|
||||
%binop21.i = fadd <16 x float> %binop.i,
|
||||
<float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
|
||||
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
|
||||
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06,
|
||||
float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
|
||||
%float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32>
|
||||
%bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float>
|
||||
ret <16 x float> %int_to_float_bitcast.i.i.i
|
||||
}
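
;; How the constants above work (an explanatory sketch, not part of the
;; original file): -2147483648 is 0x80000000 (the sign bit) and
;; 8.388608e+06 is 2^23, so adding and then subtracting 2^23 from |x|
;; forces the FPU to round away the fractional bits. In C-like pseudocode,
;; using hypothetical intbits()/floatbits() bit-casts:
;;
;;   float Round(float x) {
;;       unsigned int sign = intbits(x) & 0x80000000u;   /* save sign     */
;;       float ax = floatbits(intbits(x) ^ sign);        /* ax = |x|      */
;;       ax = (ax + 8388608.0f) - 8388608.0f;            /* round at 2^23 */
;;       return floatbits(intbits(ax) ^ sign);           /* restore sign  */
;;   }
;;
;; The same construction is spelled out as ISPC source in the
;; target-sse2-common.ll diff further below.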

define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
  %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind
  %bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0
  %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32>
  %bitop.i = and <16 x i32> %val_to_boolvec32.i,
      <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
       i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
       i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432,
       i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
  %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float>
  %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
  ret <16 x float> %binop.i
}

define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
  %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind
  %bincmp.i = fcmp olt <16 x float> %calltmp.i, %0
  %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32>
  %bitop.i = and <16 x i32> %val_to_boolvec32.i,
      <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
       i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
       i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216,
       i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
  %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float>
  %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
  ret <16 x float> %binop.i
}

;; FIXME: rounding doubles and double vectors needs to be implemented
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max

declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__max_varying_float(<WIDTH x float>,
                                            <WIDTH x float>) nounwind readnone alwaysinline {
  binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
  ret <WIDTH x float> %r
}

define <WIDTH x float> @__min_varying_float(<WIDTH x float>,
                                            <WIDTH x float>) nounwind readnone alwaysinline {
  binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1)
  ret <WIDTH x float> %r
}

declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

define <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1)
  ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1)
  ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1)
  ret <WIDTH x i32> %r
}

define <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone alwaysinline {
  binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1)
  ret <WIDTH x i32> %r
}

;; sqrt/rsqrt/rcp

declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__rcp_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d)
  binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0)
  %x1 = fmul <WIDTH x float> %x0, %x0_nr
  binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1)
  %x2 = fmul <WIDTH x float> %x1, %x1_nr
  ret <WIDTH x float> %x2
}
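
;; Background on the two binary4to16 calls above (a sketch; not from the
;; original file): Newton-Raphson for a reciprocal refines an estimate x
;; of 1/d via x' = x * (2 - d * x), and NEON's vrecps(d, x) computes the
;; (2 - d * x) factor in a single instruction. Scalar equivalent, where
;; rcp_estimate() is a hypothetical stand-in for the vrecpe estimate
;; (roughly 8 good bits):
;;
;;   float rcp(float d) {
;;       float x = rcp_estimate(d);
;;       x = x * (2.0f - d * x);    /* first N-R step  */
;;       x = x * (2.0f - d * x);    /* second N-R step */
;;       return x;
;;   }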

declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %d) nounwind readnone alwaysinline {
  unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d)
  %x0_2 = fmul <WIDTH x float> %x0, %x0
  binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2)
  %x1 = fmul <WIDTH x float> %x0, %x0_nr
  %x1_2 = fmul <WIDTH x float> %x1, %x1
  binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2)
  %x2 = fmul <WIDTH x float> %x1, %x1_nr
  ret <WIDTH x float> %x2
}
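
;; The reciprocal square root uses the matching recurrence
;; x' = x * (3 - d * x^2) / 2; vrsqrts(d, x*x) returns the whole
;; (3 - d * x^2) / 2 factor, which is why each step above is just one
;; multiply by the vrsqrts result. Scalar sketch (rsqrt_estimate() is a
;; hypothetical stand-in for vrsqrte):
;;
;;   float rsqrt(float d) {
;;       float x = rsqrt_estimate(d);
;;       x = x * (3.0f - d * x * x) * 0.5f;   /* first N-R step  */
;;       x = x * (3.0f - d * x * x) * 0.5f;   /* second N-R step */
;;       return x;
;;   }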

define float @__rsqrt_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                      i32 undef, i32 undef, i32 undef, i32 undef,
                      i32 undef, i32 undef, i32 undef, i32 undef,
                      i32 undef, i32 undef, i32 undef, i32 undef>
  %vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs)
  %r = extractelement <16 x float> %vr, i32 0
  ret float %r
}

define float @__rcp_uniform_float(float) nounwind readnone alwaysinline {
  %v1 = bitcast float %0 to <1 x float>
  %vs = shufflevector <1 x float> %v1, <1 x float> undef,
          <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                      i32 undef, i32 undef, i32 undef, i32 undef,
                      i32 undef, i32 undef, i32 undef, i32 undef,
                      i32 undef, i32 undef, i32 undef, i32 undef>
  %vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs)
  %r = extractelement <16 x float> %vr, i32 0
  ret float %r
}

declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline {
  unary4to16(result, float, @llvm.sqrt.v4f32, %0)
  ;; this returns nan for v=0, which is undesirable...
  ;; %rsqrt = call <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %0)
  ;; %result = fmul <4 x float> %rsqrt, %0
  ret <16 x float> %result
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

define <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone alwaysinline {
  unary4to16(r, double, @llvm.sqrt.v4f64, %0)
  ret <WIDTH x double> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions

define i64 @__movmsk(<WIDTH x MASK>) nounwind readnone alwaysinline {
  %and_mask = and <WIDTH x i8> %0,
      <i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128,
       i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128>
  %v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask)
  %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8)
  %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4)
  %va = extractelement <2 x i64> %v2, i32 0
  %vb = extractelement <2 x i64> %v2, i32 1
  %vbshift = shl i64 %vb, 8
  %v = or i64 %va, %vbshift
  ret i64 %v
}
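
;; Why this produces the usual movmsk bit pattern: each mask lane is
;; all-ones or all-zero, so ANDing lane i with 1 << (i mod 8) leaves one
;; distinct bit per lane within each 8-lane half. The three vpaddlu
;; widening pairwise adds then sum each half without any bits colliding,
;; and the high half is shifted left by 8 before being ORed in. For
;; example, with only lanes 0 and 9 set, the halves sum to 1 and 2,
;; giving 1 | (2 << 8) = 0x201, i.e. bits 0 and 9.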

define i1 @__any(<WIDTH x MASK>) nounwind readnone alwaysinline {
  v16tov8(MASK, %0, %v8a, %v8b)
  %vor8 = or <8 x MASK> %v8a, %v8b
  %v16 = sext <8 x i8> %vor8 to <8 x i16>
  v8tov4(i16, %v16, %v16a, %v16b)
  %vor16 = or <4 x i16> %v16a, %v16b
  %v32 = sext <4 x i16> %vor16 to <4 x i32>
  v4tov2(i32, %v32, %v32a, %v32b)
  %vor32 = or <2 x i32> %v32a, %v32b
  %v0 = extractelement <2 x i32> %vor32, i32 0
  %v1 = extractelement <2 x i32> %vor32, i32 1
  %v = or i32 %v0, %v1
  %cmp = icmp ne i32 %v, 0
  ret i1 %cmp
}

define i1 @__all(<WIDTH x MASK>) nounwind readnone alwaysinline {
  v16tov8(MASK, %0, %v8a, %v8b)
  %vand8 = and <8 x MASK> %v8a, %v8b
  %v16 = sext <8 x i8> %vand8 to <8 x i16>
  v8tov4(i16, %v16, %v16a, %v16b)
  %vand16 = and <4 x i16> %v16a, %v16b
  %v32 = sext <4 x i16> %vand16 to <4 x i32>
  v4tov2(i32, %v32, %v32a, %v32b)
  %vand32 = and <2 x i32> %v32a, %v32b
  %v0 = extractelement <2 x i32> %vand32, i32 0
  %v1 = extractelement <2 x i32> %vand32, i32 1
  %v = and i32 %v0, %v1
  %cmp = icmp ne i32 %v, 0
  ret i1 %cmp
}

define i1 @__none(<WIDTH x MASK>) nounwind readnone alwaysinline {
  %any = call i1 @__any(<WIDTH x MASK> %0)
  %none = icmp eq i1 %any, 0
  ret i1 %none
}
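
;; __any and __all fold the mask in half repeatedly (16 -> 8 -> 4 -> 2 ->
;; scalar) with OR and AND respectively; since every lane is all-ones or
;; all-zero, the sign extensions appear to be there only to keep element
;; widths matched for the narrowing helpers, ending at a 32-bit compare.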

;; $1: scalar type
;; $2: vector/vector reduce function (2 x <WIDTH x vec> -> <WIDTH x vec>)
;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>)
;; $4: scalar reduce function

define(`neon_reduce', `
  v16tov8($1, %0, %va, %vb)
  %va_16 = shufflevector <8 x $1> %va, <8 x $1> undef,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 undef, i32 undef, i32 undef, i32 undef,
                  i32 undef, i32 undef, i32 undef, i32 undef>
  %vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 undef, i32 undef, i32 undef, i32 undef,
                  i32 undef, i32 undef, i32 undef, i32 undef>
  %v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16)

  %v8a = shufflevector <16 x $1> %v8, <16 x $1> undef,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3,
                  i32 undef, i32 undef, i32 undef, i32 undef,
                  i32 undef, i32 undef, i32 undef, i32 undef,
                  i32 undef, i32 undef, i32 undef, i32 undef>
  %v8b = shufflevector <16 x $1> %v8, <16 x $1> undef,
      <16 x i32> <i32 4, i32 5, i32 6, i32 7,
                  i32 undef, i32 undef, i32 undef, i32 undef,
                  i32 undef, i32 undef, i32 undef, i32 undef,
                  i32 undef, i32 undef, i32 undef, i32 undef>

  %v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b)

  %vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef,
      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  v4tov2($1, %vfirst_4, %v0, %v1)
  %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1)
  %vh0 = extractelement <2 x $1> %vh, i32 0
  %vh1 = extractelement <2 x $1> %vh, i32 1
  %r = call $1 $4($1 %vh0, $1 %vh1)
  ret $1 %r
')
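
;; For instance, the instantiation
;;   neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
;; used below reduces 16 lanes to 8 and then to 4 with the full-width
;; vector op ($2), pairwise-combines the remaining two <2 x float> halves
;; with vpadd ($3), and finishes with a single scalar add ($4).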

declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @add_f32(float, float) nounwind readnone alwaysinline {
  %r = fadd float %0, %1
  ret float %r
}

define internal <WIDTH x float> @__add_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone alwaysinline {
  %r = fadd <WIDTH x float> %0, %1
  ret <WIDTH x float> %r
}

define float @__reduce_add_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32)
}

declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @min_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp olt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
}

define float @__reduce_min_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32)
}

declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone

define internal float @max_f32(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp ugt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
}

define float @__reduce_max_float(<WIDTH x float>) nounwind readnone alwaysinline {
  neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}

declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

define i64 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone alwaysinline {
  %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0)
  %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16)
  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
  %a0 = extractelement <2 x i64> %a64, i32 0
  %a1 = extractelement <2 x i64> %a64, i32 1
  %r = add i64 %a0, %a1
  ret i64 %r
}
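
;; vpaddlu is an unsigned widening pairwise add, so the chain
;; <16 x i8> -> <8 x i16> -> <4 x i32> -> <2 x i64> sums all 16 byte lanes
;; with no possibility of overflow at any step; the final scalar add
;; combines the two i64 halves.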

define i64 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone alwaysinline {
  v16tov8(i16, %0, %va, %vb)
  %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va)
  %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb)
  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32)
  %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32)
  %sum = add <2 x i64> %a64, %b64
  %a0 = extractelement <2 x i64> %sum, i32 0
  %a1 = extractelement <2 x i64> %sum, i32 1
  %r = add i64 %a0, %a1
  ret i64 %r
}

define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  v16tov4(i32, %0, %va, %vb, %vc, %vd)
  %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va)
  %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb)
  %c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc)
  %d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd)
  %ab = add <2 x i64> %a64, %b64
  %cd = add <2 x i64> %c64, %d64
  %sum = add <2 x i64> %ab, %cd
  %a0 = extractelement <2 x i64> %sum, i32 0
  %a1 = extractelement <2 x i64> %sum, i32 1
  %r = add i64 %a0, %a1
  ret i64 %r
}

declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @min_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp slt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32)
}

declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @max_si32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp sgt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32)
}

declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @min_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ult i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpminu.v2i32, @min_ui32)
}

declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

define internal i32 @max_ui32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ugt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone alwaysinline {
  neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxu.v2i32, @max_ui32)
}

define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
  %r = fadd double %0, %1
  ret double %r
}

define internal <WIDTH x double> @__add_varying_double(<WIDTH x double>, <WIDTH x double>) nounwind readnone alwaysinline {
  %r = fadd <WIDTH x double> %0, %1
  ret <WIDTH x double> %r
}

define double @__reduce_add_double(<WIDTH x double>) nounwind readnone alwaysinline {
  reduce16(double, @__add_varying_double, @__add_uniform_double)
}

define double @__reduce_min_double(<WIDTH x double>) nounwind readnone alwaysinline {
  reduce16(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<WIDTH x double>) nounwind readnone alwaysinline {
  reduce16(double, @__max_varying_double, @__max_uniform_double)
}

define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %r = add i64 %0, %1
  ret i64 %r
}

define internal <WIDTH x i64> @__add_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone alwaysinline {
  %r = add <WIDTH x i64> %0, %1
  ret <WIDTH x i64> %r
}

define i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
}

define i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone alwaysinline {
  %r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
  ret <16 x i8> %r
}

declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone alwaysinline {
  %r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
  ret <16 x i8> %r
}

declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone alwaysinline {
  %r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
  ret <16 x i8> %r
}

declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone alwaysinline {
  %r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
  ret <16 x i8> %r
}

declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone alwaysinline {
  v16tov8(i16, %0, %a0, %b0)
  v16tov8(i16, %1, %a1, %b1)
  %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
  v8tov16(i16, %r0, %r1, %r)
  ret <16 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone alwaysinline {
  v16tov8(i16, %0, %a0, %b0)
  v16tov8(i16, %1, %a1, %b1)
  %r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %r1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
  v8tov16(i16, %r0, %r1, %r)
  ret <16 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone alwaysinline {
  v16tov8(i16, %0, %a0, %b0)
  v16tov8(i16, %1, %a1, %b1)
  %r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
  v8tov16(i16, %r0, %r1, %r)
  ret <16 x i16> %r
}

declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone alwaysinline {
  v16tov8(i16, %0, %a0, %b0)
  v16tov8(i16, %1, %a1, %b1)
  %r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
  v8tov16(i16, %r0, %r1, %r)
  ret <16 x i16> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()
saturation_arithmetic()
354	builtins/target-neon-common.ll	Normal file
@@ -0,0 +1,354 @@
;;
;; target-neon-common.ll
;;
;; Copyright(c) 2013-2015 Google, Inc.
;;
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Matt Pharr nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32"

stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
define_shuffles()
aossoa()
ctlztz()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone

define float @__half_to_float_uniform(i16 %v) nounwind readnone alwaysinline {
  %v1 = bitcast i16 %v to <1 x i16>
  %vec = shufflevector <1 x i16> %v1, <1 x i16> undef,
           <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec)
  %r = extractelement <4 x float> %h, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline {
  %v1 = bitcast float %v to <1 x float>
  %vec = shufflevector <1 x float> %v1, <1 x float> undef,
           <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec)
  %r = extractelement <4 x i16> %h, i32 0
  ret i16 %r
}
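
;; Both uniform conversions reuse the 4-wide NEON intrinsics by
;; broadcasting the scalar across a <4 x> vector and extracting lane 0 of
;; the converted result. C-like sketch (half4_to_float4() is a
;; hypothetical stand-in for vcvthf2fp):
;;
;;   float half_to_float(unsigned short h) {
;;       unsigned short v[4] = { h, h, h, h };   /* broadcast scalar */
;;       float r[4];
;;       half4_to_float4(v, r);                  /* 4-wide convert   */
;;       return r[0];                            /* take lane 0      */
;;   }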

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math

declare i32 @llvm.arm.get.fpscr() nounwind
declare void @llvm.arm.set.fpscr(i32) nounwind

define void @__fastmath() nounwind alwaysinline {
  %x = call i32 @llvm.arm.get.fpscr()
  ; Turn on FTZ (bit 24) and default NaN (bit 25)
  %y = or i32 %x, 50331648
  call void @llvm.arm.set.fpscr(i32 %y)
  ret void
}
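
;; Where 50331648 comes from: 0x03000000 = (1 << 24) | (1 << 25).
;; Bit 24 is FPSCR.FZ (flush denormals to zero) and bit 25 is FPSCR.DN
;; (propagate the default NaN), matching the comment in the body above.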

;; round/floor/ceil

;; FIXME: grabbed these from the sse2 target, which does not have native
;; instructions for these. Is there a better approach for NEON?

define float @__round_uniform_float(float) nounwind readonly alwaysinline {
  %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
  %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
  %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
  %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
  %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
  %binop21.i = fadd float %binop.i, -8.388608e+06
  %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
  %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
  %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
  ret float %int_to_float_bitcast.i.i.i
}

define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
  %bincmp.i = fcmp ogt float %calltmp.i, %0
  %selectexpr.i = sext i1 %bincmp.i to i32
  %bitop.i = and i32 %selectexpr.i, -1082130432
  %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
  %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
  ret float %binop.i
}

define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
  %bincmp.i = fcmp olt float %calltmp.i, %0
  %selectexpr.i = sext i1 %bincmp.i to i32
  %bitop.i = and i32 %selectexpr.i, 1065353216
  %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
  %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
  ret float %binop.i
}

;; FIXME: rounding doubles and double vectors needs to be implemented
declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max

define float @__max_uniform_float(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp ugt float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
}

define float @__min_uniform_float(float, float) nounwind readnone alwaysinline {
  %cmp = fcmp ult float %0, %1
  %r = select i1 %cmp, float %0, float %1
  ret float %r
}

define i32 @__min_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp slt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__max_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp sgt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__min_uniform_uint32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ult i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i32 @__max_uniform_uint32(i32, i32) nounwind readnone alwaysinline {
  %cmp = icmp ugt i32 %0, %1
  %r = select i1 %cmp, i32 %0, i32 %1
  ret i32 %r
}

define i64 @__min_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %cmp = icmp slt i64 %0, %1
  %r = select i1 %cmp, i64 %0, i64 %1
  ret i64 %r
}

define i64 @__max_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %cmp = icmp sgt i64 %0, %1
  %r = select i1 %cmp, i64 %0, i64 %1
  ret i64 %r
}

define i64 @__min_uniform_uint64(i64, i64) nounwind readnone alwaysinline {
  %cmp = icmp ult i64 %0, %1
  %r = select i1 %cmp, i64 %0, i64 %1
  ret i64 %r
}

define i64 @__max_uniform_uint64(i64, i64) nounwind readnone alwaysinline {
  %cmp = icmp ugt i64 %0, %1
  %r = select i1 %cmp, i64 %0, i64 %1
  ret i64 %r
}

define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
  %cmp = fcmp olt double %0, %1
  %r = select i1 %cmp, double %0, double %1
  ret double %r
}

define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
  %cmp = fcmp ogt double %0, %1
  %r = select i1 %cmp, double %0, double %1
  ret double %r
}

define <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone alwaysinline {
  %m = icmp slt <WIDTH x i64> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
  ret <WIDTH x i64> %r
}

define <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone alwaysinline {
  %m = icmp sgt <WIDTH x i64> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
  ret <WIDTH x i64> %r
}

define <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone alwaysinline {
  %m = icmp ult <WIDTH x i64> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
  ret <WIDTH x i64> %r
}

define <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone alwaysinline {
  %m = icmp ugt <WIDTH x i64> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x i64> %0, <WIDTH x i64> %1
  ret <WIDTH x i64> %r
}

define <WIDTH x double> @__min_varying_double(<WIDTH x double>,
                                              <WIDTH x double>) nounwind readnone alwaysinline {
  %m = fcmp olt <WIDTH x double> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
  ret <WIDTH x double> %r
}

define <WIDTH x double> @__max_varying_double(<WIDTH x double>,
                                              <WIDTH x double>) nounwind readnone alwaysinline {
  %m = fcmp ogt <WIDTH x double> %0, %1
  %r = select <WIDTH x i1> %m, <WIDTH x double> %0, <WIDTH x double> %1
  ret <WIDTH x double> %r
}

;; sqrt/rsqrt/rcp

declare float @llvm.sqrt.f32(float)

define float @__sqrt_uniform_float(float) nounwind readnone alwaysinline {
  %r = call float @llvm.sqrt.f32(float %0)
  ret float %r
}

declare double @llvm.sqrt.f64(double)

define double @__sqrt_uniform_double(double) nounwind readnone alwaysinline {
  %r = call double @llvm.sqrt.f64(double %0)
  ret double %r
}

;; bit ops

declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i64 @llvm.ctpop.i64(i64) nounwind readnone

define i32 @__popcnt_int32(i32) nounwind readnone alwaysinline {
  %v = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %v
}

define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
  %v = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)

gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()

define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture %ptr, <WIDTH x i8> %new,
                                     <WIDTH x MASK> %mask) nounwind alwaysinline {
  %old = load PTR_OP_ARGS(`<WIDTH x i8> ') %ptr
  %mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
  %result = select <WIDTH x i1> %mask1, <WIDTH x i8> %new, <WIDTH x i8> %old
  store <WIDTH x i8> %result, <WIDTH x i8> * %ptr
  ret void
}

define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture %ptr, <WIDTH x i16> %new,
                                      <WIDTH x MASK> %mask) nounwind alwaysinline {
  %old = load PTR_OP_ARGS(`<WIDTH x i16> ') %ptr
  %mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
  %result = select <WIDTH x i1> %mask1, <WIDTH x i16> %new, <WIDTH x i16> %old
  store <WIDTH x i16> %result, <WIDTH x i16> * %ptr
  ret void
}

define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture %ptr, <WIDTH x i32> %new,
                                      <WIDTH x MASK> %mask) nounwind alwaysinline {
  %old = load PTR_OP_ARGS(`<WIDTH x i32> ') %ptr
  %mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
  %result = select <WIDTH x i1> %mask1, <WIDTH x i32> %new, <WIDTH x i32> %old
  store <WIDTH x i32> %result, <WIDTH x i32> * %ptr
  ret void
}

define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
                                      <WIDTH x i64> %new, <WIDTH x MASK> %mask) nounwind alwaysinline {
  %old = load PTR_OP_ARGS(`<WIDTH x i64> ') %ptr
  %mask1 = trunc <WIDTH x MASK> %mask to <WIDTH x i1>
  %result = select <WIDTH x i1> %mask1, <WIDTH x i64> %new, <WIDTH x i64> %old
  store <WIDTH x i64> %result, <WIDTH x i64> * %ptr
  ret void
}
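
;; Each blend store above is a read-modify-write: load the old vector,
;; select the new values where the mask is on, and store the full vector
;; back. A sketch of the semantics for one 16-wide i32 store:
;;
;;   void masked_store_blend(int p[16], const int new_vals[16],
;;                           const bool mask[16]) {
;;       for (int i = 0; i < 16; ++i)
;;           p[i] = mask[i] ? new_vals[i] : p[i];  /* all lanes rewritten */
;;   }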

;; yuck. We need declarations of these, even though we shouldn't ever
;; actually generate calls to them for the NEON target...

include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

packed_load_and_store(4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetch

define_prefetches()
declare_nvptx()
2371	builtins/target-nvptx.ll	Normal file
File diff suppressed because it is too large

94	builtins/target-skx.ll	Normal file
@@ -0,0 +1,94 @@
;; Copyright (c) 2016, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`WIDTH',`16')

ifelse(LLVM_VERSION, LLVM_3_8,
         `include(`target-avx512-common.ll')',
       LLVM_VERSION, LLVM_3_9,
         `include(`target-avx512-common.ll')',
       LLVM_VERSION, LLVM_4_0,
         `include(`target-avx512-common.ll')',
       LLVM_VERSION, LLVM_5_0,
         `include(`target-avx512-common.ll')'
      )

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp, rsqrt

define(`rcp_rsqrt_varying_float_skx',`
declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
  %call = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %0, <16 x float> undef, i16 -1)
  ;; do one Newton-Raphson iteration to improve precision
  ;;  float iv = __rcp_v(v);
  ;;  return iv * (2. - v * iv);
  %v_iv = fmul <16 x float> %0`,' %call
  %two_minus = fsub <16 x float> <float 2.`,' float 2.`,' float 2.`,' float 2.`,'
                                  float 2.`,' float 2.`,' float 2.`,' float 2.`,'
                                  float 2.`,' float 2.`,' float 2.`,' float 2.`,'
                                  float 2.`,' float 2.`,' float 2.`,' float 2.>`,' %v_iv
  %iv_mul = fmul <16 x float> %call`,' %two_minus
  ret <16 x float> %iv_mul
}
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>`,' <16 x float>`,' i16) nounwind readnone
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
  %is = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %v`,' <16 x float> undef`,' i16 -1)
  ; Newton-Raphson iteration to improve precision
  ;  float is = __rsqrt_v(v);
  ;  return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul <16 x float> %v`,' %is
  %v_is_is = fmul <16 x float> %v_is`,' %is
  %three_sub = fsub <16 x float> <float 3.`,' float 3.`,' float 3.`,' float 3.`,'
                                  float 3.`,' float 3.`,' float 3.`,' float 3.`,'
                                  float 3.`,' float 3.`,' float 3.`,' float 3.`,'
                                  float 3.`,' float 3.`,' float 3.`,' float 3.>`,' %v_is_is
  %is_mul = fmul <16 x float> %is`,' %three_sub
  %half_scale = fmul <16 x float> <float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
                                   float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
                                   float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
                                   float 0.5`,' float 0.5`,' float 0.5`,' float 0.5>`,' %is_mul
  ret <16 x float> %half_scale
}
')
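
;; Unlike the NEON targets earlier in this diff, which run two
;; Newton-Raphson steps, one step suffices here: the AVX-512 rcp14 and
;; rsqrt14 instructions already provide roughly 14 bits of relative
;; accuracy, so a single refinement gets close to full single precision.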

ifelse(LLVM_VERSION, LLVM_3_8,
         rcp_rsqrt_varying_float_skx(),
       LLVM_VERSION, LLVM_3_9,
         rcp_rsqrt_varying_float_skx(),
       LLVM_VERSION, LLVM_4_0,
         rcp_rsqrt_varying_float_skx(),
       LLVM_VERSION, LLVM_5_0,
         rcp_rsqrt_varying_float_skx()
      )

;;saturation_arithmetic_novec()
277	builtins/target-sse2-common.ll	Normal file
@@ -0,0 +1,277 @@
;; Copyright (c) 2010-2015, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

ctlztz()
define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  ; do the rcpss call
  %vecval = insertelement <4 x float> undef, float %0, i32 0
  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
  %scall = extractelement <4 x float> %call, i32 0

  ; do one N-R iteration to improve precision, as above
  %v_iv = fmul float %0, %scall
  %two_minus = fsub float 2., %v_iv
  %iv_mul = fmul float %scall, %two_minus
  ret float %iv_mul
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt

declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  ;  uniform float is = extract(__rsqrt_u(v), 0);
  %v = insertelement <4 x float> undef, float %0, i32 0
  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
  %is = extractelement <4 x float> %vis, i32 0

  ; Newton-Raphson iteration to improve precision
  ;  return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul float %0, %is
  %v_is_is = fmul float %v_is, %is
  %three_sub = fsub float 3., %v_is_is
  %is_mul = fmul float %is, %three_sub
  %half_scale = fmul float 0.5, %is_mul
  ret float %half_scale
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt

declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
  ret float %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode

declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

define void @__fastmath() nounwind alwaysinline {
  %ptr = alloca i32
  %ptr8 = bitcast i32 * %ptr to i8 *
  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
  %oldval = load PTR_OP_ARGS(`i32 ') %ptr

  ; turn on DAZ (64)/FTZ (32768) -> 32832
  %update = or i32 %oldval, 32832
  store i32 %update, i32 *%ptr
  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
  ret void
}
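
;; Where 32832 comes from: 0x8040 = (1 << 15) | (1 << 6). Bit 15 is
;; MXCSR.FZ (flush to zero) and bit 6 is MXCSR.DAZ (denormals are zero),
;; matching the inline comment above.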

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
  ret float %ret
}

define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
  ret float %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone

define double @__sqrt_uniform_double(double) nounwind alwaysinline {
  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
  ret double %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

define double @__min_uniform_double(double, double) nounwind readnone {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
  ret double %ret
}

define double @__max_uniform_double(double, double) nounwind readnone {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
  ret double %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; There are no rounding instructions in SSE2, so we have to emulate
;; the functionality with multiple instructions...

; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
;    unsigned int sign = signbits(x);
;    unsigned int ix = intbits(x);
;    ix ^= sign;
;    x = floatbits(ix);
;    x += 0x1.0p23f;
;    x -= 0x1.0p23f;
;    ix = intbits(x);
;    ix ^= sign;
;    x = floatbits(ix);
;    return x;
;}

define float @__round_uniform_float(float) nounwind readonly alwaysinline {
  %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
  %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
  %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
  %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
  %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
  %binop21.i = fadd float %binop.i, -8.388608e+06
  %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
  %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
  %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
  ret float %int_to_float_bitcast.i.i.i
}

;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...

;export float Floor(float x) {
;    float y = Round(x);
;    unsigned int cmp = y > x ? 0xffffffff : 0;
;    float delta = -1.f;
;    unsigned int idelta = intbits(delta);
;    idelta &= cmp;
;    delta = floatbits(idelta);
;    return y + delta;
;}

define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
  %bincmp.i = fcmp ogt float %calltmp.i, %0
  %selectexpr.i = sext i1 %bincmp.i to i32
  %bitop.i = and i32 %selectexpr.i, -1082130432
  %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
  %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
  ret float %binop.i
}

;; And here is the code we compiled to get the __ceil* functions below
;
;export uniform float Ceil(uniform float x) {
;    uniform float y = Round(x);
;    uniform int yltx = y < x ? 0xffffffff : 0;
;    uniform float delta = 1.f;
;    uniform int idelta = intbits(delta);
;    idelta &= yltx;
;    delta = floatbits(idelta);
;    return y + delta;
;}

define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
  %bincmp.i = fcmp olt float %calltmp.i, %0
  %selectexpr.i = sext i1 %bincmp.i to i32
  %bitop.i = and i32 %selectexpr.i, 1065353216
  %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
  %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
  ret float %binop.i
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare double @round(double)
declare double @floor(double)
declare double @ceil(double)

define double @__round_uniform_double(double) nounwind readonly alwaysinline {
  %r = call double @round(double %0)
  ret double %r
}

define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  %r = call double @floor(double %0)
  ret double %r
}

define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  %r = call double @ceil(double %0)
  ret double %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)

define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %val = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %val
}

define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
  %val = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %val
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define_avgs()

declare_nvptx()
664	builtins/target-sse2-x2.ll	Normal file
@@ -0,0 +1,664 @@
|
||||
;; Copyright (c) 2010-2015, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
;; This file defines the target for "double-pumped" SSE2, i.e. running
|
||||
;; with 8-wide vectors
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; standard 8-wide definitions from m4 macros
|
||||
|
||||
define(`WIDTH',`8')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
saturation_arithmetic()
|
||||
|
||||
include(`target-sse2-common.ll')
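;; The macro invocations above (stdlib_core() and friends) are expanded by m4
;; at build time into the IR for the core standard-library routines. WIDTH
;; and MASK parameterize that expansion, so a macro body written in terms of
;; <WIDTH x float> becomes <8 x float> for this double-pumped target.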
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
|
||||
; do one Newton-Raphson iteration to improve precision
|
||||
%v_iv = fmul <8 x float> %0, %call
|
||||
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <8 x float> %call, %two_minus
|
||||
ret <8 x float> %iv_mul
|
||||
}
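;; unary4to8() splits the 8-wide vector into two 4-wide halves, applies the
;; 4-wide SSE intrinsic to each, and concatenates the results: this is the
;; "double-pumping". The fsub/fmul pair then performs one Newton-Raphson
;; step for 1/v: if the estimate is x = (1+e)/v, then x*(2 - v*x) equals
;; (1 - e^2)/v, so the roughly 12-bit rcpps estimate is refined to close to
;; full float precision.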
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <8 x float> %v, %is
|
||||
%v_is_is = fmul <8 x float> %v_is, %is
|
||||
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <8 x float> %is, %three_sub
|
||||
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <8 x float> %half_scale
|
||||
}
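;; The Newton-Raphson step here is the standard one for 1/sqrt(v): for an
;; estimate x = (1+e)/sqrt(v), the update 0.5*x*(3 - v*x*x) has relative
;; error about -(3/2)*e^2, roughly doubling the number of correct bits of
;; the rsqrtps estimate.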
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
include(`svml.m4')
|
||||
;; single precision
|
||||
svml_declare(float,f4,4)
|
||||
svml_define_x(float,f4,4,f,8)
|
||||
|
||||
;; double precision
|
||||
svml_declare(double,2,2)
|
||||
svml_define_x(double,2,2,d,8)
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
|
||||
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; min/max
|
||||
|
||||
; There is no blend instruction with SSE2, so we simulate it with bit
|
||||
; operations on i32s. For these two vselect functions, for each
|
||||
; vector element, if the mask is on, we return the corresponding value
|
||||
; from %1, and otherwise return the value from %0.
|
||||
|
||||
define <8 x i32> @__vselect_i32(<8 x i32>, <8 x i32> ,
|
||||
<8 x i32> %mask) nounwind readnone alwaysinline {
|
||||
%notmask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
|
||||
%cleared_old = and <8 x i32> %0, %notmask
|
||||
%masked_new = and <8 x i32> %1, %mask
|
||||
%new = or <8 x i32> %cleared_old, %masked_new
|
||||
ret <8 x i32> %new
|
||||
}
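;; i.e. result = (old & ~mask) | (new & mask), which maps directly onto the
;; SSE2 pandn/pand/por instructions.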
|
||||
|
||||
define <8 x float> @__vselect_float(<8 x float>, <8 x float>,
|
||||
<8 x i32> %mask) nounwind readnone alwaysinline {
|
||||
%v0 = bitcast <8 x float> %0 to <8 x i32>
|
||||
%v1 = bitcast <8 x float> %1 to <8 x i32>
|
||||
%r = call <8 x i32> @__vselect_i32(<8 x i32> %v0, <8 x i32> %v1, <8 x i32> %mask)
|
||||
%rf = bitcast <8 x i32> %r to <8 x float>
|
||||
ret <8 x float> %rf
|
||||
}
|
||||
|
||||
|
||||
; To do vector integer min and max, we do the vector compare and then sign
|
||||
; extend the i1 vector result to an i32 mask. The __vselect does the
|
||||
; rest...
|
||||
|
||||
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp slt <8 x i32> %0, %1
|
||||
%mask = sext <8 x i1> %c to <8 x i32>
|
||||
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp slt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp sgt <8 x i32> %0, %1
|
||||
%mask = sext <8 x i1> %c to <8 x i32>
|
||||
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp sgt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; The functions for unsigned ints are similar, just with unsigned
|
||||
; comparison functions...
|
||||
|
||||
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp ult <8 x i32> %0, %1
|
||||
%mask = sext <8 x i1> %c to <8 x i32>
|
||||
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp ult i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp ugt <8 x i32> %0, %1
|
||||
%mask = sext <8 x i1> %c to <8 x i32>
|
||||
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
|
||||
ret <8 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp ugt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; and shift the result for the second (high) half left by 4 before
; ORing it with the result for the first half
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
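;; With this layout, bit i of the result corresponds to lane i of the mask:
;; __any below tests for a nonzero result, __all compares against 255 (all
;; eight bits set), and __none compares against 0.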
|
||||
|
||||
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; and shift the result for the second (high) half left by 4 before
; ORing it with the result for the first half
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; and shift the result for the second (high) half left by 4 before
; ORing it with the result for the first half
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
%cmp = icmp eq i32 %v, 255
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
|
||||
; first do two 4-wide movmsk calls
|
||||
%floatmask = bitcast <8 x i32> %0 to <8 x float>
|
||||
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
|
||||
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
|
||||
|
||||
; and shift the result for the second (high) half left by 4 before
; ORing it with the result for the first half
|
||||
%v1s = shl i32 %v1, 4
|
||||
%v = or i32 %v0, %v1s
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
|
||||
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
|
||||
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
|
||||
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
|
||||
<16 x i8> zeroinitializer)
|
||||
%r0 = extractelement <2 x i64> %rv, i32 0
|
||||
%r1 = extractelement <2 x i64> %rv, i32 1
|
||||
%r = add i64 %r0, %r1
|
||||
%r16 = trunc i64 %r to i16
|
||||
ret i16 %r16
|
||||
}
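;; psadbw against an all-zero vector computes, for each 8-byte half, the sum
;; of absolute differences |b[i] - 0|, i.e. a horizontal sum of the bytes.
;; The shuffle above places the eight data bytes in the low half and zeros
;; (element 8 of the second operand) in the high half, so adding the two
;; 64-bit partial sums yields the total.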
|
||||
|
||||
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
|
||||
<8 x i16>) nounwind readnone alwaysinline {
|
||||
%r = add <8 x i16> %0, %1
|
||||
ret <8 x i16> %r
|
||||
}
|
||||
|
||||
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
|
||||
%r = add i16 %0, %1
|
||||
ret i16 %r
|
||||
}
|
||||
|
||||
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
|
||||
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
|
||||
}
|
||||
|
||||
define <4 x float> @__vec4_add_float(<4 x float> %v0,
|
||||
<4 x float> %v1) nounwind readnone alwaysinline {
|
||||
%v = fadd <4 x float> %v0, %v1
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
define float @__add_float(float, float) nounwind readnone alwaysinline {
|
||||
%v = fadd float %0, %1
|
||||
ret float %v
|
||||
}
|
||||
|
||||
define float @__reduce_add_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8by4(float, @__vec4_add_float, @__add_float)
|
||||
}
|
||||
|
||||
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
|
||||
<4 x i32> %v1) nounwind readnone alwaysinline {
|
||||
%v = add <4 x i32> %v0, %v1
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
; helper function for reduce_add_int32
|
||||
define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
%v = add i32 %0, %1
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8by4(i32, @__vec4_add_int32, @__add_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
|
||||
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
define <4 x double> @__add_varying_double(<4 x double>,
|
||||
<4 x double>) nounwind readnone alwaysinline {
|
||||
%r = fadd <4 x double> %0, %1
|
||||
ret <4 x double> %r
|
||||
}
|
||||
|
||||
define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
%r = fadd double %0, %1
|
||||
ret double %r
|
||||
}
|
||||
|
||||
define double @__reduce_add_double(<8 x double>) nounwind readnone {
|
||||
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
|
||||
}
|
||||
|
||||
define double @__reduce_min_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
define double @__reduce_max_double(<8 x double>) nounwind readnone {
|
||||
reduce8(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
define <4 x i64> @__add_varying_int64(<4 x i64>,
|
||||
<4 x i64>) nounwind readnone alwaysinline {
|
||||
%r = add <4 x i64> %0, %1
|
||||
ret <4 x i64> %r
|
||||
}
|
||||
|
||||
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
%r = add i64 %0, %1
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
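;
; The trick: for values with magnitude below 2^23, adding and then
; subtracting 0x1.0p23f (8388608.0) forces the FPU to round away the
; fractional bits, since floats at that magnitude have none. The sign bit
; is XORed out first and restored afterwards so the same constant works for
; negative inputs. For example, |x| = 1.3 gives 1.3 + 8388608.0 =
; 8388609.3, which rounds to 8388609.0; subtracting 8388608.0 leaves
; 1.0 = Round(1.3).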
|
||||
|
||||
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32>
|
||||
%bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
|
||||
%bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float>
|
||||
%binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
|
||||
%binop21.i = fadd <8 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
|
||||
%float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32>
|
||||
%bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float>
|
||||
ret <8 x float> %int_to_float_bitcast.i.i.i
|
||||
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
|
||||
%bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
|
||||
%bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
|
||||
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <8 x float> %binop.i
|
||||
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
|
||||
%bincmp.i = fcmp olt <8 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
|
||||
%bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
|
||||
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <8 x float> %binop.i
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
unary1to8(double, @round)
|
||||
}
|
||||
|
||||
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
unary1to8(double, @floor)
|
||||
}
|
||||
|
||||
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
unary1to8(double, @ceil)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load PTR_OP_ARGS(`<8 x i32> ') %0, align 4
|
||||
%newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
|
||||
store <8 x i32> %newval, <8 x i32> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load PTR_OP_ARGS(`<8 x i64>') %ptr, align 8
|
||||
|
||||
; Do the 8 x 64-bit blend by doing two <8 x i32> blends, where the
; <8 x i32> values are actually bitcast <4 x i64> values
|
||||
;
|
||||
; set up the first four 64-bit values
|
||||
%old0123 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%old0123f = bitcast <4 x i64> %old0123 to <8 x float>
|
||||
%new0123 = shufflevector <8 x i64> %new, <8 x i64> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%new0123f = bitcast <4 x i64> %new0123 to <8 x float>
|
||||
; compute mask--note that the indices are doubled-up
|
||||
%mask0123 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
; and blend the first 4 values
|
||||
%result0123f = call <8 x float> @__vselect_float(<8 x float> %old0123f, <8 x float> %new0123f,
|
||||
<8 x i32> %mask0123)
|
||||
%result0123 = bitcast <8 x float> %result0123f to <4 x i64>
|
||||
|
||||
; and again
|
||||
%old4567 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%old4567f = bitcast <4 x i64> %old4567 to <8 x float>
|
||||
%new4567 = shufflevector <8 x i64> %new, <8 x i64> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%new4567f = bitcast <4 x i64> %new4567 to <8 x float>
|
||||
; compute mask--note that the values are doubled-up
|
||||
%mask4567 = shufflevector <8 x i32> %mask, <8 x i32> undef,
|
||||
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||
; and blend the second four values
|
||||
%result4567f = call <8 x float> @__vselect_float(<8 x float> %old4567f, <8 x float> %new4567f,
|
||||
<8 x i32> %mask4567)
|
||||
%result4567 = bitcast <8 x float> %result4567f to <4 x i64>
|
||||
|
||||
; reconstruct the final <8 x i64> vector
|
||||
%final = shufflevector <4 x i64> %result0123, <4 x i64> %result4567,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
store <8 x i64> %final, <8 x i64> * %ptr, align 8
|
||||
ret void
|
||||
}
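;; Note the doubled-up shuffle indices when building %mask0123 and %mask4567:
;; each 64-bit lane is covered by two 32-bit mask elements, so every i32 mask
;; lane is replicated to cover both halves of its 64-bit value.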
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
|
||||
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision float min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
transcendetals_decl()
|
||||
trigonometry_decl()
|
||||
599
builtins/target-sse2.ll
Normal file
@@ -0,0 +1,599 @@
|
||||
;; Copyright (c) 2010-2015, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Define the standard library builtins for the SSE2 target
|
||||
|
||||
; Define some basics for a 4-wide target
|
||||
define(`WIDTH',`4')
|
||||
define(`MASK',`i32')
|
||||
include(`util.m4')
|
||||
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
saturation_arithmetic()
|
||||
|
||||
include(`target-sse2-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
|
||||
|
||||
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
|
||||
%bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
|
||||
%bitop.i = xor <4 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i40.i = bitcast <4 x i32> %bitop.i to <4 x float>
|
||||
%binop.i = fadd <4 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
|
||||
%binop21.i = fadd <4 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
|
||||
%float_to_int_bitcast.i.i.i = bitcast <4 x float> %binop21.i to <4 x i32>
|
||||
%bitop31.i = xor <4 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop31.i to <4 x float>
|
||||
ret <4 x float> %int_to_float_bitcast.i.i.i
|
||||
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
|
||||
%bincmp.i = fcmp ogt <4 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
|
||||
%bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
|
||||
%binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <4 x float> %binop.i
|
||||
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
|
||||
%bincmp.i = fcmp olt <4 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
|
||||
%bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
|
||||
%binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <4 x float> %binop.i
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @round)
|
||||
}
|
||||
|
||||
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @floor)
|
||||
}
|
||||
|
||||
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
unary1to4(double, @ceil)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; min/max
|
||||
|
||||
; There is no blend instruction with SSE2, so we simulate it with bit
|
||||
; operations on i32s. For these two vselect functions, for each
|
||||
; vector element, if the mask is on, we return the corresponding value
|
||||
; from %1, and otherwise return the value from %0.
|
||||
|
||||
define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
|
||||
<4 x i32> %mask) nounwind readnone alwaysinline {
|
||||
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
|
||||
%cleared_old = and <4 x i32> %0, %notmask
|
||||
%masked_new = and <4 x i32> %1, %mask
|
||||
%new = or <4 x i32> %cleared_old, %masked_new
|
||||
ret <4 x i32> %new
|
||||
}
|
||||
|
||||
define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
|
||||
<4 x i32> %mask) nounwind readnone alwaysinline {
|
||||
%v0 = bitcast <4 x float> %0 to <4 x i32>
|
||||
%v1 = bitcast <4 x float> %1 to <4 x i32>
|
||||
%r = call <4 x i32> @__vselect_i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %mask)
|
||||
%rf = bitcast <4 x i32> %r to <4 x float>
|
||||
ret <4 x float> %rf
|
||||
}
|
||||
|
||||
|
||||
; To do vector integer min and max, we do the vector compare and then sign
|
||||
; extend the i1 vector result to an i32 mask. The __vselect does the
|
||||
; rest...
|
||||
|
||||
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp slt <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp slt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp sgt <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp sgt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
; The functions for unsigned ints are similar, just with unsigned
|
||||
; comparison functions...
|
||||
|
||||
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp ult <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp ult i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
|
||||
%c = icmp ugt <4 x i32> %0, %1
|
||||
%mask = sext <4 x i1> %c to <4 x i32>
|
||||
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
|
||||
ret <4 x i32> %v
|
||||
}
|
||||
|
||||
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
%c = icmp ugt i32 %0, %1
|
||||
%r = select i1 %c, i32 %0, i32 %1
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops / reductions
|
||||
|
||||
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%v64 = zext i32 %v to i64
|
||||
ret i64 %v64
|
||||
}
|
||||
|
||||
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp ne i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 15
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <4 x i32> %0 to <4 x float>
|
||||
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
|
||||
%cmp = icmp eq i32 %v, 0
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
|
||||
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
|
||||
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
|
||||
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
|
||||
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
|
||||
<16 x i8> zeroinitializer)
|
||||
%r0 = extractelement <2 x i64> %rv, i32 0
|
||||
%r1 = extractelement <2 x i64> %rv, i32 1
|
||||
%r = add i64 %r0, %r1
|
||||
%r16 = trunc i64 %r to i16
|
||||
ret i16 %r16
|
||||
}
|
||||
|
||||
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
|
||||
<4 x i16>) nounwind readnone alwaysinline {
|
||||
%r = add <4 x i16> %0, %1
|
||||
ret <4 x i16> %r
|
||||
}
|
||||
|
||||
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
|
||||
%r = add i16 %0, %1
|
||||
ret i16 %r
|
||||
}
|
||||
|
||||
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
|
||||
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
|
||||
}
|
||||
|
||||
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
|
||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||
%m1 = fadd <4 x float> %v1, %v
|
||||
%m1a = extractelement <4 x float> %m1, i32 0
|
||||
%m1b = extractelement <4 x float> %m1, i32 1
|
||||
%sum = fadd float %m1a, %m1b
|
||||
ret float %sum
|
||||
}
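;; This is the usual log2(n) shuffle reduction: fold the upper two lanes
;; onto the lower two with one vector add, then sum the remaining pair of
;; scalars.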
|
||||
|
||||
define float @__reduce_min_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
define float @__reduce_max_float(<4 x float>) nounwind readnone {
|
||||
reduce4(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
|
||||
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
|
||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||
%m1 = add <4 x i32> %v1, %v
|
||||
%m1a = extractelement <4 x i32> %m1, i32 0
|
||||
%m1b = extractelement <4 x i32> %m1, i32 1
|
||||
%sum = add i32 %m1a, %m1b
|
||||
ret i32 %sum
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
|
||||
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
define double @__reduce_add_double(<4 x double>) nounwind readnone {
|
||||
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%sum = fadd <2 x double> %v0, %v1
|
||||
%e0 = extractelement <2 x double> %sum, i32 0
|
||||
%e1 = extractelement <2 x double> %sum, i32 1
|
||||
%m = fadd double %e0, %e1
|
||||
ret double %m
|
||||
}
|
||||
|
||||
define double @__reduce_min_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
define double @__reduce_max_double(<4 x double>) nounwind readnone {
|
||||
reduce4(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
|
||||
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%sum = add <2 x i64> %v0, %v1
|
||||
%e0 = extractelement <2 x i64> %sum, i32 0
|
||||
%e1 = extractelement <2 x i64> %sum, i32 1
|
||||
%m = add i64 %e0, %e1
|
||||
ret i64 %m
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(4)
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load PTR_OP_ARGS(`<4 x i32> ') %0, align 4
|
||||
%newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
|
||||
store <4 x i32> %newval, <4 x i32> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
|
||||
<4 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load PTR_OP_ARGS(`<4 x i64>') %ptr, align 8
|
||||
|
||||
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
|
||||
; are actually bitcast <2 x i64> values
|
||||
;
|
||||
; set up the first two 64-bit values
|
||||
%old01 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%old01f = bitcast <2 x i64> %old01 to <4 x float>
|
||||
%new01 = shufflevector <4 x i64> %new, <4 x i64> undef,
|
||||
<2 x i32> <i32 0, i32 1>
|
||||
%new01f = bitcast <2 x i64> %new01 to <4 x float>
|
||||
; compute mask--note that the indices 0 and 1 are doubled-up
|
||||
%mask01 = shufflevector <4 x i32> %mask, <4 x i32> undef,
|
||||
<4 x i32> <i32 0, i32 0, i32 1, i32 1>
|
||||
; and blend the first two values
|
||||
%result01f = call <4 x float> @__vselect_float(<4 x float> %old01f, <4 x float> %new01f, <4 x i32> %mask01)
|
||||
%result01 = bitcast <4 x float> %result01f to <2 x i64>
|
||||
|
||||
; and again
|
||||
%old23 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%old23f = bitcast <2 x i64> %old23 to <4 x float>
|
||||
%new23 = shufflevector <4 x i64> %new, <4 x i64> undef,
|
||||
<2 x i32> <i32 2, i32 3>
|
||||
%new23f = bitcast <2 x i64> %new23 to <4 x float>
|
||||
; compute mask--note that the values 2 and 3 are doubled-up
|
||||
%mask23 = shufflevector <4 x i32> %mask, <4 x i32> undef,
|
||||
<4 x i32> <i32 2, i32 2, i32 3, i32 3>
|
||||
; and blend the remaining two values
|
||||
%result23f = call <4 x float> @__vselect_float(<4 x float> %old23f, <4 x float> %new23f, <4 x i32> %mask23)
|
||||
%result23 = bitcast <4 x float> %result23f to <2 x i64>
|
||||
|
||||
; reconstruct the final <4 x i64> vector
|
||||
%final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
store <4 x i64> %final, <4 x i64> * %ptr, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
masked_store_float_double()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
|
||||
; do one N-R iteration to improve precision
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
%v_iv = fmul <4 x float> %0, %call
|
||||
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <4 x float> %call, %two_minus
|
||||
ret <4 x float> %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
|
||||
; Newton-Raphson iteration to improve precision
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <4 x float> %v, %is
|
||||
%v_is_is = fmul <4 x float> %v_is, %is
|
||||
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <4 x float> %is, %three_sub
|
||||
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <4 x float> %half_scale
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; svml stuff
|
||||
|
||||
include(`svml.m4')
|
||||
;; single precision
|
||||
svml_declare(float,f4,4)
|
||||
svml_define(float,f4,4,f)
|
||||
|
||||
;; double precision
|
||||
svml_declare(double,2,2)
|
||||
svml_define_x(double,2,2,d,4)
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
|
||||
|
||||
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
|
||||
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
|
||||
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <4 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
masked_store_blend_8_16_by_4()
|
||||
|
||||
gen_masked_store(i8)
|
||||
gen_masked_store(i16)
|
||||
gen_masked_store(i32)
|
||||
gen_masked_store(i64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
|
||||
masked_load(i8, 1)
|
||||
masked_load(i16, 2)
|
||||
masked_load(i32, 4)
|
||||
masked_load(float, 4)
|
||||
masked_load(i64, 8)
|
||||
masked_load(double, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from util.m4
|
||||
|
||||
gen_gather_factored(i8)
|
||||
gen_gather_factored(i16)
|
||||
gen_gather_factored(i32)
|
||||
gen_gather_factored(float)
|
||||
gen_gather_factored(i64)
|
||||
gen_gather_factored(double)
|
||||
|
||||
gen_scatter(i8)
|
||||
gen_scatter(i16)
|
||||
gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
transcendetals_decl()
|
||||
trigonometry_decl()
|
||||
500
builtins/target-sse4-16.ll
Normal file
@@ -0,0 +1,500 @@
|
||||
;; Copyright (c) 2013, 2015, Google, Inc.
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Google, Inc. nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; Define common 8-wide stuff (8-wide vectors with an i16 mask)
|
||||
define(`WIDTH',`8')
|
||||
define(`MASK',`i16')
|
||||
include(`util.m4')
|
||||
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
saturation_arithmetic()
|
||||
|
||||
include(`target-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; half conversion routines
|
||||
|
||||
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readonly alwaysinline {
|
||||
unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
|
||||
; do one N-R iteration to improve precision
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
%v_iv = fmul <8 x float> %0, %call
|
||||
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <8 x float> %call, %two_minus
|
||||
ret <8 x float> %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
|
||||
|
||||
define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
|
||||
; Newton-Raphson iteration to improve precision
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <8 x float> %v, %is
|
||||
%v_is_is = fmul <8 x float> %v_is, %is
|
||||
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <8 x float> %is, %three_sub
|
||||
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <8 x float> %half_scale
|
||||
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt

declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone

define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
  unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
  ret <8 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone

define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
  unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
  ret <8 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  round4to8(%0, 8)
}

define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round4to8(%0, 9)
}

define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round4to8(%0, 10)
}
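
A short C sketch of how the immediates 8, 9, and 10 used above are built (macro names are mine): bits 1:0 of the roundps control byte select the rounding mode, and bit 3 suppresses the precision (inexact) exception.

```c
#include <stdio.h>

#define ROUND_NEAREST 0x0 /* bits 1:0 = 00 */
#define ROUND_DOWN    0x1 /* bits 1:0 = 01 (floor) */
#define ROUND_UP      0x2 /* bits 1:0 = 10 (ceil)  */
#define NO_PREC_EXC   0x8 /* bit 3: don't signal the precision exception */

int main(void) {
    printf("%d %d %d\n",
           ROUND_NEAREST | NO_PREC_EXC,   /* 8  -> __round_varying_float */
           ROUND_DOWN    | NO_PREC_EXC,   /* 9  -> __floor_varying_float */
           ROUND_UP      | NO_PREC_EXC);  /* 10 -> __ceil_varying_float  */
    return 0;
}
```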

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
  round2to8double(%0, 8)
}

define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
  round2to8double(%0, 9)
}

define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
  round2to8double(%0, 10)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
  binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
  ret <8 x float> %call
}

define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
  binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
  ret <8 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max

define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <8 x i32> %call
}

define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <8 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max

define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <8 x i32> %call
}

define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <8 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone {
  binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
  ret <8 x double> %ret
}

define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone {
  binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret <8 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml

; FIXME
include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone

define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline {
  %m8 = trunc <8 x MASK> %0 to <8 x i8>
  %mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8)
  %m64 = zext i32 %m to i64
  ret i64 %m64
}

define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline {
  %m = call i64 @__movmsk(<8 x MASK> %0)
  %mne = icmp ne i64 %m, 0
  ret i1 %mne
}

define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline {
  %m = call i64 @__movmsk(<8 x MASK> %0)
  %meq = icmp eq i64 %m, ALL_ON_MASK
  ret i1 %meq
}

define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline {
  %m = call i64 @__movmsk(<8 x MASK> %0)
  %meq = icmp eq i64 %m, 0
  ret i1 %meq
}
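
What __movmsk computes, as an illustrative C sketch (helper name is mine): each mask lane is all-ones or all-zero; after truncating the i16 lanes to bytes and zero-padding to 16 bytes, pmovmskb packs the sign bit of each byte, so bit i of the result is lane i of the mask. __any is then m != 0, __all is m == ALL_ON_MASK (0xff at this width), and __none is m == 0.

```c
#include <stdint.h>
#include <stdio.h>

static uint64_t movmsk8(const int16_t mask[8]) {
    uint64_t m = 0;
    for (int i = 0; i < 8; i++)
        m |= (uint64_t)((mask[i] >> 15) & 1) << i; /* sign bit of lane i */
    return m;
}

int main(void) {
    int16_t mask[8] = {-1, 0, -1, -1, 0, 0, 0, -1};      /* lanes 0,2,3,7 on */
    uint64_t m = movmsk8(mask);
    printf("mask=0x%llx any=%d all=%d none=%d\n",
           (unsigned long long)m, m != 0, m == 0xff, m == 0); /* 0x8d 1 0 0 */
    return 0;
}
```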

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
  %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
                                              <16 x i8> zeroinitializer)
  %r0 = extractelement <2 x i64> %rv, i32 0
  %r1 = extractelement <2 x i64> %rv, i32 1
  %r = add i64 %r0, %r1
  %r16 = trunc i64 %r to i16
  ret i16 %r16
}
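
An illustrative C sketch of the psadbw trick above (names are mine): with a zero second operand, the sum of absolute differences is just the plain sum of each 8-byte group, one sum per 64-bit lane, and adding the two lanes gives the 16-byte total. The IR zero-pads the upper half, so that lane contributes nothing.

```c
#include <stdint.h>
#include <stdio.h>

static uint16_t reduce_add_u8(const uint8_t v[16]) {
    uint64_t lo = 0, hi = 0;
    for (int i = 0; i < 8; i++)  lo += v[i]; /* |v[i] - 0| for the low group  */
    for (int i = 8; i < 16; i++) hi += v[i]; /* |v[i] - 0| for the high group */
    return (uint16_t)(lo + hi);
}

int main(void) {
    /* upper half zeroed, as the shufflevector above arranges */
    uint8_t v[16] = {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0};
    printf("%u\n", reduce_add_u8(v)); /* 36 */
    return 0;
}
```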

define internal <8 x i16> @__add_varying_i16(<8 x i16>,
                                             <8 x i16>) nounwind readnone alwaysinline {
  %r = add <8 x i16> %0, %1
  ret <8 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
  %r = add i16 %0, %1
  ret i16 %r
}

define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
  reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}

define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) {
  %r = fadd <8 x float> %0, %1
  ret <8 x float> %r
}

define internal float @__add_uniform_float(float, float) {
  %r = fadd float %0, %1
  ret float %r
}

define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
  reduce8(float, @__add_varying_float, @__add_uniform_float)
}

define float @__reduce_min_float(<8 x float>) nounwind readnone {
  reduce8(float, @__min_varying_float, @__min_uniform_float)
}

define float @__reduce_max_float(<8 x float>) nounwind readnone {
  reduce8(float, @__max_varying_float, @__max_uniform_float)
}

define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) {
  %r = add <8 x i32> %0, %1
  ret <8 x i32> %r
}

define internal i32 @__add_uniform_int32(i32, i32) {
  %r = add i32 %0, %1
  ret i32 %r
}

define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone {
  reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
}

define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone {
  reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
}

define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone {
  reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}

define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone {
  reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone {
  reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}

define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) {
  %r = fadd <8 x double> %0, %1
  ret <8 x double> %r
}

define internal double @__add_uniform_double(double, double) {
  %r = fadd double %0, %1
  ret double %r
}

define double @__reduce_add_double(<8 x double>) nounwind readnone {
  reduce8(double, @__add_varying_double, @__add_uniform_double)
}

define double @__reduce_min_double(<8 x double>) nounwind readnone {
  reduce8(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<8 x double>) nounwind readnone {
  reduce8(double, @__max_varying_double, @__max_uniform_double)
}

define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) {
  %r = add <8 x i64> %0, %1
  ret <8 x i64> %r
}

define internal i64 @__add_uniform_int64(i64, i64) {
  %r = add i64 %0, %1
  ret i64 %r
}

define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
}

define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

reduce_equal(8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>,
                                      <8 x MASK> %mask) nounwind alwaysinline {
  %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
  %old = load PTR_OP_ARGS(`<8 x i64>') %0, align 4
  %blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old
  store <8 x i64> %blend, <8 x i64>* %0, align 4
  ret void
}

define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
                                      <8 x MASK> %mask) nounwind alwaysinline {
  %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
  %old = load PTR_OP_ARGS(`<8 x i32>') %0, align 4
  %blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old
  store <8 x i32> %blend, <8 x i32>* %0, align 4
  ret void
}

define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>,
                                      <8 x MASK> %mask) nounwind alwaysinline {
  %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
  %old = load PTR_OP_ARGS(`<8 x i16>') %0, align 4
  %blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old
  store <8 x i16> %blend, <8 x i16>* %0, align 4
  ret void
}

define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
                                     <8 x MASK> %mask) nounwind alwaysinline {
  %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1>
  %old = load PTR_OP_ARGS(`<8 x i8>') %0, align 4
  %blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old
  store <8 x i8> %blend, <8 x i8>* %0, align 4
  ret void
}
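
A C sketch of the load/select/store pattern used by all four blend stores above (function name is mine): read the old destination, keep the old value where the mask is off, and write the full vector back. Note that it writes every lane, so it is only safe when the whole destination vector is writable.

```c
#include <stdint.h>
#include <stdio.h>

static void masked_store_blend_i32(int32_t *ptr, const int32_t *val,
                                   const int16_t *mask, int width) {
    for (int i = 0; i < width; i++)
        ptr[i] = mask[i] ? val[i] : ptr[i]; /* select, then full-width store */
}

int main(void) {
    int32_t dst[8]  = {0, 0, 0, 0, 0, 0, 0, 0};
    int32_t src[8]  = {1, 2, 3, 4, 5, 6, 7, 8};
    int16_t mask[8] = {-1, 0, -1, 0, -1, 0, -1, 0};
    masked_store_blend_i32(dst, src, mask, 8);
    for (int i = 0; i < 8; i++) printf("%d ", dst[i]); /* 1 0 3 0 5 0 7 0 */
    printf("\n");
    return 0;
}
```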

gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)

masked_store_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

; define these with the macros from stdlib.m4

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) {
  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 undef, i32 undef, i32 undef, i32 undef,
                  i32 undef, i32 undef, i32 undef, i32 undef>
  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 undef, i32 undef, i32 undef, i32 undef,
                  i32 undef, i32 undef, i32 undef, i32 undef>
  %r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1)
  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %r
}

declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) {
  %r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1)
  ret <8 x i16> %r
}
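
The semantics of pavgb/pavgw, shown as a minimal C sketch (helper name is mine): a rounding-up average, (a + b + 1) >> 1, computed in a widened intermediate so the addition cannot wrap.

```c
#include <stdint.h>
#include <stdio.h>

static uint8_t avg_up_u8(uint8_t a, uint8_t b) {
    return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) >> 1); /* 9-bit intermediate */
}

int main(void) {
    printf("%u\n", avg_up_u8(255, 254)); /* 255: no overflow */
    printf("%u\n", avg_up_u8(1, 2));     /* 2: rounds up */
    return 0;
}
```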

define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()
498  builtins/target-sse4-8.ll  Normal file
@@ -0,0 +1,498 @@
;; Copyright (c) 2013, 2015, Google, Inc.
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;    * Redistributions of source code must retain the above copyright
;;      notice, this list of conditions and the following disclaimer.
;;
;;    * Redistributions in binary form must reproduce the above copyright
;;      notice, this list of conditions and the following disclaimer in the
;;      documentation and/or other materials provided with the distribution.
;;
;;    * Neither the name of Google, Inc. nor the names of its
;;      contributors may be used to endorse or promote products derived from
;;      this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Define common 16-wide stuff
define(`WIDTH',`16')
define(`MASK',`i8')
include(`util.m4')

stdlib_core()
packed_load_and_store()
scans()
int64minmax()
saturation_arithmetic()

include(`target-sse4-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readonly alwaysinline {
  unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0)
  ; do one N-R iteration to improve precision
  ; float iv = __rcp_v(v);
  ; return iv * (2. - v * iv);
  %v_iv = fmul <16 x float> %0, %call
  %two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
                                  float 2., float 2., float 2., float 2.,
                                  float 2., float 2., float 2., float 2.,
                                  float 2., float 2., float 2., float 2.>, %v_iv
  %iv_mul = fmul <16 x float> %call, %two_minus
  ret <16 x float> %iv_mul
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt

declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
  ; float is = __rsqrt_v(v);
  unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v)
  ; Newton-Raphson iteration to improve precision
  ; return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul <16 x float> %v, %is
  %v_is_is = fmul <16 x float> %v_is, %is
  %three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
                                  float 3., float 3., float 3., float 3.,
                                  float 3., float 3., float 3., float 3.,
                                  float 3., float 3., float 3., float 3.>, %v_is_is
  %is_mul = fmul <16 x float> %is, %three_sub
  %half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
                                   float 0.5, float 0.5, float 0.5, float 0.5,
                                   float 0.5, float 0.5, float 0.5, float 0.5,
                                   float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
  ret <16 x float> %half_scale
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt

declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone

define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
  unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0)
  ret <16 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone

define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
  unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
  ret <16 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  round4to16(%0, 8)
}

define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round4to16(%0, 9)
}

define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round4to16(%0, 10)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
  round2to16double(%0, 8)
}

define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round2to16double(%0, 9)
}

define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round2to16double(%0, 10)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
  binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1)
  ret <16 x float> %call
}

define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
  binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1)
  ret <16 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max

define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <16 x i32> %call
}

define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <16 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max

define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <16 x i32> %call
}

define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <16 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone {
  binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
  ret <16 x double> %ret
}

define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone {
  binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret <16 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml

; FIXME

include(`svml.m4')
svml_stubs(float,f,WIDTH)
svml_stubs(double,d,WIDTH)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone

define i64 @__movmsk(<16 x i8>) nounwind readnone alwaysinline {
  %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
  %m64 = zext i32 %m to i64
  ret i64 %m64
}

define i1 @__any(<16 x i8>) nounwind readnone alwaysinline {
  %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
  %mne = icmp ne i32 %m, 0
  ret i1 %mne
}

define i1 @__all(<16 x i8>) nounwind readnone alwaysinline {
  %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
  %meq = icmp eq i32 %m, ALL_ON_MASK
  ret i1 %meq
}

define i1 @__none(<16 x i8>) nounwind readnone alwaysinline {
  %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0)
  %meq = icmp eq i32 %m, 0
  ret i1 %meq
}

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
                                              <16 x i8> zeroinitializer)
  %r0 = extractelement <2 x i64> %rv, i32 0
  %r1 = extractelement <2 x i64> %rv, i32 1
  %r = add i64 %r0, %r1
  %r16 = trunc i64 %r to i16
  ret i16 %r16
}

define internal <16 x i16> @__add_varying_i16(<16 x i16>,
                                              <16 x i16>) nounwind readnone alwaysinline {
  %r = add <16 x i16> %0, %1
  ret <16 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
  %r = add i16 %0, %1
  ret i16 %r
}

define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
  reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
}

define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) {
  %r = fadd <16 x float> %0, %1
  ret <16 x float> %r
}

define internal float @__add_uniform_float(float, float) {
  %r = fadd float %0, %1
  ret float %r
}

define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
  reduce16(float, @__add_varying_float, @__add_uniform_float)
}

define float @__reduce_min_float(<16 x float>) nounwind readnone {
  reduce16(float, @__min_varying_float, @__min_uniform_float)
}

define float @__reduce_max_float(<16 x float>) nounwind readnone {
  reduce16(float, @__max_varying_float, @__max_uniform_float)
}

define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) {
  %r = add <16 x i32> %0, %1
  ret <16 x i32> %r
}

define internal i32 @__add_uniform_int32(i32, i32) {
  %r = add i32 %0, %1
  ret i32 %r
}

define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone {
  reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
}

define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone {
  reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
}

define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone {
  reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
}

define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone {
  reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone {
  reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
}

define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) {
  %r = fadd <16 x double> %0, %1
  ret <16 x double> %r
}

define internal double @__add_uniform_double(double, double) {
  %r = fadd double %0, %1
  ret double %r
}

define double @__reduce_add_double(<16 x double>) nounwind readnone {
  reduce16(double, @__add_varying_double, @__add_uniform_double)
}

define double @__reduce_min_double(<16 x double>) nounwind readnone {
  reduce16(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<16 x double>) nounwind readnone {
  reduce16(double, @__max_varying_double, @__max_uniform_double)
}

define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) {
  %r = add <16 x i64> %0, %1
  ret <16 x i64> %r
}

define internal i64 @__add_uniform_int64(i64, i64) {
  %r = add i64 %0, %1
  ret i64 %r
}

define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone {
  reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
}

define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone {
  reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone {
  reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone {
  reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone {
  reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

reduce_equal(16)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>,
                                      <16 x i8> %mask) nounwind alwaysinline {
  %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
  %old = load PTR_OP_ARGS(`<16 x i64>') %0, align 4
  %blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old
  store <16 x i64> %blend, <16 x i64>* %0, align 4
  ret void
}

define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
                                      <16 x MASK> %mask) nounwind alwaysinline {
  %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
  %old = load PTR_OP_ARGS(`<16 x i32>') %0, align 4
  %blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old
  store <16 x i32> %blend, <16 x i32>* %0, align 4
  ret void
}

define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
                                      <16 x MASK> %mask) nounwind alwaysinline {
  %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
  %old = load PTR_OP_ARGS(`<16 x i16>') %0, align 4
  %blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old
  store <16 x i16> %blend, <16 x i16>* %0, align 4
  ret void
}

declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone

define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>,
                                     <16 x MASK> %mask) nounwind alwaysinline {
  %old = load PTR_OP_ARGS(`<16 x i8>') %0, align 4
  %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1,
                                                   <16 x i8> %mask)
  store <16 x i8> %blend, <16 x i8>* %0, align 4
  ret void
}

gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)

masked_store_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

; define these with the macros from stdlib.m4

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone

define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
  %r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1)
  ret <16 x i8> %r
}

declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
  v16tov8(i16, %0, %a0, %b0)
  v16tov8(i16, %1, %a1, %b1)
  %r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
  %r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1)
  v8tov16(i16, %r0, %r1, %r)
  ret <16 x i16> %r
}

define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()
@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2011, Intel Corporation
+;; Copyright (c) 2010-2015, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without
@@ -30,37 +30,20 @@
 ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; *** Untested *** AVX target implementation.
-;;
-;; The LLVM AVX code generator is incomplete, so the ispc AVX target
-;; hasn't yet been tested. There is therefore a higher-than-normal
-;; chance that there are bugs in the code in this file.
+;; SSE4 target implementation.

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rcp
-
-declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
-
-define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
-  ; uniform float iv = extract(__rcp_u(v), 0);
-  ; return iv * (2. - v * iv);
-  %vecval = insertelement <4 x float> undef, float %0, i32 0
-  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
-  %scall = extractelement <4 x float> %call, i32 0
-
-  ; do one N-R iteration
-  %v_iv = fmul float %0, %scall
-  %two_minus = fsub float 2., %v_iv
-  %iv_mul = fmul float %scall, %two_minus
-  ret float %iv_mul
-}
+ctlztz()
+define_prefetches()
+define_shuffles()
+aossoa()
+rdrand_decls()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats

 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

-define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
+define float @__round_uniform_float(float) nounwind readonly alwaysinline {
   ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
   ; the roundss intrinsic is a total mess--docs say:
   ;
@@ -76,14 +59,15 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
   ; r3 = a3
   ;
   ; It doesn't matter what we pass as a, since we only need the r0 value
-  ; here. So we pass the same register for both.
+  ; here. So we pass the same register for both. Further, only the 0th
+  ; element of the b parameter matters
   %xi = insertelement <4 x float> undef, float %0, i32 0
   %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
   %rs = extractelement <4 x float> %xr, i32 0
   ret float %rs
 }

-define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
   ; see above for round_ss intrinsic discussion...
   %xi = insertelement <4 x float> undef, float %0, i32 0
   ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
@@ -92,7 +76,7 @@ define internal float @__floor_uniform_float(float) nounwind readonly alwaysinli
   ret float %rs
 }

-define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
   ; see above for round_ss intrinsic discussion...
   %xi = insertelement <4 x float> undef, float %0, i32 0
   ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
@@ -106,43 +90,63 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin

 declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

-define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+define double @__round_uniform_double(double) nounwind readonly alwaysinline {
   %xi = insertelement <2 x double> undef, double %0, i32 0
   %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
   %rs = extractelement <2 x double> %xr, i32 0
   ret double %rs
 }

-define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
   ; see above for round_ss intrinsic discussion...
   %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
   %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
   %rs = extractelement <2 x double> %xr, i32 0
   ret double %rs
 }

-define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
   ; see above for round_ss intrinsic discussion...
   %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
   %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
   %rs = extractelement <2 x double> %xr, i32 0
   ret double %rs
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+  ; do the rcpss call
+  ; uniform float iv = extract(__rcp_u(v), 0);
+  ; return iv * (2. - v * iv);
+  %vecval = insertelement <4 x float> undef, float %0, i32 0
+  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
+  %scall = extractelement <4 x float> %call, i32 0
+
+  ; do one N-R iteration to improve precision, as above
+  %v_iv = fmul float %0, %scall
+  %two_minus = fsub float 2., %v_iv
+  %iv_mul = fmul float %scall, %two_minus
+  ret float %iv_mul
+}
+

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rsqrt

 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

-define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
   ; uniform float is = extract(__rsqrt_u(v), 0);
   %v = insertelement <4 x float> undef, float %0, i32 0
   %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
   %is = extractelement <4 x float> %vis, i32 0

   ; Newton-Raphson iteration to improve precision
   ; return 0.5 * is * (3. - (v * is) * is);
   %v_is = fmul float %0, %is
   %v_is_is = fmul float %v_is, %is
@@ -158,23 +162,32 @@ define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinli

 declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

-define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
   sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
   ret float %ret
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret
+}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; fastmath
+;; fast math mode

 declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
 declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

-define internal void @__fastmath() nounwind alwaysinline {
+define void @__fastmath() nounwind alwaysinline {
   %ptr = alloca i32
   %ptr8 = bitcast i32 * %ptr to i8 *
   call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
-  %oldval = load i32 *%ptr
+  %oldval = load PTR_OP_ARGS(`i32 ') %ptr

   ; turn on DAZ (64)/FTZ (32768) -> 32832
   %update = or i32 %oldval, 32832
@@ -189,16 +202,32 @@ define internal void @__fastmath() nounwind alwaysinline {
 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

-define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
   sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
   ret float %ret
 }

-define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
   sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
   ret float %ret
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret
+}
+
+define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret
+}
+

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
@@ -206,12 +235,12 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone

-define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
   sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
   ret i32 %ret
 }

-define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
   sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
   ret i32 %ret
 }
@@ -223,56 +252,31 @@ define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinlin
 declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone

-define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
   sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
   ret i32 %ret
 }

-define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
   sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
   ret i32 %ret
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; horizontal ops
+;; horizontal ops / reductions

 declare i32 @llvm.ctpop.i32(i32) nounwind readnone

-define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
   %call = call i32 @llvm.ctpop.i32(i32 %0)
   ret i32 %call
 }

 declare i64 @llvm.ctpop.i64(i64) nounwind readnone

-define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
   %call = call i64 @llvm.ctpop.i64(i64 %0)
   ret i64 %call
 }

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
-
-define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
-  sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
-  ret double %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
-  ret double %ret
-}
-
-define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
-  ret double %ret
-}
+declare_nvptx()
603  builtins/target-sse4-x2.ll  Normal file
@@ -0,0 +1,603 @@
;; Copyright (c) 2010-2015, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;    * Redistributions of source code must retain the above copyright
;;      notice, this list of conditions and the following disclaimer.
;;
;;    * Redistributions in binary form must reproduce the above copyright
;;      notice, this list of conditions and the following disclaimer in the
;;      documentation and/or other materials provided with the distribution.
;;
;;    * Neither the name of Intel Corporation nor the names of its
;;      contributors may be used to endorse or promote products derived from
;;      this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


;; This file defines the target for "double-pumped" SSE4, i.e. running
;; with 8-wide vectors

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; standard 8-wide definitions from m4 macros

define(`WIDTH',`8')
define(`MASK',`i32')
include(`util.m4')

stdlib_core()
packed_load_and_store()
scans()
int64minmax()
saturation_arithmetic()

include(`target-sse4-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; float iv = __rcp_v(v);
  ; return iv * (2. - v * iv);

  unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
  ; do one N-R iteration
  %v_iv = fmul <8 x float> %0, %call
  %two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
                                 float 2., float 2., float 2., float 2.>, %v_iv
  %iv_mul = fmul <8 x float> %call, %two_minus
  ret <8 x float> %iv_mul
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt

declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
  ; float is = __rsqrt_v(v);
  unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
  ; return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul <8 x float> %v, %is
  %v_is_is = fmul <8 x float> %v_is, %is
  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
                                 float 3., float 3., float 3., float 3.>, %v_is_is
  %is_mul = fmul <8 x float> %is, %three_sub
  %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
                                  float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
  ret <8 x float> %half_scale
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt

declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone

define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
  unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
  ret <8 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff

include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define_x(float,f4,4,f,8)

;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,8)


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
  binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
  ret <8 x float> %call
}

define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
  binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
  ret <8 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max

define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <8 x i32> %call
}

define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <8 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max

define <8 x i32> @__min_varying_uint32(<8 x i32>,
                                       <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <8 x i32> %call
}

define <8 x i32> @__max_varying_uint32(<8 x i32>,
                                       <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <8 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  ; first do two 4-wide movmsk calls
  %floatmask = bitcast <8 x i32> %0 to <8 x float>
  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone

  ; shift the result for the high half over by 4 before ORing it
  ; with the result for the low half
  %v1s = shl i32 %v1, 4
  %v = or i32 %v0, %v1s
  %v64 = zext i32 %v to i64
  ret i64 %v64
}
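
; For example, if mask lanes 0, 2 and 5 are on, the first movmsk returns
; 0b0101 (lanes 0-3) and the second 0b0010 (lanes 4-7); shifting the second
; left by 4 and ORing gives 0b00100101 = 37, the full 8-bit lane mask.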

define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
  ; first do two 4-wide movmsk calls
  %floatmask = bitcast <8 x i32> %0 to <8 x float>
  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
                      <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone

  ; then shift the result for the second (high) half left by 4 before
  ; ORing it with the result for the first one
  %v1s = shl i32 %v1, 4
  %v = or i32 %v0, %v1s
  %cmp = icmp ne i32 %v, 0
  ret i1 %cmp
}

define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
  ; first do two 4-wide movmsk calls
  %floatmask = bitcast <8 x i32> %0 to <8 x float>
  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
                      <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone

  ; then shift the result for the second (high) half left by 4 before
  ; ORing it with the result for the first one
  %v1s = shl i32 %v1, 4
  %v = or i32 %v0, %v1s
  %cmp = icmp eq i32 %v, 255
  ret i1 %cmp
}

define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
  ; first do two 4-wide movmsk calls
  %floatmask = bitcast <8 x i32> %0 to <8 x float>
  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
                      <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone

  ; then shift the result for the second (high) half left by 4 before
  ; ORing it with the result for the first one
  %v1s = shl i32 %v1, 4
  %v = or i32 %v0, %v1s
  %cmp = icmp eq i32 %v, 0
  ret i1 %cmp
}
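
; The 255 used in __all is the fully-set 8-bit lane mask (0b11111111), i.e.
; all eight sign bits on; __any tests for a nonzero combined mask and __none
; for a zero one.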

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
  %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
                                              <16 x i8> zeroinitializer)
  %r0 = extractelement <2 x i64> %rv, i32 0
  %r1 = extractelement <2 x i64> %rv, i32 1
  %r = add i64 %r0, %r1
  %r16 = trunc i64 %r to i16
  ret i16 %r16
}
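
; psadbw computes sums of absolute differences against zero, i.e. it adds up
; the byte values: the low 8 bytes go into the low i64 half and the high 8
; bytes into the high one. The shuffle above routes the 8 live lanes into the
; low half and pads the rest with element 8 of the concatenated operands,
; which is the first lane of the zeroinitializer vector, so the padding
; contributes nothing to the sum.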

define internal <8 x i16> @__add_varying_i16(<8 x i16>,
                                             <8 x i16>) nounwind readnone alwaysinline {
  %r = add <8 x i16> %0, %1
  ret <8 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
  %r = add i16 %0, %1
  ret i16 %r
}

define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
  reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}

define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
  reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
}

define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
  reduce8by4(float, @llvm.x86.sse.max.ps, @__max_uniform_float)
}

; helper function for reduce_add_int32
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
                                   <4 x i32> %v1) nounwind readnone alwaysinline {
  %v = add <4 x i32> %v0, %v1
  ret <4 x i32> %v
}

; helper function for reduce_add_int32
define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
  %v = add i32 %0, %1
  ret i32 %v
}

define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @__vec4_add_int32, @__add_int32)
}

define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pminsd, @__min_uniform_int32)
}

define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
}

define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
}

define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
}

define <4 x double> @__add_varying_double(<4 x double>,
                                          <4 x double>) nounwind readnone alwaysinline {
  %r = fadd <4 x double> %0, %1
  ret <4 x double> %r
}

define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
  %r = fadd double %0, %1
  ret double %r
}

define double @__reduce_add_double(<8 x double>) nounwind readnone {
  reduce8by4(double, @__add_varying_double, @__add_uniform_double)
}

define double @__reduce_min_double(<8 x double>) nounwind readnone {
  reduce8(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<8 x double>) nounwind readnone {
  reduce8(double, @__max_varying_double, @__max_uniform_double)
}

define <4 x i64> @__add_varying_int64(<4 x i64>,
                                      <4 x i64>) nounwind readnone alwaysinline {
  %r = add <4 x i64> %0, %1
  ret <4 x i64> %r
}

define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %r = add i64 %0, %1
  ret i64 %r
}

define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
  reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
}

define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

reduce_equal(8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts


masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float rounding

declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  round4to8(%0, 8)
}

define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round4to8(%0, 9)
}

define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round4to8(%0, 10)
}
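
; Recap of the rounding immediate used by these three functions: bits 1:0
; select the mode (0b00 nearest, 0b01 down, 0b10 up) and bit 3 (0b1000)
; suppresses precision exceptions, yielding the values 8, 9 and 10 above.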

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  round2to8double(%0, 8)
}

define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round2to8double(%0, 9)
}

define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round2to8double(%0, 10)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
  %a = shufflevector <8 x float> %0, <8 x float> undef,
                     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b = shufflevector <8 x float> %0, <8 x float> undef,
                     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ab = fadd <4 x float> %a, %b
  %hab = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %ab, <4 x float> %ab)
  %a_scalar = extractelement <4 x float> %hab, i32 0
  %b_scalar = extractelement <4 x float> %hab, i32 1
  %sum = fadd float %a_scalar, %b_scalar
  ret float %sum
}
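
; With %ab = <s0, s1, s2, s3> holding the lane-wise sums of the two halves,
; haddps(ab, ab) yields <s0+s1, s2+s3, s0+s1, s2+s3>, so adding elements 0
; and 1 gives the complete horizontal sum s0+s1+s2+s3.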


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)

masked_store_blend_8_16_by_8()

declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                             <4 x float>) nounwind readnone

define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
                                      <8 x i32> %mask) nounwind alwaysinline {
  ; do two 4-wide blends with blendvps
  %mask_as_float = bitcast <8 x i32> %mask to <8 x float>
  %mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
                          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %oldValue = load PTR_OP_ARGS(`<8 x i32>') %0, align 4
  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
  %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
  %old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %old_b = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
                         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %new_a = shufflevector <8 x float> %newAsFloat, <8 x float> undef,
                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %new_b = shufflevector <8 x float> %newAsFloat, <8 x float> undef,
                         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %blend_a = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old_a, <4 x float> %new_a,
                                                       <4 x float> %mask_a)
  %blend_b = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old_b, <4 x float> %new_b,
                                                       <4 x float> %mask_b)
  %blend = shufflevector <4 x float> %blend_a, <4 x float> %blend_b,
                         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
  ret void
}
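
; blendvps selects, for each 32-bit lane, the second (new) operand where the
; sign bit of the corresponding mask lane is set, and the first (old) operand
; otherwise; that is why the i32 mask can be used directly after a bitcast to
; float.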

define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
                                      <8 x i32> %mask) nounwind alwaysinline {
  ; implement this as 4 blends of <4 x i32>s, which are actually bitcast
  ; <2 x i64>s...

  %mask_as_float = bitcast <8 x i32> %mask to <8 x float>

  %old = load PTR_OP_ARGS(`<8 x i64>') %ptr, align 8

  ; set up the first two 64-bit values
  %old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
  %old01f = bitcast <2 x i64> %old01 to <4 x float>
  %new01 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
  %new01f = bitcast <2 x i64> %new01 to <4 x float>
  ; compute mask--note that the values mask0 and mask1 are doubled-up
  %mask01 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
                          <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  ; and blend the two values
  %result01f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old01f,
                                                         <4 x float> %new01f,
                                                         <4 x float> %mask01)
  %result01 = bitcast <4 x float> %result01f to <2 x i64>

  ; and again
  %old23 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
  %old23f = bitcast <2 x i64> %old23 to <4 x float>
  %new23 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
  %new23f = bitcast <2 x i64> %new23 to <4 x float>
  %mask23 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
                          <4 x i32> <i32 2, i32 2, i32 3, i32 3>
  %result23f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old23f,
                                                         <4 x float> %new23f,
                                                         <4 x float> %mask23)
  %result23 = bitcast <4 x float> %result23f to <2 x i64>

  %old45 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
  %old45f = bitcast <2 x i64> %old45 to <4 x float>
  %new45 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
  %new45f = bitcast <2 x i64> %new45 to <4 x float>
  %mask45 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
                          <4 x i32> <i32 4, i32 4, i32 5, i32 5>
  %result45f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old45f,
                                                         <4 x float> %new45f,
                                                         <4 x float> %mask45)
  %result45 = bitcast <4 x float> %result45f to <2 x i64>

  %old67 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
  %old67f = bitcast <2 x i64> %old67 to <4 x float>
  %new67 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
  %new67f = bitcast <2 x i64> %new67 to <4 x float>
  %mask67 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
                          <4 x i32> <i32 6, i32 6, i32 7, i32 7>
  %result67f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old67f,
                                                         <4 x float> %new67f,
                                                         <4 x float> %mask67)
  %result67 = bitcast <4 x float> %result67f to <2 x i64>

  %final0123 = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                             <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %final4567 = shufflevector <2 x i64> %result45, <2 x i64> %result67,
                             <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %final = shufflevector <4 x i64> %final0123, <4 x i64> %final4567,
                         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i64> %final, <8 x i64> * %ptr, align 8
  ret void
}

masked_store_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone

define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
  unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
  ret <8 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
  ret <8 x double> %ret
}

define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret <8 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define_avgs()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()
526
builtins/target-sse4.ll
Normal file
@@ -0,0 +1,526 @@
;; Copyright (c) 2010-2015, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;     * Redistributions of source code must retain the above copyright
;;       notice, this list of conditions and the following disclaimer.
;;
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;
;;     * Neither the name of Intel Corporation nor the names of its
;;       contributors may be used to endorse or promote products derived from
;;       this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Define common 4-wide stuff
define(`WIDTH',`4')
define(`MASK',`i32')
include(`util.m4')

stdlib_core()
packed_load_and_store()
scans()
int64minmax()
saturation_arithmetic()

include(`target-sse4-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; do one N-R iteration to improve precision
  ;   float iv = __rcp_v(v);
  ;   return iv * (2. - v * iv);
  %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
  %v_iv = fmul <4 x float> %0, %call
  %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
  %iv_mul = fmul <4 x float> %call, %two_minus
  ret <4 x float> %iv_mul
}
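
; Worked example of the refinement: for v = 4.0 and a hardware estimate
; iv = 0.2498, iv * (2 - v * iv) = 0.2498 * (2 - 0.9992) = 0.24999..., much
; closer to the exact reciprocal 0.25.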

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt

declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
  ; float is = __rsqrt_v(v);
  %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
  ; Newton-Raphson iteration to improve precision
  ;   return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul <4 x float> %v, %is
  %v_is_is = fmul <4 x float> %v_is, %is
  %three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
  %is_mul = fmul <4 x float> %is, %three_sub
  %half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
  ret <4 x float> %half_scale
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt

declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
  ret <4 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone

define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
  ret <4 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
  ret <4 x float> %call
}

define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
  ret <4 x float> %call
}

define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
  ret <4 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; roundpd, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  round2to4double(%0, 8)
}

define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round2to4double(%0, 9)
}

define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round2to4double(%0, 10)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @__max_varying_float(<4 x float>,
                                        <4 x float>) nounwind readonly alwaysinline {
  %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %call
}

define <4 x float> @__min_varying_float(<4 x float>,
                                        <4 x float>) nounwind readonly alwaysinline {
  %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
  ret <4 x double> %ret
}

define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret <4 x double> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max

define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}

define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max

define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}

define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; svml stuff

include(`svml.m4')
;; single precision
svml_declare(float,f4,4)
svml_define(float,f4,4,f)

;; double precision
svml_declare(double,2,2)
svml_define_x(double,2,2,d,4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; mask handling

declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
  %floatmask = bitcast <4 x i32> %0 to <4 x float>
  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
  %v64 = zext i32 %v to i64
  ret i64 %v64
}

define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
  %floatmask = bitcast <4 x i32> %0 to <4 x float>
  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
  %cmp = icmp ne i32 %v, 0
  ret i1 %cmp
}

define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
  %floatmask = bitcast <4 x i32> %0 to <4 x float>
  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
  %cmp = icmp eq i32 %v, 15
  ret i1 %cmp
}

define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
  %floatmask = bitcast <4 x i32> %0 to <4 x float>
  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
  %cmp = icmp eq i32 %v, 0
  ret i1 %cmp
}
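
; With a 4-wide mask a single movmsk covers every lane, so no shift/OR
; combining is needed; the 15 in __all is the fully-set 4-bit lane mask
; (0b1111).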

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal ops / reductions

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
  %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
  %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
  %scalar = extractelement <4 x float> %v2, i32 0
  ret float %scalar
}
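
; For %0 = <x0, x1, x2, x3>, the first haddps produces
; <x0+x1, x2+x3, x0+x1, x2+x3> and the second produces the full sum
; x0+x1+x2+x3 in every lane, so extracting element 0 yields the reduction.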

define float @__reduce_min_float(<4 x float>) nounwind readnone alwaysinline {
  reduce4(float, @__min_varying_float, @__min_uniform_float)
}

define float @__reduce_max_float(<4 x float>) nounwind readnone alwaysinline {
  reduce4(float, @__max_varying_float, @__max_uniform_float)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops

define double @__reduce_add_double(<4 x double>) nounwind readnone alwaysinline {
  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
                      <2 x i32> <i32 0, i32 1>
  %v1 = shufflevector <4 x double> %0, <4 x double> undef,
                      <2 x i32> <i32 2, i32 3>
  %sum = fadd <2 x double> %v0, %v1
  %e0 = extractelement <2 x double> %sum, i32 0
  %e1 = extractelement <2 x double> %sum, i32 1
  %m = fadd double %e0, %e1
  ret double %m
}

define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
  reduce4(double, @__min_varying_double, @__min_uniform_double)
}

define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
  reduce4(double, @__max_varying_double, @__max_uniform_double)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops

declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
  %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
                  i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
                                              <16 x i8> zeroinitializer)
  %r0 = extractelement <2 x i64> %rv, i32 0
  %r1 = extractelement <2 x i64> %rv, i32 1
  %r = add i64 %r0, %r1
  %r16 = trunc i64 %r to i16
  ret i16 %r16
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops

define internal <4 x i16> @__add_varying_i16(<4 x i16>,
                                             <4 x i16>) nounwind readnone alwaysinline {
  %r = add <4 x i16> %0, %1
  ret <4 x i16> %r
}

define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
  %r = add i16 %0, %1
  ret i16 %r
}

define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
  reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops

;; reduction functions
define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone alwaysinline {
  %v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
                      <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %m1 = add <4 x i32> %v1, %v
  %m1a = extractelement <4 x i32> %m1, i32 0
  %m1b = extractelement <4 x i32> %m1, i32 1
  %sum = add i32 %m1a, %m1b
  ret i32 %sum
}
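
; For %v = <a, b, c, d>, %v1 is <c, d, undef, undef>, so %m1 holds
; <a+c, b+d, ...> and the final scalar sum is (a+c) + (b+d) = a+b+c+d.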

define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}

define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}

define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}

define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops

;; reduction functions
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
  %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
                      <2 x i32> <i32 0, i32 1>
  %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
                      <2 x i32> <i32 2, i32 3>
  %sum = add <2 x i64> %v0, %v1
  %e0 = extractelement <2 x i64> %sum, i32 0
  %e1 = extractelement <2 x i64> %sum, i32 1
  %m = add i64 %e0, %e1
  ret i64 %m
}

define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}

define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}

define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}

reduce_equal(4)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts


masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)

masked_store_float_double()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store blend

masked_store_blend_8_16_by_4()

declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                             <4 x float>) nounwind readnone


define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i32> %mask) nounwind alwaysinline {
  %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
  %oldValue = load PTR_OP_ARGS(`<4 x i32>') %0, align 4
  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
                                                     <4 x float> %newAsFloat,
                                                     <4 x float> %mask_as_float)
  %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
  ret void
}


define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                      <4 x i32> %i32mask) nounwind alwaysinline {
  %oldValue = load PTR_OP_ARGS(`<4 x i64>') %ptr, align 8
  %mask = bitcast <4 x i32> %i32mask to <4 x float>

  ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
  ; are actually bitcast <2 x i64> values
  ;
  ; set up the first two 64-bit values
  %old01 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
                         <2 x i32> <i32 0, i32 1>
  %old01f = bitcast <2 x i64> %old01 to <4 x float>
  %new01 = shufflevector <4 x i64> %new, <4 x i64> undef,
                         <2 x i32> <i32 0, i32 1>
  %new01f = bitcast <2 x i64> %new01 to <4 x float>
  ; compute mask--note that the indices 0 and 1 are doubled-up
  %mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
                          <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  ; and blend the two values
  %result01f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old01f,
                                                         <4 x float> %new01f,
                                                         <4 x float> %mask01)
  %result01 = bitcast <4 x float> %result01f to <2 x i64>

  ; and again
  %old23 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
                         <2 x i32> <i32 2, i32 3>
  %old23f = bitcast <2 x i64> %old23 to <4 x float>
  %new23 = shufflevector <4 x i64> %new, <4 x i64> undef,
                         <2 x i32> <i32 2, i32 3>
  %new23f = bitcast <2 x i64> %new23 to <4 x float>
  ; compute mask--note that the indices 2 and 3 are doubled-up
  %mask23 = shufflevector <4 x float> %mask, <4 x float> undef,
                          <4 x i32> <i32 2, i32 2, i32 3, i32 3>
  ; and blend the two values
  %result23f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old23f,
                                                         <4 x float> %new23f,
                                                         <4 x float> %mask23)
  %result23 = bitcast <4 x float> %result23f to <2 x i64>

  ; reconstruct the final <4 x i64> vector
  %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %final, <4 x i64> * %ptr, align 8
  ret void
}
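
; Each 64-bit lane is blended as a pair of 32-bit lanes, so the mask element
; for a given i64 lane is duplicated (indices <0,0,1,1> and <2,2,3,3> above)
; to apply the same predicate to both 32-bit halves of the value.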

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

; define these with the macros from stdlib.m4

gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins

define_avgs()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported

rsqrtd_decl()
rcpd_decl()

transcendetals_decl()
trigonometry_decl()
3511
builtins/util-nvptx.m4
Normal file
File diff suppressed because it is too large
5190
builtins/util.m4
Normal file
File diff suppressed because it is too large
5514
cbackend.cpp
Normal file
File diff suppressed because it is too large
102
check_env.py
Executable file
@@ -0,0 +1,102 @@
#!/usr/bin/python
#
# Copyright (c) 2013, Intel Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
#     * Neither the name of Intel Corporation nor the names of its
#       contributors may be used to endorse or promote products derived from
#       this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# // Author: Filippov Ilia

import common
import sys
import os
import string
print_debug = common.print_debug
error = common.error
take_lines = common.take_lines

exists = [False, False, False, False, False, False, False, False]
names = ["m4", "bison", "flex", "sde", "ispc", "clang", "gcc", "icc"]

PATH_dir = string.split(os.getenv("PATH"), os.pathsep)
for counter in PATH_dir:
    for i in range(0,8):
        if os.path.exists(counter + os.sep + names[i]):
            exists[i] = True

print_debug("=== in PATH: ===\n", False, "")
|
||||
print_debug("Tools:\n", False, "")
|
||||
for i in range(0,3):
|
||||
if exists[i]:
|
||||
print_debug(take_lines(names[i] + " --version", "first"), False, "")
|
||||
else:
|
||||
error("you don't have " + names[i], 0)
|
||||
if exists[0] and exists[1] and exists[2]:
|
||||
if common.check_tools(2):
|
||||
print_debug("Tools' versions are ok\n", False, "")
|
||||
print_debug("\nSDE:\n", False, "")
|
||||
if exists[3]:
|
||||
print_debug(take_lines(names[3] + " --version", "first"), False, "")
|
||||
else:
|
||||
error("you don't have " + names[3], 2)
|
||||
print_debug("\nISPC:\n", False, "")
|
||||
if exists[4]:
|
||||
print_debug(take_lines(names[4] + " --version", "first"), False, "")
|
||||
else:
|
||||
error("you don't have " + names[4], 2)
|
||||
print_debug("\nC/C++ compilers:\n", False, "")
|
||||
for i in range(5,8):
|
||||
if exists[i]:
|
||||
print_debug(take_lines(names[i] + " --version", "first"), False, "")
|
||||
else:
|
||||
error("you don't have " + names[i], 2)
|
||||
|
||||
print_debug("\n=== in ISPC specific environment variables: ===\n", False, "")
|
||||
if os.environ.get("LLVM_HOME") == None:
|
||||
error("you have no LLVM_HOME", 2)
|
||||
else:
|
||||
print_debug("Your LLVM_HOME:" + os.environ.get("LLVM_HOME") + "\n", False, "")
|
||||
if os.environ.get("ISPC_HOME") == None:
|
||||
error("you have no ISPC_HOME", 2)
|
||||
else:
|
||||
print_debug("Your ISPC_HOME:" + os.environ.get("ISPC_HOME") + "\n", False, "")
|
||||
if os.path.exists(os.environ.get("ISPC_HOME") + os.sep + "ispc"):
|
||||
print_debug("You have ISPC in your ISPC_HOME: " +
|
||||
take_lines(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version", "first"), False, "")
|
||||
else:
|
||||
error("you don't have ISPC in your ISPC_HOME", 2)
|
||||
if os.environ.get("SDE_HOME") == None:
|
||||
error("You have no SDE_HOME", 2)
|
||||
else:
|
||||
print_debug("Your SDE_HOME:" + os.environ.get("SDE_HOME") + "\n", False, "")
|
||||
if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"):
|
||||
print_debug("You have sde in your SDE_HOME: " +
|
||||
take_lines(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version", "first"), False, "")
|
||||
else:
|
||||
error("you don't have any SDE in your ISPC_HOME", 2)
|
||||
170
check_isa.cpp
Normal file
@@ -0,0 +1,170 @@
/*
  Copyright (c) 2013-2015, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.


   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

///////////////////////////////////////////////////////////////////////////////
//                                                                           //
//  This file is a standalone program, which detects the best supported ISA. //
//                                                                           //
///////////////////////////////////////////////////////////////////////////////


#include <stdio.h>

#if defined(_WIN32) || defined(_WIN64)
#define ISPC_IS_WINDOWS
#include <intrin.h>
#endif

#if !defined (__arm__)
#if !defined(ISPC_IS_WINDOWS)
static void __cpuid(int info[4], int infoType) {
    __asm__ __volatile__ ("cpuid"
                          : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
                          : "0" (infoType));
}

/* Save %ebx in case it's the PIC register */
static void __cpuidex(int info[4], int level, int count) {
    __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
                          "cpuid\n\t"
                          "xchg{l}\t{%%}ebx, %1\n\t"
                          : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
                          : "0" (level), "2" (count));
}
#endif // !ISPC_IS_WINDOWS

static bool __os_has_avx_support() {
#if defined(ISPC_IS_WINDOWS)
    // Check if the OS will save the YMM registers
    unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
    return (xcrFeatureMask & 6) == 6;
#else // !defined(ISPC_IS_WINDOWS)
    // Check xgetbv; this uses a .byte sequence instead of the instruction
    // directly because older assemblers do not include support for xgetbv and
    // there is no easy way to conditionally compile based on the assembler used.
    int rEAX, rEDX;
    __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
    return (rEAX & 6) == 6;
#endif // !defined(ISPC_IS_WINDOWS)
}
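
// In the XCR0 mask returned by xgetbv, bit 1 covers SSE/XMM state and bit 2
// AVX/YMM state, so a value with 0x6 set means the OS context-switches the
// registers that AVX code needs (and 0xE6 below additionally covers the
// opmask and ZMM state used by AVX512).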

static bool __os_has_avx512_support() {
#if defined(ISPC_IS_WINDOWS)
    // Check if the OS saves the XMM, YMM and ZMM registers, i.e. it supports AVX2 and AVX512.
    // See section 2.1 of software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
    unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
    return (xcrFeatureMask & 0xE6) == 0xE6;
#else // !defined(ISPC_IS_WINDOWS)
    // Check xgetbv; this uses a .byte sequence instead of the instruction
    // directly because older assemblers do not include support for xgetbv and
    // there is no easy way to conditionally compile based on the assembler used.
    int rEAX, rEDX;
    __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
    return (rEAX & 0xE6) == 0xE6;
#endif // !defined(ISPC_IS_WINDOWS)
}
#endif // !__arm__


static const char *
lGetSystemISA() {
#ifdef __arm__
    return "ARM NEON";
#else
    int info[4];
    __cpuid(info, 1);

    int info2[4];
    // Call cpuid with eax=7, ecx=0
    __cpuidex(info2, 7, 0);

    if ((info[2] & (1 << 27)) != 0 &&  // OSXSAVE
        (info2[1] & (1 << 5)) != 0 &&  // AVX2
        (info2[1] & (1 << 16)) != 0 && // AVX512 F
        __os_has_avx512_support()) {
        // We need to verify that AVX2 is also available,
        // as well as AVX512, because our targets are supposed
        // to use both.

        if ((info2[1] & (1 << 17)) != 0 && // AVX512 DQ
            (info2[1] & (1 << 28)) != 0 && // AVX512 CDI
            (info2[1] & (1 << 30)) != 0 && // AVX512 BW
            (info2[1] & (1 << 31)) != 0) { // AVX512 VL
            return "SKX";
        }
        else if ((info2[1] & (1 << 26)) != 0 && // AVX512 PF
                 (info2[1] & (1 << 27)) != 0 && // AVX512 ER
                 (info2[1] & (1 << 28)) != 0) { // AVX512 CDI
            return "KNL";
        }
        // If it's an unknown AVX512 target, fall through and use AVX2
        // or whatever is available in the machine.
    }

    if ((info[2] & (1 << 27)) != 0 && // OSXSAVE
        (info[2] & (1 << 28)) != 0 && // AVX
        __os_has_avx_support()) {
        // AVX1 for sure....
        // Ivy Bridge?
        if ((info[2] & (1 << 29)) != 0 && // F16C
            (info[2] & (1 << 30)) != 0) { // RDRAND
            // So far, so good.  AVX2?
            if ((info2[1] & (1 << 5)) != 0) {
                return "AVX2 (codename Haswell)";
            }
            else {
                return "AVX1.1 (codename Ivy Bridge)";
            }
        }
        // Regular AVX
        return "AVX (codename Sandy Bridge)";
    }
    else if ((info[2] & (1 << 19)) != 0) {
        return "SSE4";
    }
    else if ((info[3] & (1 << 26)) != 0) {
        return "SSE2";
    }
    else {
        return "Error";
    }
#endif
}

int main () {
    const char* isa = lGetSystemISA();
    printf("ISA: %s\n", isa);

    return 0;
}
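
// A hypothetical run on an AVX2-capable machine would print:
//   ISA: AVX2 (codename Haswell)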
502
common.py
Executable file
@@ -0,0 +1,502 @@
#!/usr/bin/python
#
# Copyright (c) 2013, Intel Corporation
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
#     * Neither the name of Intel Corporation nor the names of its
#       contributors may be used to endorse or promote products derived from
#       this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# // Author: Filippov Ilia, Anton Mitrokhin, Vsevolod Livinskiy
import sys
import os
import errno
import shutil

# generic empty class
class EmptyClass(object): pass

# load/save almost any object to a file (useful for reproducing bugs)
def dump(fname, obj):
    import pickle
    with open(fname, 'w') as fp:
        pickle.dump(obj, fp)

def undump(fname):
    import pickle
    with open(fname, 'r') as fp:
        obj = pickle.load(fp)
    return obj
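
# Example usage (hypothetical file name):
#   dump("repro.pickle", results)       # snapshot an object to disk
#   results = undump("repro.pickle")    # restore it in a later run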

# retrieve the host name
def get_host_name():
    import socket
    return socket.gethostname()

def write_to_file(filename, line):
    f = open(filename, 'a')
    f.writelines(line)
    f.close()

# remove file if it exists
def remove_if_exists(filename):
    if os.path.exists(filename):
        if os.path.isdir(filename):
            shutil.rmtree(filename)
        else:
            os.remove(filename)

def make_sure_dir_exists(path):
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

# capture the output a command prints (e.g. the version string printed
# after a --version flag)
def take_lines(command, which):
    os.system(command + " > " + "temp_detect_version")
    version = open("temp_detect_version")
    if which == "first":
        answer = version.readline()
    if which == "all":
        answer = version.readlines()
    version.close()
    remove_if_exists("temp_detect_version")
    return answer

# print versions of compilers
def print_version(ispc_test, ispc_ref, ref_compiler, s, perf_log, is_windows):
    print_debug("\nUsing test compiler: " + take_lines(ispc_test + " --version", "first"), s, perf_log)
    if ispc_ref != "":
        print_debug("Using ref compiler: " + take_lines(ispc_ref + " --version", "first"), s, perf_log)
    if is_windows == False:
        temp1 = take_lines(ref_compiler + " --version", "first")
    else:
        os.system(ref_compiler + " 2>&1" + " 2> temp_detect_version > temp_detect_version1")
        version = open("temp_detect_version")
        temp1 = version.readline()
        version.close()
        remove_if_exists("temp_detect_version")
        remove_if_exists("temp_detect_version1")
    print_debug("Using C/C++ compiler: " + temp1 + "\n", s, perf_log)

# print all script output other than errors
def print_debug(line, silent, filename):
    if silent == False:
        sys.stdout.write(line)
        sys.stdout.flush()
        if os.environ.get("ISPC_HOME") != None:
            if os.path.exists(os.environ.get("ISPC_HOME")):
                write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line)
    if filename != "":
        write_to_file(filename, line)

# print errors from scripts
# type 1 for a fatal error in the environment
# type 2 for a warning
# type 0 for an error of the compiler or test which isn't the goal of the script
def error(line, error_type):
    line = line + "\n"
    if error_type == 1:
        sys.stderr.write("Fatal error: " + line)
        sys.exit(1)
    if error_type == 2:
        sys.stderr.write("Warning: " + line)
    if error_type == 0:
        print_debug("FIND ERROR: " + line, False, "")

# check that m4, bison and flex are installed and recent enough;
# returns 1 if all tools are fine and 0 otherwise, reporting problems
# through error() with the given error type 'm'
def check_tools(m):
    input_tools = [[[1,4], "m4 --version", "bad m4 version"],
                   [[2,4], "bison --version", "bad bison version"],
                   [[2,5], "flex --version", "bad flex version"]]
    ret = 1
    for t in range(0,len(input_tools)):
        # split the first line of the tool's version output into tokens
        t1 = ((take_lines(input_tools[t][1], "first"))[:-1].split(" "))
        for i in range(0,len(t1)):
            t11 = t1[i].split(".")
            # a token is a candidate version number only if all of its
            # dot-separated components are digits
            f = True
            for j in range(0,len(t11)):
                if not t11[j].isdigit():
                    f = False
            if f == True:
                # compare the components against the required minimum version
                for j in range(0,len(t11)):
                    if j < len(input_tools[t][0]):
                        if int(t11[j]) < input_tools[t][0][j]:
                            error(input_tools[t][2], m)
                            ret = 0
                            break
                        if int(t11[j]) > input_tools[t][0][j]:
                            break
    return ret
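
# Illustrative usage sketch (not part of the original script): require
# m4 >= 1.4, bison >= 2.4 and flex >= 2.5, reporting any stale tool as a
# warning (error type 2) and then failing hard if something was too old.
#
#   if check_tools(2) == 0:
#       error("required build tools are too old", 1)
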
# regression testing functionality

class TestResult(object):
    """
    This class stores basically two integers which stand for the result
    of the test: (runfail{0/1}, compfail{0/1}); other values are
    deemed invalid. The __cmp__ function of this class is used to
    define what a test regression actually is.
    """
    def __init__(self, runfailed, compfailed):
        self.runfailed, self.compfailed = (runfailed, compfailed)

    def __cmp__(self, other):
        if not isinstance(other, TestResult):
            raise RuntimeError("Wrong type for comparison")
        if self.runfailed == other.runfailed and \
           self.compfailed == other.compfailed:
            return 0
        elif self.compfailed > other.compfailed:
            return 1
        elif self.runfailed > other.runfailed and \
             self.compfailed == other.compfailed:
            return 1
        else:
            return -1

    def __repr__(self):
        if (self.runfailed < 0 or self.compfailed < 0):
            return "(Undefined)"
        return "(r%d c%d)" % (self.runfailed, self.compfailed)
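
# Illustrative sketch of the comparison semantics (not part of the original
# script): a "greater" TestResult is a worse one, so a regression between two
# runs of the same test case is simply old_result < new_result.
#
#   good = TestResult(0, 0)      # ran and compiled fine
#   bad  = TestResult(1, 0)      # same compfails, but a new runfail
#   assert cmp(good, bad) < 0    # Python 2 cmp() dispatches to __cmp__
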
class TestCase(object):
    """
    A TestCase() is a combination of parameters the test was run with:
    the architecture (x86, x86-64 ...), compiler optimization (-O0, -O2 ...)
    and target (sse, avx ...). We also store the result of the test here.
    """
    def __init__(self, arch, opt, target):
        self.arch, self.opt, self.target = (arch, opt, target)
        self.result = TestResult(-1, -1)

    def __repr__(self):
        string = "%s %s %s: " % (self.arch, self.opt, self.target)
        string = string + repr(self.result)
        return string

    def __hash__(self):
        return hash(self.arch + self.opt + self.target)

    def __ne__(self, other):
        if isinstance(other, TestCase):
            return hash(self) != hash(other)
        raise RuntimeError("Wrong type for comparison")

    def __eq__(self, other):
        if isinstance(other, TestCase):
            return not self.__ne__(other)
        raise RuntimeError("Wrong type for comparison")

class Test(object):
    """
    Test() stores all TestCase() objects for a given test file name,
    i.e. all archs/opts/targets and the corresponding testing results.
    """
    def __init__(self, name):
        self.name = name
        self.test_cases = []

    def add_result(self, test_case):
        if test_case in self.test_cases:
            raise RuntimeError("This test case is already in the list: " + repr(test_case))
        self.test_cases.append(test_case)

    def __repr__(self):
        string = self.name + '\n'
        string = string.rjust(20)
        for test_case in self.test_cases:
            string += repr(test_case).rjust(60) + '\n'
        return string

    def __hash__(self):
        return hash(self.name)

    def __ne__(self, other):
        if isinstance(other, Test):
            return hash(self) != hash(other)
        return NotImplemented

    def __eq__(self, other):
        if isinstance(other, Test):
            return not self.__ne__(other)
        return NotImplemented

class RegressionInfo(object):
    """
    Service class which provides some statistics on a given regression.
    The regressed test names and cases are given in the form of Test()
    objects with empty (-1, -1) results.
    """
    def __init__(self, revision_old, revision_new, tests):
        self.revision_old, self.revision_new = (revision_old, revision_new)
        self.tests = tests
        self.archfailes = {}
        self.optfails = {}
        self.targetfails = {}
        self.testfails = {}
        self.archs = []
        self.opts = []
        self.targets = []

        for test in tests:
            for test_case in test.test_cases:
                self.inc_dictionary(self.testfails, test.name)
                self.inc_dictionary(self.archfailes, test_case.arch)
                self.inc_dictionary(self.optfails, test_case.opt)
                self.inc_dictionary(self.targetfails, test_case.target)

        self.archs = self.archfailes.keys()
        self.opts = self.optfails.keys()
        self.targets = self.targetfails.keys()

    def inc_dictionary(self, dictionary, key):
        if key not in dictionary:
            dictionary[key] = 0
        dictionary[key] += 1

    def __repr__(self):
        string = "Regression of LLVM revision %s in comparison to %s\n" % (self.revision_new, self.revision_old)
        string += repr(self.tests) + '\n'
        string += str(self.testfails) + '\n'
        string += str(self.archfailes) + '\n'
        string += str(self.optfails) + '\n'
        string += str(self.targetfails) + '\n'
        return string

class TestTable(object):
    """
    The table which stores a tuple of Test() objects (one per revision) and has some
    convenience methods for dealing with them.
    """
    def __init__(self):
        """ This dictionary contains {rev: [test1, test2, ...], ...}, where 'rev' is a string
        (revision name) and 'test#' is a Test() object instance """
        self.table = {}

    def add_result(self, revision_name, test_name, arch, opt, target, runfailed, compfailed):
        revision_name = str(revision_name)
        if revision_name not in self.table:
            self.table[revision_name] = []

        test_case = TestCase(arch, opt, target)
        test_case.result = TestResult(runfailed, compfailed)

        # if this test name is already tracked for this revision, append the case to it
        for test in self.table[revision_name]:
            if test.name == test_name:
                test.add_result(test_case)
                return

        test = Test(test_name)
        test.add_result(test_case)
        self.table[revision_name].append(test)

    def test_intersection(self, test1, test2):
        """ Return test cases common for test1 and test2. If the test names are different,
        then there is nothing in common """
        if test1.name != test2.name:
            return []
        return list(set(test1.test_cases) & set(test2.test_cases))

    def test_regression(self, test1, test2):
        """ Return a list of empty (i.e. with undefined results) TestCase() objects
        corresponding to regressions in test2 compared to test1 """
        if test1.name != test2.name:
            return []

        regressed = []
        for tc1 in test1.test_cases:
            for tc2 in test2.test_cases:
                # the test cases are equal (same arch, opt and target) but tc2
                # has more runfails or compfails
                if tc1 == tc2 and tc1.result < tc2.result:
                    regressed.append(TestCase(tc1.arch, tc1.opt, tc1.target))
        return regressed

    def regression(self, revision_old, revision_new):
        """ Return a RegressionInfo() object describing the Test() objects (containing
        TestCase() objects) which show a regression along the given revisions """
        revision_old, revision_new = (str(revision_old), str(revision_new))
        if revision_new not in self.table:
            raise RuntimeError("This revision is not in the database: " + str(revision_new) + " (" + str(self.table.keys()) + ")")

        if revision_old not in self.table:
            raise RuntimeError("This revision is not in the database: " + str(revision_old) + " (" + str(self.table.keys()) + ")")

        regressed = []
        for test_old in self.table[revision_old]:
            for test_new in self.table[revision_new]:
                tr = self.test_regression(test_old, test_new)
                if len(tr) == 0:
                    continue
                test = Test(test_new.name)
                for test_case in tr:
                    test.add_result(test_case)
                regressed.append(test)
        return RegressionInfo(revision_old, revision_new, regressed)

    def __repr__(self):
        string = ""
        for rev in self.table.keys():
            string += "[" + rev + "]:\n"
            for test in self.table[rev]:
                string += repr(test) + '\n'
        return string
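
# Illustrative usage sketch (not part of the original script; the revision
# names and test data are made up). foo.ispc passes at revision "100" but
# runfails at "101", so regression() reports it:
#
#   tt = TestTable()
#   tt.add_result("100", "foo.ispc", "x86", "-O2", "sse4", 0, 0)
#   tt.add_result("101", "foo.ispc", "x86", "-O2", "sse4", 1, 0)
#   info = tt.regression("100", "101")
#   print_debug(repr(info), False, "")
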
class RevisionInfo(object):
    """
    This class is intended to store some relevant information about the current LLVM revision.
    """
    def __init__(self, hostname, revision):
        self.hostname, self.revision = hostname, revision
        self.archs = []
        self.opts = []
        self.targets = []
        self.succeed = 0
        self.runfailed = 0
        self.compfailed = 0
        self.skipped = 0
        self.testall = 0
        self.regressions = {}

    def register_test(self, arch, opt, target, succeed, runfailed, compfailed, skipped):
        if arch not in self.archs:
            self.archs.append(arch)
        if opt not in self.opts:
            self.opts.append(opt)
        if target not in self.targets:
            self.targets.append(target)
        self.runfailed += runfailed
        self.compfailed += compfailed
        self.skipped += skipped
        self.succeed += succeed

    def add_regression(self, revision, regression_info):
        """ The input is intended to come from 'TestTable.regression(..)': 'regression_info' is a
        RegressionInfo() object (regression.py) and 'revision' is the tested (not current)
        LLVM revision name """
        if revision == self.revision:
            raise RuntimeError("No regression can be found along the same LLVM revision!")

        if revision in self.regressions:
            raise RuntimeError("This revision regression info is already in self.regressions!")

        self.regressions[revision] = regression_info

    def __repr__(self):
        string = "%s: LLVM(%s)\n" % (self.hostname, self.revision)
        string += "archs  : %s\n" % (str(self.archs))
        string += "opts   : %s\n" % (str(self.opts))
        string += "targets: %s\n" % (str(self.targets))
        string += "runfails:  %d/%d\n" % (self.runfailed, self.testall)
        string += "compfails: %d/%d\n" % (self.compfailed, self.testall)
        string += "skipped:   %d/%d\n" % (self.skipped, self.testall)
        string += "succeed:   %d/%d\n" % (self.succeed, self.testall)
        return string

class ExecutionStateGatherer(object):
    def __init__(self):
        self.hostname = self.get_host_name()
        self.revision = ""
        self.rinf = []
        self.tt = TestTable()
        self.switch_revision("undefined")

    def switch_revision(self, revision):
        self.revision = revision
        self.rinf.append(RevisionInfo(self.hostname, self.revision))

    def current_rinf(self):
        if len(self.rinf) == 0:
            raise RuntimeError("self.rinf is empty. Apparently you've never invoked switch_revision")
        return self.rinf[-1]

    def add_to_tt(self, test_name, arch, opt, target, runfailed, compfailed):
        if len(self.rinf) == 0:
            raise RuntimeError("self.rinf is empty. Apparently you've never invoked switch_revision")
        self.tt.add_result(self.revision, test_name, arch, opt, target, runfailed, compfailed)

    def add_to_rinf(self, arch, opt, target, succeed, runfailed, compfailed, skipped):
        self.current_rinf().register_test(arch, opt, target, succeed, runfailed, compfailed, skipped)

    def add_to_rinf_testall(self, tried_to_test):
        self.current_rinf().testall += tried_to_test

    def load_from_tt(self, tt):
        # TODO: fill in the self.rinf field!
        self.tt = tt
        revisions = tt.table.keys()
        self.revision = ""
        if len(revisions) != 0:
            self.revision = revisions[0]
        print "ESG: loaded from 'TestTable()' with revisions", revisions

    def dump(self, fname, obj):
        import pickle
        with open(fname, 'w') as fp:
            pickle.dump(obj, fp)

    def undump(self, fname):
        import pickle
        with open(fname, 'r') as fp:
            obj = pickle.load(fp)
        return obj

    def get_host_name(self):
        import socket
        return socket.gethostname()

    def __repr__(self):
        string = "Hostname: %s\n" % (self.hostname)
        string += "Current LLVM Revision = %s\n\n" % (self.revision)
        for rev_info in self.rinf:
            string += repr(rev_info) + '\n'
        return string

# this class instance is intended to gather and store all information
# regarding the testing process.
ex_state = ExecutionStateGatherer()
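
# Illustrative flow sketch (not part of the original script; the revision
# and test names are made up): a test driver switches to an LLVM revision,
# records per-test results, and can pickle the whole state for later analysis.
#
#   ex_state.switch_revision("245126")
#   ex_state.add_to_tt("foo.ispc", "x86-64", "-O2", "avx2", 0, 0)
#   ex_state.add_to_rinf("x86-64", "-O2", "avx2", 1, 0, 0, 0)
#   ex_state.add_to_rinf_testall(1)
#   ex_state.dump("ispc_state.dump", ex_state)
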
contrib/ispc.vim

@@ -1,7 +1,7 @@
 " Vim syntax file
 " Language: ISPC
 " Maintainer: Andreas Wendleder <andreas.wendleder@gmail.com>
-" Last Change: 2011 Aug 3
+" Last Change: 2016 May 04
 
 " Quit when a syntax file was already loaded
 if exists("b:current_syntax")
@@ -13,11 +13,19 @@ runtime! syntax/c.vim
 unlet b:current_syntax
 
 " New keywords
-syn keyword ispcStatement cbreak ccontinue creturn launch print reference soa sync task
+syn keyword ispcStatement cbreak ccontinue creturn launch print reference soa sync
 syn keyword ispcConditional cif
-syn keyword ispcRepeat cdo cfor cwhile
-syn keyword ispcBuiltin programCount programIndex
-syn keyword ispcType export int8 int16 int32 int64
+syn keyword ispcRepeat cdo cfor cwhile foreach foreach_tiled foreach_unique foreach_active
+syn keyword ispcBuiltin programCount programIndex taskCount taskCount0 taskCount1 taskCount3 taskIndex taskIndex0 taskIndex1 taskIndex2
+syn keyword ispcType export uniform varying int8 int16 int32 int64 task new delete
+syn keyword ispcOperator operator
+
+"double precision floating point number, with dot, optional exponent
+syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>"
+"double precision floating point number, starting with dot, optional exponent
+syn match cFloat display contained ".\d*d[-+]\=\d*\>"
+"double precision floating point number, without dot, with exponent
+syn match cFloat display contained "\d\+d[-+]\=\d\+\>"
 
 " Default highlighting
 command -nargs=+ HiLink hi def link <args>
@@ -26,6 +34,7 @@ HiLink ispcConditional Conditional
 HiLink ispcRepeat Repeat
 HiLink ispcBuiltin Statement
 HiLink ispcType Type
+HiLink ispcOperator Operator
 delcommand HiLink
 
 let b:current_syntax = "ispc"
8	contrib/ispc.vim.README	Normal file
@@ -0,0 +1,8 @@
+To install vim syntax highlighting for ispc files:
+
+1) Copy ispc.vim into ~/.vim/syntax/ispc.vim (create if necessary)
+2) Create a filetype for ispc files to correspond to that syntax file
+To do this, create and append the following line to ~/.vim/ftdetect/ispc.vim
+
+au BufRead,BufNewFile *.ispc set filetype=ispc
+
482	ctx.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2015, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -28,23 +28,32 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
 /** @file ctx.h
-    @brief Declaration of the FunctionEmitContext class
+    @brief %Declaration of the FunctionEmitContext class
 */
 
 #ifndef ISPC_CTX_H
 #define ISPC_CTX_H 1
 
 #include "ispc.h"
-#include <llvm/InstrTypes.h>
-#include <llvm/Instructions.h>
-#ifndef LLVM_2_8
-#include <llvm/Analysis/DIBuilder.h>
+#include <map>
+#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
+#include <llvm/InstrTypes.h>
+#include <llvm/Instructions.h>
+#else // 3.3+
+#include <llvm/IR/InstrTypes.h>
+#include <llvm/IR/Instructions.h>
+#endif
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_4
+#include <llvm/DebugInfo.h>
+#include <llvm/DIBuilder.h>
+#else // 3.5+
+#include <llvm/IR/DebugInfo.h>
+#include <llvm/IR/DIBuilder.h>
+#endif
-#include <llvm/Analysis/DebugInfo.h>
 
 struct CFInfo;
 
@@ -60,23 +69,27 @@ class FunctionEmitContext {
 public:
     /** Create a new FunctionEmitContext.
         @param function The Function object representing the function
-        @param sym Symbol that corresponds to the function
+        @param funSym Symbol that corresponds to the function
        @param llvmFunction LLVM function in the current module that corresponds
                            to the function
        @param firstStmtPos Source file position of the first statement in the
                            function
     */
-    FunctionEmitContext(Function *function, Symbol *funSym,
+    FunctionEmitContext(Function *function, Symbol *funSym,
                        llvm::Function *llvmFunction,
                        SourcePos firstStmtPos);
    ~FunctionEmitContext();
 
+    /** Returns the Function * corresponding to the function that we're
+        currently generating code for. */
+    const Function *GetFunction() const;
+
     /** @name Current basic block management
        @{
     */
-    /** Returns the current basic block pointer */
+    /** Returns the current basic block pointer */
     llvm::BasicBlock *GetCurrentBasicBlock();
 
     /** Set the given llvm::BasicBlock to be the basic block to emit
        forthcoming instructions into. */
     void SetCurrentBasicBlock(llvm::BasicBlock *bblock);
@@ -84,22 +97,33 @@ public:
     /** @name Mask management
        @{
     */
-    /** Returns the current mask value */
-    llvm::Value *GetMask();
+    /** Returns the mask value at entry to the current function. */
+    llvm::Value *GetFunctionMask();
 
-    void SetMaskPointer(llvm::Value *p);
+    /** Returns the mask value corresponding to "varying" control flow
+        within the current function. (i.e. this doesn't include the effect
+        of the mask at function entry. */
+    llvm::Value *GetInternalMask();
+
+    /** Returns the complete current mask value--i.e. the logical AND of
+        the function entry mask and the internal mask. */
+    llvm::Value *GetFullMask();
+
+    /** Returns a pointer to storage in memory that stores the current full
+        mask. */
+    llvm::Value *GetFullMaskPointer();
 
     /** Provides the value of the mask at function entry */
-    void SetEntryMask(llvm::Value *val);
+    void SetFunctionMask(llvm::Value *val);
 
-    /** Sets the mask to a new value */
-    void SetMask(llvm::Value *val);
+    /** Sets the internal mask to a new value */
+    void SetInternalMask(llvm::Value *val);
 
-    /** Sets the mask to (oldMask & val) */
-    void MaskAnd(llvm::Value *oldMask, llvm::Value *val);
+    /** Sets the internal mask to (oldMask & val) */
+    void SetInternalMaskAnd(llvm::Value *oldMask, llvm::Value *val);
 
-    /** Sets the mask to (oldMask & ~val) */
-    void MaskAndNot(llvm::Value *oldMask, llvm::Value *test);
+    /** Sets the internal mask to (oldMask & ~val) */
+    void SetInternalMaskAndNot(llvm::Value *oldMask, llvm::Value *test);
 
     /** Emits a branch instruction to the basic block btrue if any of the
        lanes of current mask are on and bfalse if none are on. */
@@ -118,9 +142,8 @@ public:
        @{
     */
     /** Notifies the FunctionEmitContext that we're starting emission of an
-        'if' statement with a uniform test. The value of the mask going
-        into the 'if' statement is provided in the oldMask parameter. */
-    void StartUniformIf(llvm::Value *oldMask);
+        'if' statement with a uniform test. */
+    void StartUniformIf();
 
     /** Notifies the FunctionEmitContext that we're starting emission of an
        'if' statement with a varying test. The value of the mask going
@@ -135,19 +158,24 @@ public:
        for a loop. Basic blocks are provides for where 'break' and
        'continue' statements should jump to (if all running lanes want to
        break or continue), uniformControlFlow indicates whether the loop
-        condition is 'uniform', and oldMask provides the current mask going
-        into the loop. */
-    void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
-                   bool uniformControlFlow, llvm::Value *oldMask);
+        condition is 'uniform'. */
+    void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
+                   bool uniformControlFlow);
 
     /** Informs FunctionEmitContext of the value of the mask at the start
-        of a loop body. */
-    void SetLoopMask(llvm::Value *mask);
+        of a loop body or switch statement. */
+    void SetBlockEntryMask(llvm::Value *mask);
 
     /** Informs FunctionEmitContext that code generation for a loop is
        finished. */
     void EndLoop();
 
+    /** Indicates that code generation for a 'foreach', 'foreach_tiled',
+        'foreach_active', or 'foreach_unique' loop is about to start. */
+    enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE };
+    void StartForeach(ForeachType ft);
+    void EndForeach();
+
     /** Emit code for a 'break' statement in a loop. If doCoherenceCheck
        is true, then if we're in a 'varying' loop, code will be emitted to
        see if all of the lanes want to break, in which case a jump to the
@@ -167,11 +195,81 @@ public:
        'continue' statement when going through the loop body in the
        previous iteration. */
     void RestoreContinuedLanes();
 
+    /** This method is called by code emitting IR for a loop. It clears
+        any lanes that contained a break since the mask has been updated to take
+        them into account. This is necessary as all the bail out checks for
+        breaks are meant to only deal with lanes breaking on the current iteration.
+    */
+    void ClearBreakLanes();
+
+    /** Indicates that code generation for a "switch" statement is about to
+        start. isUniform indicates whether the "switch" value is uniform,
+        and bbAfterSwitch gives the basic block immediately following the
+        "switch" statement. (For example, if the switch condition is
+        uniform, we jump here upon executing a "break" statement.) */
+    void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
+    /** Indicates the end of code generation for a "switch" statement. */
+    void EndSwitch();
+
+    /** Emits code for a "switch" statement in the program.
+        @param expr         Gives the value of the expression after the "switch"
+        @param defaultBlock Basic block to execute for the "default" case. This
+                            should be NULL if there is no "default" label inside
+                            the switch.
+        @param caseBlocks   vector that stores the mapping from label values
+                            after "case" statements to basic blocks corresponding
+                            to the "case" labels.
+        @param nextBlocks   For each basic block for a "case" or "default"
+                            label, this gives the basic block for the
+                            immediately-following "case" or "default" label (or
+                            the basic block after the "switch" statement for the
+                            last label.)
+    */
+    void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
+                    const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
+                    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);
+
+    /** Generates code for a "default" label after a "switch" statement.
+        The checkMask parameter indicates whether additional code should be
+        generated to check to see if the execution mask is all off after
+        the default label (in which case a jump to the following label will
+        be issued. */
+    void EmitDefaultLabel(bool checkMask, SourcePos pos);
+
+    /** Generates code for a "case" label after a "switch" statement. See
+        the documentation for EmitDefaultLabel() for discussion of the
+        checkMask parameter. */
+    void EmitCaseLabel(int value, bool checkMask, SourcePos pos);
+
     /** Returns the current number of nested levels of 'varying' control
        flow */
     int VaryingCFDepth() const;
 
+    bool InForeachLoop() const;
+
     /** Temporarily disables emission of performance warnings from gathers
        and scatters from subsequent code. */
     void DisableGatherScatterWarnings();
 
     /** Reenables emission of gather/scatter performance warnings. */
     void EnableGatherScatterWarnings();
 
+    void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
+
     /** Step through the code and find label statements; create a basic
        block for each one, so that subsequent calls to
        GetLabeledBasicBlock() return the corresponding basic block. */
     void InitializeLabelMap(Stmt *code);
 
     /** If there is a label in the function with the given name, return the
        new basic block that it starts. */
     llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
 
+    /** Returns a vector of all labels in the context. This is
+        simply the key set of the labelMap */
+    std::vector<std::string> GetLabels();
+
     /** Called to generate code for 'return' statement; value is the
        expression in the return statement (if non-NULL), and
        doCoherenceCheck indicates whether instructions should be generated
@@ -181,7 +279,7 @@ public:
     /** @} */
 
     /** @name Small helper/utility routines
-        @{
+        @{
     */
     /** Given a boolean mask value of type LLVMTypes::MaskType, return an
        i1 value that indicates if any of the mask lanes are on. */
@@ -192,7 +290,11 @@ public:
     llvm::Value *All(llvm::Value *mask);
 
     /** Given a boolean mask value of type LLVMTypes::MaskType, return an
-        i32 value wherein the i'th bit is on if and only if the i'th lane
+        i1 value that indicates if all of the mask lanes are off. */
+    llvm::Value *None(llvm::Value *mask);
+
+    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
+        i64 value wherein the i'th bit is on if and only if the i'th lane
        of the mask is on. */
     llvm::Value *LaneMask(llvm::Value *mask);
 
@@ -200,6 +302,18 @@ public:
        that indicates whether the two masks are equal. */
     llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
 
+    /** generate constantvector, which contains programindex, i.e.
+        < i32 0, i32 1, i32 2, i32 3> */
+    llvm::Value *ProgramIndexVector(bool is32bits = true);
+#ifdef ISPC_NVPTX_ENABLED
+    llvm::Value *ProgramIndexVectorPTX(bool is32bits = true);
+
+    /** Issues a call to __insert_int8/int16/int32/int64/float/double */
+    llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar);
+    /** Issues a call to __extract_int8/int16/int32/int64/float/double */
+    llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane);
+#endif
+
     /** Given a string, create an anonymous global variable to hold its
        value and return the pointer to the string. */
     llvm::Value *GetStringPtr(const std::string &str);
@@ -213,9 +327,6 @@ public:
        i32. */
     llvm::Value *I1VecToBoolVec(llvm::Value *b);
 
-    /** Returns the size of the given type. */
-    llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);
-
     /** If the user has asked to compile the program with instrumentation,
        this inserts a callback to the user-supplied instrumentation
        function at the current point in the code. */
@@ -240,8 +351,13 @@ public:
        llvm::Instruction for convenience; in calling code we often have
        Instructions stored using Value pointers; the code here returns
        silently if it's not actually given an instruction. */
-    void AddDebugPos(llvm::Value *instruction, const SourcePos *pos = NULL,
+    void AddDebugPos(llvm::Value *instruction, const SourcePos *pos = NULL,
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
                      llvm::DIScope *scope = NULL);
+#else /* LLVM 3.7+ */
+                     llvm::DIScope *scope = NULL);
+                     //llvm::MDScope *scope = NULL );
+#endif
 
     /** Inform the debugging information generation code that a new scope
        is starting in the source program. */
@@ -253,7 +369,11 @@ public:
 
     /** Returns the llvm::DIScope corresponding to the current program
        scope. */
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
     llvm::DIScope GetDIScope() const;
+#else // LLVM 3.7++
+    llvm::DIScope *GetDIScope() const;
+#endif
 
     /** Emits debugging information for the variable represented by
        sym. */
@@ -261,7 +381,7 @@ public:
 
     /** Emits debugging information for the function parameter represented
        by sym. */
-    void EmitFunctionParameterDebugInfo(Symbol *sym);
+    void EmitFunctionParameterDebugInfo(Symbol *sym, int parameterNum);
     /** @} */
 
     /** @name IR instruction emission
@@ -269,7 +389,7 @@
        instructions. See the LLVM assembly language reference manual
        (http://llvm.org/docs/LangRef.html) and the LLVM doxygen documentaion
        (http://llvm.org/doxygen) for more information. Here we will only
-        document significant generalizations to the functionality of the
+        document significant generalizations to the functionality of the
        corresponding basic LLVM instructions.
 
        Beyond actually emitting the instruction, the implementations of
@@ -285,7 +405,7 @@ public:
        this also handles applying the given operation to the vector
        elements. */
     llvm::Value *BinaryOperator(llvm::Instruction::BinaryOps inst,
-                                llvm::Value *v0, llvm::Value *v1,
+                                llvm::Value *v0, llvm::Value *v1,
                                const char *name = NULL);
 
     /** Emit the "not" operator. Like BinaryOperator(), this also handles
@@ -295,69 +415,104 @@ public:
     /** Emit a comparison instruction. If the operands are VectorTypes,
        then a value for the corresponding boolean VectorType is
        returned. */
-    llvm::Value *CmpInst(llvm::Instruction::OtherOps inst,
+    llvm::Value *CmpInst(llvm::Instruction::OtherOps inst,
                         llvm::CmpInst::Predicate pred,
                         llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
 
-    llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    /** Given a scalar value, return a vector of the same type (or an
+        array, for pointer types). */
+    llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
+
+    llvm::Value *BitCastInst(llvm::Value *value, llvm::Type *type,
                             const char *name = NULL);
-    llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
+    llvm::Value *PtrToIntInst(llvm::Value *value, llvm::Type *type,
                              const char *name = NULL);
-    llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *IntToPtrInst(llvm::Value *value, llvm::Type *type,
                              const char *name = NULL);
-    llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+
+    llvm::Instruction *TruncInst(llvm::Value *value, llvm::Type *type,
                                 const char *name = NULL);
     llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
-                                LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
-    llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+                                llvm::Type *type, const char *name = NULL);
+    llvm::Instruction *FPCastInst(llvm::Value *value, llvm::Type *type,
                                  const char *name = NULL);
-    llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Instruction *SExtInst(llvm::Value *value, llvm::Type *type,
                                const char *name = NULL);
-    llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Instruction *ZExtInst(llvm::Value *value, llvm::Type *type,
                                const char *name = NULL);
 
-    /** This GEP method is a generalization of the standard one in LLVM; it
-        supports both uniform and varying basePtr values (an array of
-        pointers) as well as uniform and varying index values (arrays of
-        indices). */
+    /** Given two integer-typed values (but possibly one vector and the
+        other not, and or of possibly-different bit-widths), update their
+        values as needed so that the two have the same (more general)
+        type. */
+    void MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1);
+
+    /** Create a new slice pointer out of the given pointer to an soa type
+        and an integer offset to a slice within that type. */
+    llvm::Value *MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset);
+
+    /** These GEP methods are generalizations of the standard ones in LLVM;
+        they support both uniform and varying basePtr values as well as
+        uniform and varying index values (arrays of indices). Varying base
+        pointers are expected to come in as vectors of i32/i64 (depending
+        on the target), since LLVM doesn't currently support vectors of
+        pointers. The underlying type of the base pointer must be provided
+        via the ptrType parameter */
     llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
+                                   const Type *ptrType, const char *name = NULL);
     llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
-                                   llvm::Value *index1, const char *name = NULL);
-
-    /** This is a convenience method to generate a GEP instruction with
-        indices with values with known constant values as the ispc program
-        is being compiled. */
-    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, int v0, int v1,
+                                   llvm::Value *index1, const Type *ptrType,
+                                   const char *name = NULL);
 
-    /** Load from the memory location(s) given by lvalue. The lvalue may
-        be varying, in which case this corresponds to a gather from the
-        multiple memory locations given by the array of pointer values
-        given by the lvalue. If the lvalue is not varying, then the type
-        parameter may be NULL. */
-    llvm::Value *LoadInst(llvm::Value *lvalue, const Type *type,
-                          const char *name = NULL);
+    /** This method returns a new pointer that represents offsetting the
+        given base pointer to point at the given element number of the
+        structure type that the base pointer points to. (The provided
+        pointer must be a pointer to a structure type. The ptrType gives
+        the type of the pointer, though it may be NULL if the base pointer
+        is uniform. */
+    llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
+                                  const Type *ptrType, const char *name = NULL,
+                                  const PointerType **resultPtrType = NULL);
+
+    /** Load from the memory location(s) given by lvalue, using the given
+        mask. The lvalue may be varying, in which case this corresponds to
+        a gather from the multiple memory locations given by the array of
+        pointer values given by the lvalue. If the lvalue is not varying,
+        then both the mask pointer and the type pointer may be NULL. */
+    llvm::Value *LoadInst(llvm::Value *ptr, llvm::Value *mask,
+                          const Type *ptrType, const char *name = NULL,
+                          bool one_elem = false);
+
+    llvm::Value *LoadInst(llvm::Value *ptr, const char *name = NULL);
 
     /** Emits an alloca instruction to allocate stack storage for the given
        type. If a non-zero alignment is specified, the object is also
        allocated at the given alignment. By default, the alloca
        instruction is added at the start of the function in the entry
        basic block; if it should be added to the current basic block, then
-        the atEntryBlock parameter should be false. */
-    llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name = NULL,
-                            int align = 0, bool atEntryBlock = true);
+        the atEntryBlock parameter should be false. */
+    llvm::Value *AllocaInst(llvm::Type *llvmType,
+                            const char *name = NULL, int align = 0,
+                            bool atEntryBlock = true);
 
     /** Standard store instruction; for this variant, the lvalue must be a
        single pointer, not a varying lvalue. */
-    void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
-                   const char *name = NULL);
+    void StoreInst(llvm::Value *value, llvm::Value *ptr);
 
     /** In this variant of StoreInst(), the lvalue may be varying. If so,
        this corresponds to a scatter. Whether the lvalue is uniform of
       varying, the given storeMask is used to mask the stores so that
       they only execute for the active program instances. */
-    void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
-                   llvm::Value *storeMask, const Type *rvalueType,
-                   const char *name = NULL);
+    void StoreInst(llvm::Value *value, llvm::Value *ptr,
+                   llvm::Value *storeMask, const Type *valueType,
+                   const Type *ptrType);
 
     /** Copy count bytes of memory from the location pointed to by src to
        the location pointed to by dest. (src and dest must not be
       overlapping.) */
     void MemcpyInst(llvm::Value *dest, llvm::Value *src, llvm::Value *count,
                    llvm::Value *align = NULL);
 
     void BranchInst(llvm::BasicBlock *block);
     void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
@@ -371,32 +526,48 @@ public:
     /** This convenience method maps to an llvm::InsertElementInst if the
        given value is a llvm::VectorType, and to an llvm::InsertValueInst
       otherwise. */
-    llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
+    llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
                            const char *name = NULL);
 
-    llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
+    /** This convenience method maps to an llvm::ShuffleVectorInst. */
+    llvm::Value *ShuffleInst(llvm::Value *v1, llvm::Value *v2, llvm::Value *mask,
+                             const char *name = NULL);
+
+    /** This convenience method to generate broadcast pattern. It takes a value
+        and a vector type. Type of the value must match element type of the
+        vector. */
+    llvm::Value *BroadcastValue(llvm::Value *v, llvm::Type *vecType,
+                                const char *name = NULL);
+
+    llvm::PHINode *PhiNode(llvm::Type *type, int count,
                           const char *name = NULL);
     llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
                                  llvm::Value *val1, const char *name = NULL);
 
-    llvm::Instruction *CallInst(llvm::Function *func,
-                                const std::vector<llvm::Value *> &args,
-                                const char *name = NULL);
+    /** Emits IR to do a function call with the given arguments. If the
+        function type is a varying function pointer type, its full type
+        must be provided in funcType. funcType can be NULL if func is a
+        uniform function pointer. */
+    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
+                          const std::vector<llvm::Value *> &args,
+                          const char *name = NULL);
 
     /** This is a convenience method that issues a call instruction to a
        function that takes just a single argument. */
-    llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg,
-                                const char *name = NULL);
+    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
+                          llvm::Value *arg, const char *name = NULL);
 
     /** This is a convenience method that issues a call instruction to a
        function that takes two arguments. */
-    llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg0,
-                                llvm::Value *arg1, const char *name = NULL);
+    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
+                          llvm::Value *arg0, llvm::Value *arg1,
+                          const char *name = NULL);
 
     /** Launch an asynchronous task to run the given function, passing it
        he given argument values. */
-    llvm::Instruction *LaunchInst(llvm::Function *callee,
-                                  std::vector<llvm::Value *> &argVals,
-                                  llvm::Value *launchCount);
+    llvm::Value *LaunchInst(llvm::Value *callee,
+                            std::vector<llvm::Value *> &argVals,
+                            llvm::Value *launchCount[3]);
 
     void SyncInst();
 
@@ -404,6 +575,12 @@ public:
     /** @} */
 
 private:
+    /** Pointer to the Function for which we're currently generating code. */
+    Function *function;
+
     /** LLVM function representation for the current function. */
     llvm::Function *llvmFunction;
 
     /** The basic block into which we add any alloca instructions that need
        to go at the very start of the function. */
     llvm::BasicBlock *allocaBlock;
@@ -413,8 +590,16 @@ private:
     llvm::BasicBlock *bblock;
 
     /** Pointer to stack-allocated memory that stores the current value of
-        the program mask. */
-    llvm::Value *maskPtr;
+        the full program mask. */
+    llvm::Value *fullMaskPointer;
+
+    /** Pointer to stack-allocated memory that stores the current value of
+        the program mask representing varying control flow within the
+        function. */
+    llvm::Value *internalMaskPointer;
+
+    /** Value of the program mask when the function starts execution. */
+    llvm::Value *functionMaskValue;
 
     /** Current source file position; if debugging information is being
        generated, this position is used to set file/line information for
|
||||
for error messages and debugging symbols. */
|
||||
SourcePos funcStartPos;
|
||||
|
||||
/** Type of result that the current function returns. */
|
||||
const Type *returnType;
|
||||
/** If currently in a loop body or switch statement, the value of the
|
||||
mask at the start of it. */
|
||||
llvm::Value *blockEntryMask;
|
||||
|
||||
/** Value of the program mask when the function starts execution. */
|
||||
llvm::Value *entryMask;
|
||||
|
||||
/** If currently in a loop body, the value of the mask at the start of
|
||||
the loop. */
|
||||
llvm::Value *loopMask;
|
||||
|
||||
/** If currently in a loop body, this is a pointer to memory to store a
|
||||
mask value that represents which of the lanes have executed a
|
||||
'break' statement. If we're not in a loop body, this should be
|
||||
NULL. */
|
||||
/** If currently in a loop body or switch statement, this is a pointer
|
||||
to memory to store a mask value that represents which of the lanes
|
||||
have executed a 'break' statement. If we're not in a loop body or
|
||||
switch, this should be NULL. */
|
||||
llvm::Value *breakLanesPtr;
|
||||
|
||||
/** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
|
||||
@@ -446,16 +625,49 @@ private:
        'continue' statement. */
     llvm::Value *continueLanesPtr;
 
-    /** If we're inside a loop, this gives the basic block immediately
-        after the current loop, which we will jump to if all of the lanes
-        have executed a break statement or are otherwise done with the
-        loop. */
+    /** If we're inside a loop or switch statement, this gives the basic
+        block immediately after the current loop or switch, which we will
+        jump to if all of the lanes have executed a break statement or are
+        otherwise done with it. */
     llvm::BasicBlock *breakTarget;
 
     /** If we're inside a loop, this gives the block to jump to if all of
        the running lanes have executed a 'continue' statement. */
     llvm::BasicBlock *continueTarget;
 
+    /** @name Switch statement state
+
+        These variables store various state that's active when we're
+        generating code for a switch statement. They should all be NULL
+        outside of a switch.
+        @{
+    */
+
+    /** The value of the expression used to determine which case in the
+        statements after the switch to execute. */
+    llvm::Value *switchExpr;
+
+    /** Map from case label numbers to the basic block that will hold code
+        for that case. */
+    const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;
+
+    /** The basic block of code to run for the "default" label in the
+        switch statement. */
+    llvm::BasicBlock *defaultBlock;
+
+    /** For each basic block for the code for cases (and the default label,
+        if present), this map gives the basic block for the immediately
+        following case/default label. */
+    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;
+
+    /** Records whether the switch condition was uniform; this is a
+        distinct notion from whether the switch represents uniform or
+        varying control flow; we may have varying control flow from a
+        uniform switch condition if there is a 'break' inside the switch
+        that's under varying control flow. */
+    bool switchConditionWasUniform;
+    /** @} */
+
     /** A pointer to memory that records which of the program instances
        have executed a 'return' statement (and are thus really truly done
       running any more instructions in this functions. */
@@ -473,17 +685,31 @@ private:
        emitted. */
     std::vector<CFInfo *> controlFlowInfo;
 
+#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6
     /** DIFile object corresponding to the source file where the current
-        function was defined (used for debugging info0. */
+        function was defined (used for debugging info). */
     llvm::DIFile diFile;
 
     /** DISubprogram corresponding to this function (used for debugging
        info). */
-    llvm::DISubprogram diFunction;
+    llvm::DISubprogram diSubprogram;
 
     /** These correspond to the current set of nested scopes in the
        function. */
     std::vector<llvm::DILexicalBlock> debugScopes;
+#else // LLVM 3.7++
+    /** DIFile object corresponding to the source file where the current
+        function was defined (used for debugging info). */
+    llvm::DIFile *diFile;
+
+    /** DISubprogram corresponding to this function (used for debugging
+        info). */
+    llvm::DISubprogram *diSubprogram;
+
+    /** These correspond to the current set of nested scopes in the
+        function. */
+    std::vector<llvm::DIScope *> debugScopes;
+#endif
 
     /** True if a 'launch' statement has been encountered in the function. */
     bool launchedTasks;
@@ -493,20 +719,44 @@ private:
        tasks launched from the current function. */
     llvm::Value *launchGroupHandlePtr;
 
     /** Nesting count of the number of times calling code has disabled (and
        not yet reenabled) gather/scatter performance warnings. */
     int disableGSWarningCount;
 
     std::map<std::string, llvm::BasicBlock *> labelMap;
 
     static bool initLabelBBlocks(ASTNode *node, void *data);
 
     llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
-    static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
-    bool ifsInLoopAllUniform() const;
+    static void addGSMetadata(llvm::Value *inst, SourcePos pos);
+    bool ifsInCFAllUniform(int cfType) const;
     void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
     llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
 
-    void restoreMaskGivenReturns(llvm::Value *oldMask);
     llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
                                 const Type *ptrType);
 
-    void scatter(llvm::Value *rvalue, llvm::Value *lvalue,
-                 llvm::Value *maskPtr, const Type *rvalueType);
-    llvm::Value *gather(llvm::Value *lvalue, const Type *type,
-                        const char *name);
-    void maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
-                     const Type *rvalueType, llvm::Value *maskPtr);
+    void restoreMaskGivenReturns(llvm::Value *oldMask);
+    void addSwitchMaskCheck(llvm::Value *mask);
+    bool inSwitchStatement() const;
+    llvm::Value *getMaskAtSwitchEntry();
+
+    CFInfo *popCFState();
+
+    void scatter(llvm::Value *value, llvm::Value *ptr, const Type *valueType,
+                 const Type *ptrType, llvm::Value *mask);
+    void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
+                     llvm::Value *mask);
+    void storeUniformToSOA(llvm::Value *value, llvm::Value *ptr,
+                           llvm::Value *mask, const Type *valueType,
+                           const PointerType *ptrType);
+    llvm::Value *loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask,
+                                    const PointerType *ptrType, const char *name);
+
+    llvm::Value *gather(llvm::Value *ptr, const PointerType *ptrType,
+                        llvm::Value *mask, const char *name);
+
+    llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
 };
 
 #endif // ISPC_CTX_H
897	decl.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2013, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -28,20 +28,100 @@
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
 /** @file decl.cpp
-    @brief Implementations of classes related to turning declarations into
-           symbols and types.
+    @brief Implementations of classes related to turning declarations into
+           symbol names and types.
 */
 
 #include "decl.h"
 #include "util.h"
 #include "module.h"
 #include "sym.h"
 #include "type.h"
 #include "stmt.h"
 #include "expr.h"
 #include <stdio.h>
+#include <string.h>
+#include <set>
 
+static void
+lPrintTypeQualifiers(int typeQualifiers) {
+    if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
+    if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
+    if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
+    if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
+    if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
+    if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
+    if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
+    if (typeQualifiers & TYPEQUAL_EXPORT) printf("export ");
+    if (typeQualifiers & TYPEQUAL_UNMASKED) printf("unmasked ");
+}
+
+
+/** Given a Type and a set of type qualifiers, apply the type qualifiers to
+    the type, returning the type that is the result.
+*/
+static const Type *
+lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
+    if (type == NULL)
+        return NULL;
+
+    if ((typeQualifiers & TYPEQUAL_CONST) != 0) {
+        type = type->GetAsConstType();
+    }
+
+    if ( ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
+      && ((typeQualifiers & TYPEQUAL_VARYING) != 0) ) {
+        Error(pos, "Type \"%s\" cannot be qualified with both uniform and varying.",
+              type->GetString().c_str());
+    }
+
+    if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) {
+        if (type->IsVoidType())
+            Error(pos, "\"uniform\" qualifier is illegal with \"void\" type.");
+        else
+            type = type->GetAsUniformType();
+    }
+    else if ((typeQualifiers & TYPEQUAL_VARYING) != 0) {
+        if (type->IsVoidType())
+            Error(pos, "\"varying\" qualifier is illegal with \"void\" type.");
+        else
+            type = type->GetAsVaryingType();
+    }
+    else {
+        if (type->IsVoidType() == false)
+            type = type->GetAsUnboundVariabilityType();
+    }
+
+    if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
+        if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
+            Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
+                  "qualifiers.");
+
+        const Type *unsignedType = type->GetAsUnsignedType();
+        if (unsignedType != NULL)
+            type = unsignedType;
+        else {
+            const Type *resolvedType =
+                type->ResolveUnboundVariability(Variability::Varying);
+            Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
+                  resolvedType->GetString().c_str());
+        }
+    }
+
+    if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false) {
+        const Type *resolvedType =
+            type->ResolveUnboundVariability(Variability::Varying);
+        Error(pos, "\"signed\" qualifier is illegal with non-integer type "
+              "\"%s\".", resolvedType->GetString().c_str());
+    }
+
+    return type;
+}
+
 
 ///////////////////////////////////////////////////////////////////////////
 // DeclSpecs
@@ -49,300 +129,619 @@
|
||||
DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
    baseType = t;
    storageClass = sc;
    typeQualifier = tq;
    typeQualifiers = tq;
    soaWidth = 0;
    vectorSize = 0;
}


void
DeclSpecs::Print() const {
    if (storageClass == SC_EXTERN) printf("extern ");
    if (storageClass == SC_EXTERN_C) printf("extern \"C\" ");
    if (storageClass == SC_EXPORT) printf("export ");
    if (storageClass == SC_STATIC) printf("static ");
    if (storageClass == SC_TYPEDEF) printf("typedef ");

    if (soaWidth > 0) printf("soa<%d> ", soaWidth);

    if (typeQualifier & TYPEQUAL_INLINE) printf("inline ");
    if (typeQualifier & TYPEQUAL_CONST) printf("const ");
    if (typeQualifier & TYPEQUAL_UNIFORM) printf("uniform ");
    if (typeQualifier & TYPEQUAL_VARYING) printf("varying ");
    if (typeQualifier & TYPEQUAL_TASK) printf("task ");
    if (typeQualifier & TYPEQUAL_REFERENCE) printf("reference ");
    if (typeQualifier & TYPEQUAL_UNSIGNED) printf("unsigned ");

    printf("%s", baseType->GetString().c_str());

    if (vectorSize > 0) printf("<%d>", vectorSize);
}


///////////////////////////////////////////////////////////////////////////
// Declarator

Declarator::Declarator(Symbol *s, SourcePos p)
    : pos(p) {
    sym = s;
    functionArgs = NULL;
    isFunction = false;
    initExpr = NULL;
}


void
Declarator::AddArrayDimension(int size) {
    assert(size > 0 || size == -1); // -1 -> unsized
    arraySize.push_back(size);
}


void
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
    sym->type = GetType(ds);
    sym->storageClass = ds->storageClass;
}


void
Declarator::Print() const {
    printf("%s", sym->name.c_str());
    if (initExpr != NULL) {
        printf(" = (");
        initExpr->Print();
        printf(")");
    }
    pos.Print();
}


static const Type *
lGetType(const Declarator *decl, DeclSpecs *ds,
         std::vector<int>::const_iterator arrayIter) {
    if (arrayIter == decl->arraySize.end()) {
        // If we don't have an array (or have processed all of the array
        // dimensions in previous recursive calls), we can go ahead and
        // figure out the final non-array type we have here.
        const Type *type = ds->baseType;
        if (type == NULL) {
            Error(decl->pos, "Type not provided in variable declaration for variable \"%s\".",
                  decl->sym->name.c_str());
            return NULL;
        }

        // Account for 'unsigned' and 'const' qualifiers in the type
        if ((ds->typeQualifier & TYPEQUAL_UNSIGNED) != 0) {
            const Type *unsignedType = type->GetAsUnsignedType();
            if (unsignedType != NULL)
                type = unsignedType;
            else
                Error(decl->pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
                      type->GetString().c_str());
        }
        if ((ds->typeQualifier & TYPEQUAL_CONST) != 0)
            type = type->GetAsConstType();

        if (ds->vectorSize > 0) {
            const AtomicType *atomicType = dynamic_cast<const AtomicType *>(type);
            if (atomicType == NULL) {
                Error(decl->pos, "Only atomic types (int, float, ...) are legal for vector "
                      "types.");
                return NULL;
            if (t != NULL) {
                if (m->symbolTable->ContainsType(t)) {
                    // Typedefs might have uniform/varying qualifiers inside.
                    if (t->IsVaryingType()) {
                        typeQualifiers |= TYPEQUAL_VARYING;
                    }
            type = new VectorType(atomicType, ds->vectorSize);
        }

        // if uniform/varying is specified explicitly, then go with that
        if ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0)
            return type->GetAsUniformType();
        else if ((ds->typeQualifier & TYPEQUAL_VARYING) != 0)
            return type->GetAsVaryingType();
        else {
            // otherwise, structs are uniform by default and everything
            // else is varying by default
            if (dynamic_cast<const StructType *>(type) != NULL)
                return type->GetAsUniformType();
            else
                return type->GetAsVaryingType();
        }
    }
    else {
        // Peel off one dimension of the array
        int arraySize = *arrayIter;
        ++arrayIter;

        // Get the type, not including the arraySize dimension peeled off
        // above.
        const Type *childType = lGetType(decl, ds, arrayIter);

        int soaWidth = ds->soaWidth;
        if (soaWidth == 0)
            // If there's no "soa<n>" stuff going on, just return a regular
            // array with the appropriate size
            return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
        else {
            // Make sure we actually have an array of structs ..
            const StructType *childStructType =
                dynamic_cast<const StructType *>(childType);
            if (childStructType == NULL) {
                Error(decl->pos, "Illegal to provide soa<%d> qualifier with non-struct "
                      "type \"%s\".", soaWidth, childType->GetString().c_str());
                return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
            else if (t->IsUniformType()) {
                typeQualifiers |= TYPEQUAL_UNIFORM;
            }
            else if ((soaWidth & (soaWidth - 1)) != 0) {
                Error(decl->pos, "soa<%d> width illegal. Value must be power of two.",
                      soaWidth);
                return NULL;
            }
            else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
                Error(decl->pos, "soa<%d> width must evenly divide array size %d.",
                      soaWidth, arraySize);
                return NULL;
            }
            return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
                                    soaWidth);
        }
    }
}

const Type *
Declarator::GetType(DeclSpecs *ds) const {
    bool hasUniformQual = ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0);
    bool hasVaryingQual = ((ds->typeQualifier & TYPEQUAL_VARYING) != 0);
    bool isTask = ((ds->typeQualifier & TYPEQUAL_TASK) != 0);
    bool isReference = ((ds->typeQualifier & TYPEQUAL_REFERENCE) != 0);
DeclSpecs::GetBaseType(SourcePos pos) const {
    const Type *retType = baseType;

    if (retType == NULL) {
        Warning(pos, "No type specified in declaration. Assuming int32.");
        retType = AtomicType::UniformInt32->GetAsUnboundVariabilityType();
    }

    if (vectorSize > 0) {
        const AtomicType *atomicType = CastType<AtomicType>(retType);
        if (atomicType == NULL) {
            Error(pos, "Only atomic types (int, float, ...) are legal for vector "
                  "types.");
            return NULL;
        }
        retType = new VectorType(atomicType, vectorSize);
    }

    retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);

    if (soaWidth > 0) {
#ifdef ISPC_NVPTX_ENABLED
#if 0 /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */
        if (g->target->getISA() == Target::NVPTX)
        {
            Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target.");
            return NULL;
        }
#endif
#endif /* ISPC_NVPTX_ENABLED */
        const StructType *st = CastType<StructType>(retType);

        if (st == NULL) {
            Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
                  "type \"%s\".", soaWidth, retType->GetString().c_str());
            return NULL;
        }
        else if (soaWidth <= 0 || (soaWidth & (soaWidth - 1)) != 0) {
            Error(pos, "soa<%d> width illegal. Value must be positive power "
                  "of two.", soaWidth);
            return NULL;
        }

        if (st->IsUniformType()) {
            Error(pos, "\"uniform\" qualifier and \"soa<%d>\" qualifier can't "
                  "both be used in a type declaration.", soaWidth);
            return NULL;
        }
        else if (st->IsVaryingType()) {
            Error(pos, "\"varying\" qualifier and \"soa<%d>\" qualifier can't "
                  "both be used in a type declaration.", soaWidth);
            return NULL;
        }
        else
            retType = st->GetAsSOAType(soaWidth);

        if (soaWidth < g->target->getVectorWidth())
            PerformanceWarning(pos, "soa<%d> width smaller than gang size %d "
                               "currently leads to inefficient code to access "
                               "soa types.", soaWidth, g->target->getVectorWidth());
    }

    return retType;
}
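
To make the soa<> constraints above concrete, here is a minimal ISPC sketch (hypothetical names; the comments restate the diagnostics issued above):

    struct Point { float x, y, z; };
    soa<8> Point pts[64];          // ok: width is a positive power of two
    soa<6> Point bad1[64];         // error: soa<6> width illegal
    uniform soa<8> Point bad2[8];  // error: "uniform" and "soa<8>" can't both be used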

static const char *
lGetStorageClassName(StorageClass storageClass) {
    switch (storageClass) {
    case SC_NONE:     return "";
    case SC_EXTERN:   return "extern";
    case SC_EXTERN_C: return "extern \"C\"";
    case SC_STATIC:   return "static";
    case SC_TYPEDEF:  return "typedef";
    default:          FATAL("Unhandled storage class in lGetStorageClassName");
                      return "";
    }
}


void
DeclSpecs::Print() const {
    printf("Declspecs: [%s ", lGetStorageClassName(storageClass));

    if (soaWidth > 0) printf("soa<%d> ", soaWidth);
    lPrintTypeQualifiers(typeQualifiers);
    printf("base type: %s", baseType->GetString().c_str());

    if (vectorSize > 0) printf("<%d>", vectorSize);
    printf("]");
}

///////////////////////////////////////////////////////////////////////////
// Declarator

Declarator::Declarator(DeclaratorKind dk, SourcePos p)
    : pos(p), kind(dk) {
    child = NULL;
    typeQualifiers = 0;
    storageClass = SC_NONE;
    arraySize = -1;
    type = NULL;
    initExpr = NULL;
}

void
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
    const Type *baseType = ds->GetBaseType(pos);

    InitFromType(baseType, ds);

    if (type == NULL) {
        AssertPos(pos, m->errorCount > 0);
        return;
    }

    storageClass = ds->storageClass;

    if (ds->declSpecList.size() > 0 &&
        CastType<FunctionType>(type) == NULL) {
        Error(pos, "__declspec specifiers for non-function type \"%s\" are "
              "not used.", type->GetString().c_str());
    }
}

void
Declarator::Print(int indent) const {
    printf("%*cdeclarator: [", indent, ' ');
    pos.Print();

    lPrintTypeQualifiers(typeQualifiers);
    printf("%s ", lGetStorageClassName(storageClass));
    if (name.size() > 0)
        printf("%s", name.c_str());
    else
        printf("(unnamed)");

    printf(", array size = %d", arraySize);

    printf(", kind = ");
    switch (kind) {
    case DK_BASE:      printf("base");      break;
    case DK_POINTER:   printf("pointer");   break;
    case DK_REFERENCE: printf("reference"); break;
    case DK_ARRAY:     printf("array");     break;
    case DK_FUNCTION:  printf("function");  break;
    default:           FATAL("Unhandled declarator kind");
    }

    if (initExpr != NULL) {
        printf(" = (");
        initExpr->Print();
        printf(")");
    }

    if (functionParams.size() > 0) {
        for (unsigned int i = 0; i < functionParams.size(); ++i) {
            printf("\n%*cfunc param %d:\n", indent, ' ', i);
            functionParams[i]->Print(indent+4);
        }
    }

    if (child != NULL)
        child->Print(indent + 4);

    printf("]\n");
}

void
Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
    bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
    bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
    bool isTask = ((typeQualifiers & TYPEQUAL_TASK) != 0);
    bool isExported = ((typeQualifiers & TYPEQUAL_EXPORT) != 0);
    bool isConst = ((typeQualifiers & TYPEQUAL_CONST) != 0);
    bool isUnmasked = ((typeQualifiers & TYPEQUAL_UNMASKED) != 0);

    if (hasUniformQual && hasVaryingQual) {
        Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
        return NULL;
        return;
    }
    if (kind != DK_FUNCTION && isTask) {
        Error(pos, "\"task\" qualifier illegal in variable declaration.");
        return;
    }
    if (kind != DK_FUNCTION && isUnmasked) {
        Error(pos, "\"unmasked\" qualifier illegal in variable declaration.");
        return;
    }
    if (kind != DK_FUNCTION && isExported) {
        Error(pos, "\"export\" qualifier illegal in variable declaration.");
        return;
    }

    if (isFunction) {
        std::vector<const Type *> args;
        std::vector<std::string> argNames;
        if (functionArgs) {
            // Loop over the function arguments and get names and types for
            // each one in the args and argNames arrays
            for (unsigned int i = 0; i < functionArgs->size(); ++i) {
                Declaration *d = (*functionArgs)[i];
                Symbol *sym;
                if (d->declarators.size() == 0) {
                    // function declaration like foo(float), w/o a name for
                    // the parameter
                    char buf[32];
                    sprintf(buf, "__anon_parameter_%d", i);
                    sym = new Symbol(buf, pos);
                    Declarator *declarator = new Declarator(sym, sym->pos);
                    sym->type = declarator->GetType(d->declSpecs);
                    d->declarators.push_back(declarator);
                }
                else {
                    assert(d->declarators.size() == 1);
                    sym = d->declarators[0]->sym;
    Variability variability(Variability::Unbound);
    if (hasUniformQual)
        variability = Variability::Uniform;
    else if (hasVaryingQual)
        variability = Variability::Varying;

    if (kind == DK_BASE) {
        // All of the type qualifiers should be in the DeclSpecs for the
        // base declarator
        AssertPos(pos, typeQualifiers == 0);
        AssertPos(pos, child == NULL);
        type = baseType;
    }
    else if (kind == DK_POINTER) {
        /* For now, any pointer to an SOA type gets the slice property; if
           we add the capability to declare pointers as slices or not,
           we'll want to set this based on a type qualifier here. */
        const Type *ptrType = new PointerType(baseType, variability, isConst,
                                              baseType->IsSOAType());
        if (child != NULL) {
            child->InitFromType(ptrType, ds);
            type = child->type;
            name = child->name;
        }
        else
            type = ptrType;
    }
    else if (kind == DK_REFERENCE) {
        if (hasUniformQual) {
            Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
            return;
        }
        if (hasVaryingQual) {
            Error(pos, "\"varying\" qualifier is illegal to apply to references.");
            return;
        }
        if (isConst) {
            Error(pos, "\"const\" qualifier is illegal to apply to references.");
            return;
        }
        // The parser should disallow this already, but double check.
        if (CastType<ReferenceType>(baseType) != NULL) {
            Error(pos, "References to references are illegal.");
            return;
        }

        const Type *refType = new ReferenceType(baseType);
        if (child != NULL) {
            child->InitFromType(refType, ds);
            type = child->type;
            name = child->name;
        }
        else
            type = refType;
    }
    else if (kind == DK_ARRAY) {
        if (baseType->IsVoidType()) {
            Error(pos, "Arrays of \"void\" type are illegal.");
            return;
        }
        if (CastType<ReferenceType>(baseType)) {
            Error(pos, "Arrays of references (type \"%s\") are illegal.",
                  baseType->GetString().c_str());
            return;
        }

#ifdef ISPC_NVPTX_ENABLED
#if 0 /* NVPTX */
        if (baseType->IsUniformType())
        {
            fprintf(stderr, " detected uniform array of size= %d array= %s\n", arraySize,
                    baseType->IsArrayType() ? " true " : " false ");
        }
#endif
#endif /* ISPC_NVPTX_ENABLED */
        const Type *arrayType = new ArrayType(baseType, arraySize);
        if (child != NULL) {
            child->InitFromType(arrayType, ds);
            type = child->type;
            name = child->name;
        }
        else
            type = arrayType;
    }
    else if (kind == DK_FUNCTION) {
        llvm::SmallVector<const Type *, 8> args;
        llvm::SmallVector<std::string, 8> argNames;
        llvm::SmallVector<Expr *, 8> argDefaults;
        llvm::SmallVector<SourcePos, 8> argPos;

        // Loop over the function arguments and store the names, types,
        // default values (if any), and source file positions of each one in
        // the corresponding vector.
        for (unsigned int i = 0; i < functionParams.size(); ++i) {
            Declaration *d = functionParams[i];

            if (d == NULL) {
                AssertPos(pos, m->errorCount > 0);
                continue;
            }
            if (d->declarators.size() == 0) {
                // function declaration like foo(float), w/o a name for the
                // parameter; wire up a placeholder Declarator for it
                d->declarators.push_back(new Declarator(DK_BASE, pos));
                d->declarators[0]->InitFromDeclSpecs(d->declSpecs);
            }

            AssertPos(pos, d->declarators.size() == 1);
            Declarator *decl = d->declarators[0];
            if (decl == NULL || decl->type == NULL) {
                AssertPos(pos, m->errorCount > 0);
                continue;
            }

            if (decl->name == "") {
                // Give a name to any anonymous parameter declarations
                char buf[32];
                sprintf(buf, "__anon_parameter_%d", i);
                decl->name = buf;
            }
            decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);

            if (d->declSpecs->storageClass != SC_NONE)
                Error(decl->pos, "Storage class \"%s\" is illegal in "
                      "function parameter declaration for parameter \"%s\".",
                      lGetStorageClassName(d->declSpecs->storageClass),
                      decl->name.c_str());
            if (decl->type->IsVoidType()) {
                Error(decl->pos, "Parameter with type \"void\" illegal in function "
                      "parameter list.");
                decl->type = NULL;
            }

            const ArrayType *at = CastType<ArrayType>(decl->type);
            if (at != NULL) {
                // As in C, arrays are passed to functions as pointers to
                // their element type. We'll just immediately make this
                // change now. (One shortcoming of losing the fact that
                // it was originally an array is that any warnings or
                // errors later issued that print the function type will
                // report this differently than it was originally declared
                // in the function, but it's not clear that this is a
                // significant problem.)
                const Type *targetType = at->GetElementType();
                if (targetType == NULL) {
                    AssertPos(pos, m->errorCount > 0);
                    return;
                }

                // Arrays are passed by reference, so convert array
                // parameters to be references here.
                if (dynamic_cast<const ArrayType *>(sym->type) != NULL)
                    sym->type = new ReferenceType(sym->type, sym->type->IsConstType());
                decl->type = PointerType::GetUniform(targetType, at->IsSOAType());

                args.push_back(sym->type);
                argNames.push_back(sym->name);
                // Make sure there are no unsized arrays (other than the
                // first dimension) in function parameter lists.
                at = CastType<ArrayType>(targetType);
                while (at != NULL) {
                    if (at->GetElementCount() == 0)
                        Error(decl->pos, "Arrays with unsized dimensions in "
                              "dimensions after the first one are illegal in "
                              "function parameter lists.");
                    at = CastType<ArrayType>(at->GetElementType());
                }
            }

            args.push_back(decl->type);
            argNames.push_back(decl->name);
            argPos.push_back(decl->pos);

            Expr *init = NULL;
            // Try to find an initializer expression.
            while (decl != NULL) {
                if (decl->initExpr != NULL) {
                    decl->initExpr = TypeCheck(decl->initExpr);
                    decl->initExpr = Optimize(decl->initExpr);
                    if (decl->initExpr != NULL) {
                        init = llvm::dyn_cast<ConstExpr>(decl->initExpr);
                        if (init == NULL)
                            init = llvm::dyn_cast<NullPointerExpr>(decl->initExpr);
                        if (init == NULL)
                            Error(decl->initExpr->pos, "Default value for parameter "
                                  "\"%s\" must be a compile-time constant.",
                                  decl->name.c_str());
                    }
                    break;
                }
                else
                    decl = decl->child;
            }
            argDefaults.push_back(init);
        }

        const Type *returnType = baseType;
        if (returnType == NULL) {
            Error(pos, "No return type provided in function declaration.");
            return;
        }

        if (CastType<FunctionType>(returnType) != NULL) {
            Error(pos, "Illegal to return function type from function.");
            return;
        }

        returnType = returnType->ResolveUnboundVariability(Variability::Varying);

        bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
        bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
        bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
        bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);

        if (isExported && isTask) {
            Error(pos, "Function can't have both \"task\" and \"export\" "
                  "qualifiers");
            return;
        }
        if (isExternC && isTask) {
            Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
                  "qualifiers");
            return;
        }
        if (isExternC && isExported) {
            Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
                  "qualifiers");
            return;
        }
        if (isUnmasked && isExported)
            Warning(pos, "\"unmasked\" qualifier is redundant for exported "
                    "functions.");

        if (child == NULL) {
            AssertPos(pos, m->errorCount > 0);
            return;
        }

        const FunctionType *functionType =
            new FunctionType(returnType, args, argNames, argDefaults,
                             argPos, isTask, isExported, isExternC, isUnmasked);

        // handle any explicit __declspecs on the function
        if (ds != NULL) {
            for (int i = 0; i < (int)ds->declSpecList.size(); ++i) {
                std::string str = ds->declSpecList[i].first;
                SourcePos pos = ds->declSpecList[i].second;

                if (str == "safe")
                    (const_cast<FunctionType *>(functionType))->isSafe = true;
                else if (!strncmp(str.c_str(), "cost", 4)) {
                    int cost = atoi(str.c_str() + 4);
                    if (cost < 0)
                        Error(pos, "Negative function cost %d is illegal.",
                              cost);
                    (const_cast<FunctionType *>(functionType))->costOverride = cost;
                }
                else
                    Error(pos, "__declspec parameter \"%s\" unknown.", str.c_str());
            }
        }

    if (ds->baseType == NULL) {
        Warning(pos, "No return type provided in declaration of function \"%s\". "
                "Treating as \"void\".", sym->name.c_str());
        ds->baseType = AtomicType::Void;
    }

    if (isReference) {
        Error(pos, "Function return types can't be reference types.");
        return NULL;
    }

    const Type *returnType = lGetType(this, ds, arraySize.begin());
    if (returnType == NULL)
        return NULL;

    bool isExported = (ds->storageClass == SC_EXPORT);
    bool isExternC = (ds->storageClass == SC_EXTERN_C);
    return new FunctionType(returnType, args, pos, &argNames, isTask,
                            isExported, isExternC);
    }
    else {
        if (isTask)
            Error(pos, "\"task\" qualifier illegal in variable declaration \"%s\".",
                  sym->name.c_str());

        const Type *type = lGetType(this, ds, arraySize.begin());

        if (type != NULL && isReference) {
            bool hasConstQual = ((ds->typeQualifier & TYPEQUAL_CONST) != 0);
            type = new ReferenceType(type, hasConstQual);
        }

        return type;
        child->InitFromType(functionType, ds);
        type = child->type;
        name = child->name;
    }
}

///////////////////////////////////////////////////////////////////////////
// Declaration

void
Declaration::AddSymbols(SymbolTable *st) const {
    assert(declSpecs->storageClass != SC_TYPEDEF);

Declaration::Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist) {
    declSpecs = ds;
    if (dlist != NULL)
        declarators = *dlist;
    for (unsigned int i = 0; i < declarators.size(); ++i)
        if (declarators[i])
            st->AddVariable(declarators[i]->sym);
        if (declarators[i] != NULL)
            declarators[i]->InitFromDeclSpecs(declSpecs);
}


void
Declaration::Print() const {
    printf("Declaration: specs [");
    declSpecs->Print();
    printf("], declarators [");
    for (unsigned int i = 0; i < declarators.size(); ++i) {
        declarators[i]->Print();
        printf("%s", (i == declarators.size() - 1) ? "]" : ", ");
Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
    declSpecs = ds;
    if (d != NULL) {
        d->InitFromDeclSpecs(ds);
        declarators.push_back(d);
    }
}


std::vector<VariableDeclaration>
Declaration::GetVariableDeclarations() const {
    Assert(declSpecs->storageClass != SC_TYPEDEF);
    std::vector<VariableDeclaration> vars;

    for (unsigned int i = 0; i < declarators.size(); ++i) {
        Declarator *decl = declarators[i];
        if (decl == NULL || decl->type == NULL) {
            // Ignore earlier errors
            Assert(m->errorCount > 0);
            continue;
        }

        if (decl->type->IsVoidType())
            Error(decl->pos, "\"void\" type variable illegal in declaration.");
        else if (CastType<FunctionType>(decl->type) == NULL) {
            decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
            Symbol *sym = new Symbol(decl->name, decl->pos, decl->type,
                                     decl->storageClass);
            m->symbolTable->AddVariable(sym);
            vars.push_back(VariableDeclaration(sym, decl->initExpr));
        }
    }

    return vars;
}

void
Declaration::DeclareFunctions() {
    Assert(declSpecs->storageClass != SC_TYPEDEF);

    for (unsigned int i = 0; i < declarators.size(); ++i) {
        Declarator *decl = declarators[i];
        if (decl == NULL || decl->type == NULL) {
            // Ignore earlier errors
            Assert(m->errorCount > 0);
            continue;
        }

        const FunctionType *ftype = CastType<FunctionType>(decl->type);
        if (ftype == NULL)
            continue;

        bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
        m->AddFunctionDeclaration(decl->name, ftype, decl->storageClass,
                                  isInline, decl->pos);
    }
}

void
Declaration::Print(int indent) const {
    printf("%*cDeclaration: specs [", indent, ' ');
    declSpecs->Print();
    printf("], declarators:\n");
    for (unsigned int i = 0; i < declarators.size(); ++i)
        declarators[i]->Print(indent+4);
}

///////////////////////////////////////////////////////////////////////////

void
GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
                             std::vector<const Type *> *elementTypes,
                             std::vector<std::string> *elementNames,
                             std::vector<SourcePos> *elementPositions) {
                             llvm::SmallVector<const Type *, 8> *elementTypes,
                             llvm::SmallVector<std::string, 8> *elementNames,
                             llvm::SmallVector<SourcePos, 8> *elementPositions) {
    std::set<std::string> seenNames;
    for (unsigned int i = 0; i < sd.size(); ++i) {
        const Type *type = sd[i]->type;
        if (type == NULL)
            continue;

        // FIXME: making this fake little DeclSpecs here is really
        // disgusting
        DeclSpecs ds(type);
        if (type->IsUniformType())
            ds.typeQualifier |= TYPEQUAL_UNIFORM;
        else
            ds.typeQualifier |= TYPEQUAL_VARYING;
        if (type->IsVoidType() == false) {
            if (type->IsUniformType())
                ds.typeQualifiers |= TYPEQUAL_UNIFORM;
            else if (type->IsVaryingType())
                ds.typeQualifiers |= TYPEQUAL_VARYING;
            else if (type->GetSOAWidth() != 0)
                ds.soaWidth = type->GetSOAWidth();
            // FIXME: ds.vectorSize?
        }

        for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
            Declarator *d = (*sd[i]->declarators)[j];
            d->InitFromDeclSpecs(&ds);

            // if it's an unsized array, make it a reference to an unsized
            // array, so the caller can pass a pointer...
            const ArrayType *at = dynamic_cast<const ArrayType *>(d->sym->type);
            if (at && at->GetElementCount() == 0)
                d->sym->type = new ReferenceType(d->sym->type, type->IsConstType());
            if (d->type->IsVoidType())
                Error(d->pos, "\"void\" type illegal for struct member.");

            elementTypes->push_back(d->sym->type);
            elementNames->push_back(d->sym->name);
            elementPositions->push_back(d->sym->pos);
            elementTypes->push_back(d->type);

            if (seenNames.find(d->name) != seenNames.end())
                Error(d->pos, "Struct member \"%s\" has same name as a "
                      "previously-declared member.", d->name.c_str());
            else
                seenNames.insert(d->name);

            elementNames->push_back(d->name);
            elementPositions->push_back(d->pos);
        }
    }

    for (int i = 0; i < (int)elementTypes->size() - 1; ++i) {
        const ArrayType *arrayType = CastType<ArrayType>((*elementTypes)[i]);

        if (arrayType != NULL && arrayType->GetElementCount() == 0)
            Error((*elementPositions)[i], "Unsized arrays aren't allowed except "
                  "for the last member in a struct definition.");
    }
}

140  decl.h
@@ -1,5 +1,5 @@
/*
  Copyright (c) 2010-2011, Intel Corporation
  Copyright (c) 2010-2013, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,7 @@
  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** @file decl.h
@@ -47,24 +47,20 @@
    variables--here, that the declaration has the 'static' and 'uniform'
    qualifiers, and that its basic type is 'int'. Then for each variable
    declaration, the Declaration class holds an instance of a Declarator,
    which in turn records the per-variable information like the symbol
    name, array size (if any), initializer expression, etc.
    which in turn records the per-variable information like the name, array
    size (if any), initializer expression, etc.
*/
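
As a concrete ISPC example of this split (a sketch; the names are hypothetical), one declaration can carry a single set of specifiers and several declarators:

    // One DeclSpecs ("static uniform int") shared by two Declarators:
    // "count = 0" and "values[16]".
    static uniform int count = 0, values[16];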

#ifndef ISPC_DECL_H
#define ISPC_DECL_H

#include "ispc.h"
#include <llvm/ADT/SmallVector.h>

enum StorageClass {
    SC_NONE,
    SC_EXTERN,
    SC_EXPORT,
    SC_STATIC,
    SC_TYPEDEF,
    SC_EXTERN_C
};
struct VariableDeclaration;

class Declaration;
class Declarator;

/* Multiple qualifiers can be provided with types in declarations;
   therefore, they are set up so that they can be ORed together into an
@@ -74,9 +70,11 @@ enum StorageClass {
#define TYPEQUAL_UNIFORM (1<<1)
#define TYPEQUAL_VARYING (1<<2)
#define TYPEQUAL_TASK (1<<3)
#define TYPEQUAL_REFERENCE (1<<4)
#define TYPEQUAL_SIGNED (1<<4)
#define TYPEQUAL_UNSIGNED (1<<5)
#define TYPEQUAL_INLINE (1<<6)
#define TYPEQUAL_EXPORT (1<<7)
#define TYPEQUAL_UNMASKED (1<<8)

/** @brief Representation of the declaration specifiers in a declaration.

@@ -85,22 +83,25 @@ enum StorageClass {
 */
class DeclSpecs {
public:
    DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE, int tq = TYPEQUAL_NONE);
    DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE,
              int tq = TYPEQUAL_NONE);

    void Print() const;

    StorageClass storageClass;

    /** Zero or more of the TYPEQUAL_* values, ORed together. */
    int typeQualifier;
    int typeQualifiers;

    /** The basic type provided in the declaration; this should be an
        AtomicType, a StructType, or a VectorType; other types (like
        AtomicType, EnumType, StructType, or VectorType; other types (like
        ArrayTypes) will end up being created if a particular declaration
        has an array size, etc.
    */
    const Type *baseType;

    const Type *GetBaseType(SourcePos pos) const;

    /** If this is a declaration with a vector type, this gives the vector
        width. For non-vector types, this is zero.
    */
@@ -110,45 +111,73 @@ public:
        SOA width specified. Otherwise this is zero.
    */
    int soaWidth;

    std::vector<std::pair<std::string, SourcePos> > declSpecList;
};


/** @brief Representation of the declaration of a single variable.
enum DeclaratorKind {
    DK_BASE,
    DK_POINTER,
    DK_REFERENCE,
    DK_ARRAY,
    DK_FUNCTION
};

/** @brief Representation of the declaration of a single variable.

    In conjunction with an instance of the DeclSpecs, this gives us
    everything we need for a full variable declaration.
*/
class Declarator {
public:
    Declarator(Symbol *s, SourcePos p);

    /** As the parser peels off array dimension declarations after the
        symbol name, it calls this method to provide them to the
        Declarator.
    */
    void AddArrayDimension(int size);
    Declarator(DeclaratorKind dk, SourcePos p);

    /** Once a DeclSpecs instance is available, this method completes the
        initialization of the Symbol, setting its Type accordingly.
        initialization of the type member.
    */
    void InitFromDeclSpecs(DeclSpecs *ds);

    /** Get the actual type of the combination of Declarator and the given
        DeclSpecs */
    const Type *GetType(DeclSpecs *ds) const;
    void InitFromType(const Type *base, DeclSpecs *ds);

    void Print() const;
    void Print(int indent) const;

    /** Position of the declarator in the source program. */
    const SourcePos pos;
    Symbol *sym;
    /** If this declarator includes an array specification, the sizes of
        the array dimensions are represented here.
    */
    std::vector<int> arraySize;

    /** The kind of this declarator; complex declarations are assembled as
        a hierarchy of Declarators. (For example, a pointer to an int
        would have a root declarator with kind DK_POINTER and with the
        Declarator::child member pointing to a DK_BASE declarator for the
        int). */
    const DeclaratorKind kind;

    /** Child pointer if needed; this can only be non-NULL if the
        declarator's kind isn't DK_BASE. */
    Declarator *child;

    /** Type qualifiers provided with the declarator. */
    int typeQualifiers;

    StorageClass storageClass;

    /** For array declarators, this gives the declared size of the array.
        Unsized arrays have arraySize == 0. */
    int arraySize;

    /** Name associated with the declarator. */
    std::string name;

    /** Initialization expression for the variable. May be NULL. */
    Expr *initExpr;
    bool isFunction;
    std::vector<Declaration *> *functionArgs;

    /** Type of the declarator. This is NULL until InitFromDeclSpecs() or
        InitFromType() is called. */
    const Type *type;

    /** For function declarations, this holds the Declaration *s for the
        function's parameters. */
    std::vector<Declaration *> functionParams;
};


@@ -157,26 +186,21 @@ public:
 */
class Declaration {
public:
    Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL) {
        declSpecs = ds;
        if (dlist != NULL)
            declarators = *dlist;
        for (unsigned int i = 0; i < declarators.size(); ++i)
            if (declarators[i] != NULL)
                declarators[i]->InitFromDeclSpecs(declSpecs);
    }
    Declaration(DeclSpecs *ds, Declarator *d) {
        declSpecs = ds;
        if (d) {
            d->InitFromDeclSpecs(ds);
            declarators.push_back(d);
        }
    }
    Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
    Declaration(DeclSpecs *ds, Declarator *d);

    /** Adds the symbols for the variables in the declaration to the symbol
        table. */
    void AddSymbols(SymbolTable *st) const;
    void Print() const;
    void Print(int indent) const;

    /** This method walks through all of the Declarators in a declaration
        and returns a fully-initialized Symbol and (possibly) an
        initialization expression for each one. (This allows the rest of
        the system to not have to worry about the mess of the general
        Declarator representation.) */
    std::vector<VariableDeclaration> GetVariableDeclarations() const;

    /** For any function declarations in the Declaration, add the
        declaration to the module. */
    void DeclareFunctions();

    DeclSpecs *declSpecs;
    std::vector<Declarator *> declarators;
@@ -197,8 +221,8 @@ struct StructDeclaration {

/** Given a set of StructDeclaration instances, this returns the types of
    the elements of the corresponding struct and their names. */
extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
                                         std::vector<const Type *> *elementTypes,
                                         std::vector<std::string> *elementNames,
                                         std::vector<SourcePos> *elementPositions);
                                         llvm::SmallVector<const Type *, 8> *elementTypes,
                                         llvm::SmallVector<std::string, 8> *elementNames,
                                         llvm::SmallVector<SourcePos, 8> *elementPositions);

#endif // ISPC_DECL_H

@@ -1,3 +1,720 @@
=== v1.9.1 === (8 July 2016)

An ISPC update with a new native AVX512 target for future Xeon CPUs and
improvements for debugging, including the new switch --dwarf-version to support
debugging on old systems.

The release is based on patched LLVM 3.8.

=== v1.9.0 === (12 Feb 2016)

An ISPC release with AVX512 (KNL flavor) support and a number of bug fixes,
based on a fresh LLVM 3.8 backend.

For AVX512 two modes are supported - generic and native. For instructions on how
to use them, please refer to the wiki. Going forward we assume that native mode
is the primary way to get AVX512 support and that generic mode will be deprecated.
If you observe significantly better performance in generic mode, please report
it via github issues.

Starting with this release we are shipping two versions on Windows:
(1) for VS2013 and earlier releases
(2) for VS2015 and newer releases
The reason for doing this is the redesigned C run-time library in VS.
The implementation of the "print" ISPC standard library function relies on the
C runtime library, which has changed. If you are not using the "print" function
in your code, you are safe to use either version.

A new option was introduced to improve debugging: --no-omit-frame-pointer.

=== v1.8.2 === (29 May 2015)

An ISPC update with several important stability fixes and experimental
AVX512 support.

The current level of AVX512 support targets the new generation of Xeon Phi,
codename Knights Landing. It's implemented in two different ways: as a generic
and a native target. The generic target is similar to KNC support, requires the
Intel C/C++ Compiler (15.0 and newer), and is available in the regular ISPC
build, which is based on LLVM 3.6.1. For the native AVX512 target, we have a
separate ISPC build, which is based on LLVM trunk (3.7). This build is less
stable and has several known issues. Nevertheless, if you are interested in
AVX512 support for your code, we encourage you to try it and report the bugs.
We are actively working with LLVM maintainers to fix all AVX512 bugs, so your
feedback is important for us and will ensure that bugs affecting your code are
fixed by the LLVM 3.7 release.

Other notable changes and fixes include:

* Broadwell support via --cpu=broadwell.

* Changed cpu naming to accept cpu codenames. Check help for more details.

* --cpu switch disallowed in multi-target mode.

* Alignment of structure fields (in generated header files) is changed to be
more consistent regardless of the C/C++ compiler used.

* --dllexport switch is added on Windows to make non-static functions DLL
exported.

* --print-target switch is added to dump details of the LLVM target machine.
This may help you to debug issues with code generation for an incorrect target
(or, more likely, to ensure that code generation is done right).

* A bug was fixed, which triggered uniform statements to be executed with an
all-off mask under some circumstances.

* The restriction on using some uniform types as return types in multi-target
mode with targets of different widths was relaxed.

Also, if you are using ISPC for code generation for the current generation of
Xeon Phi (Knights Corner), the following changes are for you:

* A bunch of stability fixes for KNC.

* A bug, which affects projects with multiple ISPC source files compiled with
the generic target, is fixed. As a side effect, you may see multiple warnings
about unused static functions - you need to add the "-wd177" switch to ICC when
compiling generic output files.

The release includes LLVM 3.6.1 binaries for Linux, MacOS, Windows and a
Windows based cross-compiler for Sony PlayStation4, plus an LLVM 3.5 based
experimental Linux binary with NVPTX support (now also supporting K80).

Native AVX512 support is available in the set of less stable LLVM 3.7 based
binaries for Linux, MacOS and Windows.

=== v1.8.1 === (31 December 2014)

A minor update of ``ispc`` with several important stability fixes, namely:

* The auto-dispatch mechanism is fixed in pre-built Linux binaries (it used to
select too conservative a target).

* A compile crash with "-O2 -g" is fixed.

Also KNC (Xeon Phi) support is further improved.

The release includes an experimental build for the Sony PlayStation4 target
(Windows cross compiler), as well as experimental NVPTX support (64 bit Linux
binaries only). Note that there might be NVPTX compilation failures with
CUDA 7.0.

Similar to 1.8.0, all binaries are based on LLVM 3.5. MacOS binaries are built
for MacOS 10.9 Mavericks. Linux binaries are compatible with kernel 2.6.32
(ok for RHEL6) and later.

=== v1.8.0 === (16 October 2014)

A major new version of ISPC, which introduces experimental support for the
NVPTX target, brings numerous improvements to our KNC (Xeon Phi) support,
introduces debugging support on Windows and fixes several bugs. We also ship an
experimental build for the Sony PlayStation4 target in this release. Binaries
for all platforms are based on LLVM 3.5.

Note that MacOS binaries are built for MacOS 10.9 Mavericks. Linux binaries are
compatible with kernel 2.6.32 (ok for RHEL6) and later.

More details:

* Experimental NVPTX support is available for users of our binary distribution
on Linux only at the moment. MacOS and Windows users willing to experiment
with this target are welcome to build it from source. Note that the GPU imposes
some limitations on the ISPC language, which are discussed in the corresponding
section of the ISPC User's Guide. The implementation of NVPTX support was done
by our contributor Evghenii Gaburov.

* KNC support was greatly extended in the knc.h header file. Beyond new
features, there are stability fixes and changes for icc 15.0 compatibility.
Stdlib prefetch functions were improved to map to KNC vector prefetches.

* The PS4 experimental build is a Windows to PS4 cross compiler, which disables
arch and cpu selection (which are preset to the PS4 hardware).

* Debug info support on Windows (compatible with VS2010, VS2012 and VS2013).

* Critical bug fix, which caused code generation for an incorrect target,
despite explicit target switches, under some conditions.

* Stability fix for the bug, which caused the print() function to execute under
an all-off mask under some conditions.

=== v1.7.0 === (18 April 2014)

A major new version of ISPC with several language and library extensions and
fixes in debug info support. Binaries for all platforms are based on a patched
version of LLVM 3.4. There are also performance improvements beyond the
switchover to LLVM 3.4.

The list of language and library changes:

* Support for varying types in exported functions was added. See the
documentation for more details.

* The get_programCount() function was moved from stdlib.ispc to
examples/util/util.isph, which needs to be included somewhere in your
project if you want to use it.

* Library functions for saturated arithmetic were added. add/sub/mul/div
operations are supported for signed and unsigned 8/16/32/64 integer types
(both uniform and varying); see the sketch after this list.

* The algorithm for selecting an overloaded function was extended to cover more
types of overloading. Handling of reference types in overloaded functions was
fixed. The rules for selecting the best match were changed to match C++,
which requires the function to be the best match for all parameters. In
ambiguous cases, a warning is issued, but it will be converted to an error
in the next release.

* Explicit typecasts between any two reference types were allowed.

* An implicit cast of a pointer to const type to void* was disallowed.
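
A sketch of the saturated arithmetic routines in use (we believe the stdlib names follow a saturating_add/saturating_sub/saturating_mul/saturating_div pattern, but treat the exact names as an assumption):

    void saturate_demo() {
        uniform int8 a = 100, b = 100;
        // Plain addition of these int8 values would wrap around;
        // the saturating form clamps the result to 127.
        uniform int8 s = saturating_add(a, b);
    }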

The list of other notable changes is:

* A number of fixes for better debug info support.

* A memory corruption bug was fixed, which caused rare, non-reproducible
compile-time failures.

* Alias analysis was enabled (more aggressive optimizations are expected).

* A bug involving inaccurate handling of the "const" qualifier was fixed. As a
result, more "const" qualifiers may appear in .h files, which may cause
compilation errors.

=== v1.6.0 === (19 December 2013)

A major new version of ISPC with major improvements in performance and
stability. Linux and MacOS binaries are based on a patched version of LLVM 3.3,
while the Windows version is based on LLVM 3.4rc3. LLVM 3.4 significantly
improves stability on the Win32 platform, so we've decided not to wait for the
official LLVM 3.4 release.

The list of the most significant changes is:

* A new avx1-i32x4 target was added. It may work well for you if you are
focused on integer computations or if the FP unit in your hardware is 128 bits
wide.

* Support for calculations in double precision was extended with two new
targets, avx1.1-i64x4 and avx2-i64x4.

* Language support for overloaded operators was added (see the sketch after
this list).

* A new library shift() function was added, which is similar to rotate(), but
is non-circular.

* The language was extended to accept 3-dimensional tasking - syntactic sugar
which may facilitate programming of some tasks.

* A regression, which broke --opt=force-aligned-memory, is fixed.
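
Minimal sketches of the three language-level additions above (hypothetical names; the bracketed 3D launch syntax and shift()'s handling of vacated lanes are our reading of the feature, so treat the details as assumptions):

    struct Complex { float re, im; };

    // Overloaded operator: element-wise complex addition.
    Complex operator+(Complex a, Complex b) {
        Complex r;
        r.re = a.re + b.re;
        r.im = a.im + b.im;
        return r;
    }

    task void process() {
        // taskIndex0/1/2 and taskCount0/1/2 give per-dimension task IDs.
    }

    void demo() {
        varying int v = programIndex;
        varying int rot = rotate(v, 1);  // circular shuffle across the gang
        varying int shf = shift(v, 1);   // like rotate(), but non-circular
        launch[4, 4, 2] process();       // 3-dimensional task launch
    }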

If you are not using pre-built binaries, you may notice the following changes:

* VS2012/VS2013 are supported.

* alloy.py (with the -b switch) can build LLVM for you on any platform now
(except MacOS 10.9, but we know about the problem and are working on it).
This is the preferred way to build LLVM for ISPC, as all required patches for
better performance and stability will be applied automatically.

* LLVM 3.5 (current trunk) is supported.

There are also multiple fixes for better performance and stability; the most
notable are:

* Fixed a performance problem for x2 targets.

* Fixed a problem with incorrect vzeroupper insertion on the AVX target on
Win32.

=== v1.5.0 === (27 September 2013)

A major new version of ISPC with several new targets and important bug fixes.
Here's a list of the most important changes, if you are using pre-built
binaries (which are based on a patched version of LLVM 3.3):

* The naming of targets was changed to explicitly include the data type width
and the number of threads in the gang. For example, avx2-i32x8 is an avx2
target, which uses 32 bit types as a base and has 8 threads in a gang. The old
naming scheme is still supported, but deprecated.

* New SSE4 targets for calculations based on 8 bit and 16 bit data types:
sse4-i8x16 and sse4-i16x8.

* A new AVX1 target for calculations based on 64 bit data types: avx1-i64x4.

* SVML support was extended and improved.

* The behavior of the -g switch was changed to not affect the optimization
level.

* The ISPC debug infrastructure was redesigned. See --help-dev for more info
and enjoy the capabilities of the new --debug-phase=<value> and
--off-phase=<value> switches.

* Fixed an auto-dispatch bug, which caused AVX code execution when the OS
doesn't support AVX (but the hardware does).

* Fixed a bug, which discarded the uniform/varying keyword in typedefs.

* Several performance regressions were fixed.

If you are building ISPC yourself, then the following changes are also
available to you:

* --cpu=slm for targeting Intel Atom codename Silvermont (if LLVM 3.4 is used).

* ARM NEON targets are available (if enabled in the build system).

* --debug-ir=<value> is available to generate debug information based on LLVM
IR (if LLVM 3.4 is used). In the debugger you'll see LLVM IR instead of source
code.

* A redesigned and improved test and configuration management system is
available to facilitate the process of building LLVM and testing the ISPC
compiler.

Standard library changes/fixes:

* The __pause() function was removed from the standard library.

* Fixed the reduce_[min|max]_[float|double] intrinsics, which were producing
incorrect code under some conditions.

Language changes:

* By default a floating point constant without a suffix is a single precision
constant (32 bit). A new suffix "d" was introduced to allow double precision
constants (64 bit); see the sketch below. Please refer to
tests/double-consts.ispc for syntax examples.
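
A minimal sketch of the new constant syntax:

    uniform float  f = 3.14;   // no suffix: single precision (32 bit)
    uniform double d = 3.14d;  // "d" suffix: double precision (64 bit)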

=== v1.4.4 === (19 July 2013)

A minor version update with several stability fixes requested by customers.

=== v1.4.3 === (25 June 2013)

A minor version update with several stability improvements:

* Two bugs were fixed (including a bug in LLVM) to improve stability on 32 bit
platforms.

* A bug affecting several examples was fixed.

* The --instrument switch is fixed.

All tests and examples now properly compile and execute on native targets on
Unix platforms (Linux and MacOS).

=== v1.4.2 === (11 June 2013)

A minor version update with a few important changes:

* Stability fix for the AVX2 target (Haswell) - the fix for a problem with
gather instructions was released in LLVM 3.4; if you build with LLVM 3.2 or
3.3, it's available in our repository
(llvm_patches/r183327-AVX2-GATHER.patch) and needs to be applied manually.

* Stability fix for a widespread issue on the Win32 platform (#503).

* Performance improvements for Xeon Phi related to mask representation.

Also, LLVM 3.3 has been released and is now the recommended version for
building ISPC. Precompiled binaries are also built with LLVM 3.3.

=== v1.4.1 === (28 May 2013)

A major new version of ispc has been released with stability and performance
improvements on all supported platforms (Windows, Linux and MacOS).
This version supports LLVM 3.1, 3.2, 3.3 and 3.4. The released binaries are
built with 3.2.

New compiler features:

* ISPC memory allocation returns aligned memory, with the platform's natural
alignment of vector registers, by default. Alignment can also be managed via
--force-alignment=<value>.

Important bug fixes/changes:

* ISPC was fixed to be fully functional when built by GCC 4.7.

* Major cleanup of build and test scripts on Windows.

* Gather/scatter performance improvements on Xeon Phi.

* FMA instructions are enabled for the AVX2 instruction set.

* Support for the RDRAND instruction, when available, via the library function
rdrand (Ivy Bridge).

The release also contains numerous bug fixes and minor improvements.

=== v1.3.0 === (29 June 2012)

This is a major new release of ispc, with support for more compilation
targets and a number of additions to the language. As usual, the quality
of generated code has also been improved in a number of cases and a number
of small bugs have been fixed.

New targets:

* This release provides "beta" support for compiling to the Intel® Xeon
Phi™ processor, code named Knights Corner, the first processor in
the Intel® Many Integrated Core Architecture. See
http://ispc.github.com/ispc.html#compiling-for-the-intel-xeon-phi-architecture
for more details on this support.

* This release also has an "avx1.1" target, which provides support for the
new instructions in the Intel Ivy Bridge microarchitecture.

New language features (short sketches of all three appear after this list):

* The foreach_active statement allows iteration over the active program
instances in a gang. (See
http://ispc.github.com/ispc.html#iteration-over-active-program-instances-foreach-active)

* foreach_unique allows iterating over subsets of program instances in a
gang that share the same value of a variable. (See
http://ispc.github.com/ispc.html#iteration-over-unique-elements-foreach-unique)

* An "unmasked" function qualifier and statement in the language allow
re-activating execution of all program instances in a gang. (See
http://ispc.github.com/ispc.html#re-establishing-the-execution-mask)
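
Minimal sketches of the three constructs (hypothetical names; see the linked documentation for the full semantics):

    unmasked void clear(uniform float a[], uniform int count) {
        // "unmasked" re-establishes execution of all program instances.
        foreach (i = 0 ... count)
            a[i] = 0;
    }

    void demo(varying float x, varying int key, uniform float table[]) {
        uniform float total = 0;
        foreach_active (i) {
            // The body runs once per active program instance; i is its index.
            total += extract(x, (uniform int32)i);
        }
        foreach_unique (k in key) {
            // The body runs once per unique value of key; k is uniform here.
            uniform float v = table[k];
        }
    }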

Standard library updates:

* The seed_rng() function has been modified to take a "varying" seed value
when a varying RNGState is being initialized.

* An isnan() function has been added, to check for floating-point "not a
number" values.

* The float_to_srgb8() routine does high performance conversion of
floating-point color values to SRGB8 format.

Other changes:

* A number of bugfixes have been made for compiler crashes with malformed
programs.

* Floating-point comparisons are now "unordered", so that any comparison
where one of the operands is a "not a number" value returns false. (This
matches standard IEEE floating-point behavior.)

* The code generated for 'break' statements in "varying" loops has been
improved for some common cases.

* Compile time and compiler memory use have both been improved,
particularly for large input programs.

* A number of bugs have been fixed in the debugging information generated
by the compiler when the "-g" command-line flag is used.

=== v1.2.2 === (20 April 2012)

This release includes a number of small additions to functionality and a
number of bugfixes. New functionality includes:

* It's now possible to forward declare structures as in C/C++: "struct
Foo;". After such a declaration, structs with pointers to "Foo" and
functions that take pointers or references to Foo structs can be declared
without the entire definition of Foo being available (see the sketch after
this list).

* New built-in types size_t, ptrdiff_t, and [u]intptr_t are now available,
corresponding to the equivalent types in C.

* The standard library now provides atomic_swap*() and
atomic_compare_exchange*() functions for void * types.

* The C++ backend has seen a number of improvements to the quality and
readability of generated code.
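
A sketch of the forward-declaration pattern described above (hypothetical names):

    struct Foo;                        // forward declaration
    struct List { Foo *head; };        // pointers to Foo are fine without its definition
    void visit(Foo &f);                // so are references in function declarations
    struct Foo { int v; Foo *next; };  // the full definition can come later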

A number of bugs have been fixed in this release as well. The most
significant are:

* Fixed a bug where nested loops could cause a compiler crash in some
circumstances (issues #240 and #229)

* Gathers could access invalid memory (and cause the program to crash) in
some circumstances (#235)

* References to temporary values are now handled properly when passed to a
function that takes a reference typed parameter.

* A case where incorrect code could be generated for compile-time-constant
initializers has been fixed (#234).

=== v1.2.1 === (6 April 2012)

This release contains only minor new functionality; it mostly consists of
many small bugfixes and improvements to error handling and error reporting.
The new functionality that is present is:

* Significantly more efficient versions of the float / half conversion
  routines are now available in the standard library, thanks to Fabian
  Giesen.

* The last member of a struct can now be a zero-length array; this allows
  the trick of dynamically allocating enough storage for the struct and
  some number of array elements at the end of it.

Significant bugs fixed include:

* Issue #205: When a target ISA isn't specified, use the host system's
  capabilities to choose a target for which it will be able to run the
  generated code.

* Issues #215 and #217: Don't allocate storage for global variables that
  are declared "extern".

* Issue #197: Allow NULL as a default argument value in a function
  declaration.

* Issue #223: Fix bugs where taking the address of a function wouldn't work
  as expected.

* Issue #224: When there are overloaded variants of a function that take
  both reference and const reference parameters, give the non-const
  reference preference when matching values of that underlying type.

* Issue #225: An error is issued when a varying lvalue is assigned to a
  reference type (rather than crashing).

* Issue #193: Permit conversions from array types to void *, not just the
  pointer type of the underlying array element.

* Issue #199: Still evaluate expressions that are cast to (void).

The documentation has also been improved, with FAQs added to clarify some
aspects of the ispc pointer model.

=== v1.2.0 === (20 March 2012)

This is a major new release of ispc, with a number of significant
improvements to functionality, performance, and compiler robustness. It
does, however, include three small changes to language syntax and semantics
that may require changes to existing programs:

* Syntax for the "launch" keyword has been cleaned up; it's now no longer
  necessary to bracket the launched function call with angle brackets.
  (In other words, now use "launch foo();", rather than "launch < foo() >;".)

* When using pointers, the pointed-to data type is now "uniform" by
  default. Use the varying keyword to specify varying pointed-to types when
  needed. (i.e. "float *ptr" is a varying pointer to uniform float data,
  whereas previously it was a varying pointer to varying float values.)
  Use "varying float *" to specify a varying pointer to varying float data,
  and so forth.

* The details of "uniform" and "varying" and how they interact with struct
  types have been cleaned up. Now, when a struct type is declared, if the
  struct elements don't have explicit "uniform" or "varying" qualifiers,
  they are said to have "unbound" variability. When a struct type is
  instantiated, any unbound variability elements inherit the variability of
  the parent struct type. See http://ispc.github.com/ispc.html#struct-types
  for more details.

ispc has a new language feature that makes it much easier to use the
efficient "(array of) structure of arrays" (AoSoA, or SoA) memory layout of
data. A new "soa<n>" qualifier can be applied to structure types to
specify an n-wide SoA version of the corresponding type. Array indexing
and pointer operations with arrays of SoA types automatically handle the
two-stage indexing calculation to access the data. See
http://ispc.github.com/ispc.html#structure-of-array-types for more details.

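A hypothetical sketch of the soa<n> qualifier (the type and variable names
are illustrative):

    struct Point { float x, y, z; };

    // 8-wide SoA layout: storage is x[8], y[8], z[8], repeated.
    soa<8> Point pts[64];

    void scaleX(uniform float s, int i) {
        // Indexing performs the two-stage (outer, inner) address
        // calculation automatically.
        pts[i].x *= s;
    }
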
For more efficient access of data that is still in "array of structures"
(AoS) format, ispc has a new "memory coalescing" optimization that
automatically detects series of strided loads and/or gathers that can be
transformed into a more efficient set of vector loads and shuffles. A
diagnostic is emitted when this optimization is successfully applied.

Smaller changes in this release:

* The standard library now provides memcpy(), memmove() and memset()
  functions, as well as single-precision asin() and acos() functions.

* -I can now be specified on the command-line to specify a search path for
  #include files.

* A number of improvements have been made to error reporting from the
  parser, and a number of cases where malformed programs could cause the
  compiler to crash have been fixed.

* A number of small improvements to the quality and performance of generated
  code have been made, including finding more cases where 32-bit addressing
  calculations can be safely done on 64-bit systems and generating better
  code for initializer expressions.

=== v1.1.4 === (4 February 2012)

There are two major bugfixes for Windows in this release. First, a number
of failures in AVX code generation on Windows have been fixed; AVX on
Windows now has no known issues. Second, a longstanding bug in parsing 64-bit
integer constants on Windows has been fixed.

This release features a new experimental scalar target, contributed by Gabe
Weisz <gweisz@cs.cmu.edu>. This target ("--target=generic-1") compiles
gangs of single program instances (i.e. programCount == 1); it can be
useful for debugging ispc programs.

The compiler now supports dynamic memory allocation in ispc programs (with
"new" and "delete" operators based on C++). See
http://ispc.github.com/ispc.html#dynamic-memory-allocation in the
documentation for more information.

ispc now performs "short circuit" evaluation of the || and && logical
operators and the ? : selection operator. (This represents the correction
of a major incompatibility with C.) Code like "(index < arraySize &&
array[index] == 1)" thus now executes as in C, where "array[index]" won't
be evaluated unless "index" is less than "arraySize".

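For example (an illustrative sketch; the function and names are
hypothetical), short-circuit && now makes bounds-checked indexing safe, as
in C:

    float lookup(uniform float array[], uniform int arraySize, int index) {
        // array[index] is only evaluated for program instances where
        // index < arraySize, so no out-of-bounds access occurs.
        if (index < arraySize && array[index] > 0.)
            return array[index];
        return 0.;
    }
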
The standard library now provides "local" atomic operations, which are
atomic across the gang of program instances (but not across other gangs or
other hardware threads). See the updated documentation on atomics for more
information:
http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences.

The standard library now offers a clock() function, which returns a uniform
int64 value that counts processor cycles; it can be used for
fine-resolution timing measurements.

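A minimal timing sketch using clock():

    uniform int64 start = clock();
    // ... computation to be measured ...
    uniform int64 cycles = clock() - start;
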
Finally (of limited interest now): ispc now supports the forthcoming AVX2
instruction set, due with Haswell-generation CPUs. All tests and examples
compile and execute correctly with AVX2. (Thanks specifically to Craig
Topper and Nadav Rotem for work on AVX2 support in LLVM, which made this
possible.)

=== v1.1.3 === (20 January 2012)

With this release, the language now supports "switch" statements, with the
same semantics and syntax as in C.

This release includes fixes for two important performance-related issues:
the quality of code generated for "foreach" statements has been
substantially improved (https://github.com/ispc/ispc/issues/151), and a
performance regression with code for "gathers" that was introduced in
v1.1.2 has been fixed in this release.

A number of other small bugs were fixed in this release as well, including
one where invalid memory would sometimes be incorrectly accessed
(https://github.com/ispc/ispc/issues/160).

Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.

=== v1.1.2 === (9 January 2012)

The major new feature in this release is support for "generic" C++
vectorized output; in other words, ispc can emit C++ code that corresponds
to the vectorized computation that the ispc program represents. See the
examples/intrinsics directory in the ispc distribution for two example
implementations of the set of functions that must be provided to map the
vector calls generated by ispc to target-specific functions.

ispc now has partial support for 'goto' statements; specifically, goto is
allowed if any enclosing control flow statements (if/for/while/do) have
'uniform' test expressions, but not if they have 'varying' tests.

A number of improvements have been made to the code generated for gathers
and scatters--one of them (better matching x86's "free" scale by 2/4/8 for
addressing calculations) improved the performance of the noise example by
14%.

Many small bugs have been fixed in this release as well, including issue
numbers 138, 129, 135, 127, 149, and 142.

=== v1.1.1 === (15 December 2011)

This release doesn't include any significant new functionality, but does
include small improvements in generated code and a number of bug fixes.

The one user-visible language change is that integer constants may be
specified with 'u' and 'l' suffixes, like in C. For example, "1024llu"
defines the constant with unsigned 64-bit type.

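For example (illustrative declarations):

    uniform unsigned int a = 7u;         // unsigned 32-bit constant
    uniform unsigned int64 b = 1024llu;  // unsigned 64-bit constant
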
More informative and useful error messages are printed when function
overload resolution fails.

Masking is avoided in additional cases when the mask can be statically
determined to be all on.

A number of small bugs have been fixed:

- Under some circumstances, incorrect masks were used when assigning a
  value to a reference and when doing gathers/scatters.

- Incorrect code could be generated in some cases when some instances
  returned part way through a function but others continued executing.

- Type checking wasn't being performed for calls through function pointers;
  now an error is issued if the arguments don't match up, etc.

- Incorrect code was being generated for gather/scatter to structs that had
  elements with varying short-vector types.

- Typechecking wasn't being performed for "foreach" statements; this led to
  problems like function overload resolution not being performed if an
  overloaded function call was used to determine the iteration range.

- A number of symbols would be multiply-defined when compiling to multiple
  targets and using the sse2-x2 target as one of them (issue #131).

=== v1.1.0 === (5 December 2011)

This is a major new release of the compiler, with significant additions to
language functionality and capabilities. It includes a number of small
language syntax changes that will require modification of existing
programs. These changes should generally be straightforward and all are
steps toward eliminating parts of ispc syntax that are incompatible with
C/C++. See
http://ispc.github.com/ispc.html#updating-ispc-programs-for-changes-in-ispc-1-1
for more information about these changes.

ispc now fully supports pointers, including pointer arithmetic, implicit
conversions of arrays to pointers, and all of the other capabilities of
pointers in C. See http://ispc.github.com/ispc.html#pointer-types for more
information about pointers in ispc and
http://ispc.github.com/ispc.html#function-pointer-types for information
about function pointers in ispc.

Reference types are now declared with C++ syntax (e.g. "const float &foo").

ispc now supports 64-bit addressing. For performance reasons, this
capability is disabled by default (even on 64-bit targets), but can be
enabled with a command-line flag:
http://ispc.github.com/ispc.html#selecting-32-or-64-bit-addressing.

This release features new parallel "foreach" statements, which often make
it easier to map program instances to data for data-parallel computation
than the programIndex/programCount mechanism does:
http://ispc.github.com/ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled.

Finally, all of the system's documentation has been significantly revised.
The documentation of ispc's parallel execution model has been rewritten:
http://ispc.github.com/ispc.html#the-ispc-parallel-execution-model, and
there is now a more specific discussion of similarities and differences
between ispc and C/C++:
http://ispc.github.com/ispc.html#relationship-to-the-c-programming-language.
There is now a separate FAQ (http://ispc.github.com/faq.html), and a
Performance Guide (http://ispc.github.com/perfguide.html).

=== v1.0.12 === (20 October 2011)

This release includes a new "double-pumped" 8-wide target for SSE2,
"sse2-x2". Like the sse4-x2 and avx-x2 targets, this target may deliver
higher performance for some workloads than the regular sse2 target. (For
other workloads, it may be slower.)

The ispc language now includes an "assert()" statement. See
http://ispc.github.com/ispc.html#assertions for more information.

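An illustrative use of assert() (the function here is hypothetical):

    float divide(float a, float b) {
        assert(b != 0.);  // triggers if false for any executing instance
        return a / b;
    }
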
The compiler now sets a preprocessor #define based on the target ISA; for
example, ISPC_TARGET_SSE4 is defined for the sse4 targets, and so forth.

The standard library now provides high-performance routines for converting
between some "array of structures" and "structure of arrays" formats.
See
http://ispc.github.com/ispc.html#converting-between-array-of-structures-and-structure-of-arrays-layout
for more information.

Inline functions now have static linkage.

A number of improvements have been made to the optimization passes that
detect when gathers and scatters can be transformed into vector loads and
stores, respectively. In particular, these passes now handle variables that
are used as loop induction variables much better.

=== v1.0.11 === (6 October 2011)

The main new feature in this release is support for generating code for
@@ -1,6 +1,17 @@
#!/bin/bash

rst2html.py ispc.txt > ispc.html
rst2html=rst2html.py

for i in ispc perfguide faq; do
    $rst2html --template=template.txt --link-stylesheet \
        --stylesheet-path=css/style.css $i.rst > $i.html
done

$rst2html --template=template-news.txt --link-stylesheet \
    --stylesheet-path=css/style.css news.rst > news.html

$rst2html --template=template-perf.txt --link-stylesheet \
    --stylesheet-path=css/style.css perf.rst > perf.html

#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
#pdflatex ispc.tex

879
docs/faq.rst
Normal file
@@ -0,0 +1,879 @@
=====================================
Frequently Asked Questions About ispc
=====================================

This document includes a number of frequently (and not frequently) asked
questions about ispc, the Intel® SPMD Program Compiler. The source to this
document is in the file ``docs/faq.rst`` in the ``ispc`` source
distribution.

* Understanding ispc's Output

  + `How can I see the assembly language generated by ispc?`_
  + `How can I have the assembly output be printed using Intel assembly syntax?`_
  + `Why are there multiple versions of exported ispc functions in the assembly output?`_
  + `How can I more easily see gathers and scatters in generated assembly?`_

* Running The Compiler

  + `Why is it required to use one of the "generic" targets with C++ output?`_
  + `Why won't the compiler generate an object file or assembly output with the "generic" targets?`_

* Language Details

  + `What is the difference between "int *foo" and "int foo[]"?`_
  + `Why are pointed-to types "uniform" by default?`_
  + `Why am I getting an error about assigning a varying lvalue to a reference type?`_

* Interoperability

  + `How can I supply an initial execution mask in the call from the application?`_
  + `How can I generate a single binary executable with support for multiple instruction sets?`_
  + `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
  + `Is it possible to inline ispc functions in C/C++ code?`_
  + `Why is it illegal to pass "varying" values from C/C++ to ispc functions?`_

* Programming Techniques

  + `What primitives are there for communicating between SPMD program instances?`_
  + `How can a gang of program instances generate variable amounts of output efficiently?`_
  + `Is it possible to use ispc for explicit vector programming?`_
  + `How can I debug my ispc programs using Valgrind?`_
  + `foreach statements generate more complex assembly than I'd expect; what's going on?`_
  + `How do I launch an individual task for each active program instance?`_

Understanding ispc's Output
===========================

How can I see the assembly language generated by ispc?
------------------------------------------------------

The ``--emit-asm`` flag causes assembly output to be generated. If the
``-o`` command-line flag is also supplied, the assembly is stored in the
given file, or printed to standard output if ``-`` is specified for the
filename. For example, given the simple ``ispc`` program:

::

    export uniform int foo(uniform int a, uniform int b) {
        return a+b;
    }

If the SSE4 target is used, then the following assembly is printed:

::

    _foo:
        addl    %esi, %edi
        movl    %edi, %eax
        ret

How can I have the assembly output be printed using Intel assembly syntax?
--------------------------------------------------------------------------

The ``ispc`` compiler is currently only able to emit assembly with AT&T
syntax, where the destination operand is the last operand after an
instruction. If you'd prefer Intel assembly output, one option is to use
Agner Fog's ``objconv`` tool: have ``ispc`` emit a native object file and
then use ``objconv`` to disassemble it, specifying the assembler syntax
that you prefer. ``objconv`` `is available for download here`_.

.. _is available for download here: http://www.agner.org/optimize/#objconv

Why are there multiple versions of exported ispc functions in the assembly output?
----------------------------------------------------------------------------------

Two versions of each function qualified with ``export`` are generated: one
of them is to be called by other ``ispc`` functions, and the other is to be
called by the application. The application-callable function has the
original function's name, while the ``ispc``-callable function has a
mangled name that encodes the types of the function's parameters.

The crucial difference between these two functions is that the
application-callable function doesn't take a parameter encoding the current
execution mask, while ``ispc``-callable functions have a hidden mask
parameter. An implication of this difference is that the ``export``
function starts with the execution mask "all on". This allows a number of
improvements in the generated code, particularly on architectures that
don't have support for masked load and store instructions.

As an example, consider this short function, which loads a vector's worth
of values from two arrays in memory, adds them, and writes the result to an
output array.

::

    export void foo(uniform float a[], uniform float b[],
                    uniform float result[]) {
        float aa = a[programIndex], bb = b[programIndex];
        result[programIndex] = aa+bb;
    }

Here is the assembly code for the application-callable instance of the
function.

::

    _foo:
        movups  (%rsi), %xmm1
        movups  (%rdi), %xmm0
        addps   %xmm1, %xmm0
        movups  %xmm0, (%rdx)
        ret

And here is the assembly code for the ``ispc``-callable instance of the
function.

::

    "_foo___uptr<Uf>uptr<Uf>uptr<Uf>":
        movmskps    %xmm0, %eax
        cmpl    $15, %eax
        je  LBB0_3
        testl   %eax, %eax
        jne LBB0_4
        ret
    LBB0_3:
        movups  (%rsi), %xmm1
        movups  (%rdi), %xmm0
        addps   %xmm1, %xmm0
        movups  %xmm0, (%rdx)
        ret
    LBB0_4:
        ####
        #### Code elided; handle mixed mask case..
        ####
        ret

There are a few things to notice in this code. First, the current program
mask comes in via the ``%xmm0`` register, and the initial few instructions
in the function essentially check to see if the mask is all on or all off.
If the mask is all on, the code at the label LBB0_3 executes; it's the same
as the code that was generated for ``_foo`` above. If the mask is all off,
then there's nothing to be done, and the function can return immediately.

In the case of a mixed mask, a substantial amount of code is generated to
load from and then store to only the array elements that correspond to
program instances where the mask is on. (This code is elided in the
listing above.) This general pattern of having two code paths for the
"all on" and "mixed" mask cases is used in the code generated for all but
the simplest functions (where the overhead of the test isn't worthwhile).

How can I more easily see gathers and scatters in generated assembly?
---------------------------------------------------------------------

Because CPU vector ISAs don't have native gather and scatter instructions,
these memory operations are turned into sequences of instructions in the
code that ``ispc`` generates. In some cases, it can be useful to see where
gathers and scatters actually happen in code; there is an otherwise
undocumented command-line flag that provides this information.

Consider this simple program:

::

    void set(uniform int a[], int value, int index) {
        a[index] = value;
    }

When compiled normally to the SSE4 target, this program generates this
extensive code sequence, which makes it more difficult to see what the
program is actually doing.

::

    "_set___uptr<Ui>ii":
        pmulld  LCPI0_0(%rip), %xmm1
        movmskps    %xmm2, %eax
        testb   $1, %al
        je  LBB0_2
        movd    %xmm1, %ecx
        movd    %xmm0, (%rcx,%rdi)
    LBB0_2:
        testb   $2, %al
        je  LBB0_4
        pextrd  $1, %xmm1, %ecx
        pextrd  $1, %xmm0, (%rcx,%rdi)
    LBB0_4:
        testb   $4, %al
        je  LBB0_6
        pextrd  $2, %xmm1, %ecx
        pextrd  $2, %xmm0, (%rcx,%rdi)
    LBB0_6:
        testb   $8, %al
        je  LBB0_8
        pextrd  $3, %xmm1, %eax
        pextrd  $3, %xmm0, (%rax,%rdi)
    LBB0_8:
        ret

If this program is compiled with the
``--opt=disable-handle-pseudo-memory-ops`` command-line flag, then the
scatter is left as an unresolved function call. The resulting program
won't link, due to the unresolved symbol, but the assembly output is much
easier to understand:

::

    "_set___uptr<Ui>ii":
        movaps  %xmm0, %xmm3
        pmulld  LCPI0_0(%rip), %xmm1
        movdqa  %xmm1, %xmm0
        movaps  %xmm3, %xmm1
        jmp ___pseudo_scatter_base_offsets32_32 ## TAILCALL

Running The Compiler
====================

Why is it required to use one of the "generic" targets with C++ output?
-----------------------------------------------------------------------

The C++ output option transforms the provided ``ispc`` program source into
C++ code where each basic operation in the program (addition, comparison,
etc.) is represented as a function call to an as-yet-undefined function,
chaining the results of these calls together to perform the required
computations. It is then expected that the user will provide the
implementation of these functions via a header file with ``inline``
functions defined for each of these functions and then use a C++ compiler
to generate a final object file. (Examples of these headers include
``examples/intrinsics/sse4.h`` and ``examples/intrinsics/knc.h`` in the
``ispc`` distribution.)

If a target other than one of the "generic" ones is used with C++ output,
then the compiler will transform certain operations into particular code
sequences that may not be desired for the actual final target; for example,
SSE targets that don't have hardware "gather" instructions will transform a
gather into a sequence of scalar load instructions. When this in turn is
transformed to C++ code, the fact that the loads were originally a gather
is lost, and the header file of function definitions wouldn't have a chance
to map the "gather" to a target-specific operation, as the ``knc.h`` header
does, for example. Thus, the "generic" targets exist to provide basic
targets of various vector widths, without imposing any limitations on the
final target's capabilities.

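For reference, a compilation pipeline along these lines might look like the
following (a hypothetical sketch: the file names are illustrative, and the
exact flag spellings should be checked against ``ispc --help``):

::

    ispc --target=generic-4 --emit-c++ foo.ispc -o foo_ispc.cpp
    c++ -O2 -c foo_ispc.cpp -o foo_ispc.o
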
Why won't the compiler generate an object file or assembly output with the "generic" targets?
---------------------------------------------------------------------------------------------

As described in the above FAQ entry, when compiling to the "generic"
targets, ``ispc`` generates vector code for the source program that
transforms every basic operation in the program (addition, comparison,
etc.) into a separate function call.

While there is no fundamental reason that the compiler couldn't generate
target-specific object code with a function call to an undefined function
for each primitive operation, doing so wouldn't actually be useful in
practice--providing definitions of these functions in a separate object
file and actually performing function calls for each of them (versus
turning them into inline function calls) would be a highly inefficient way
to run the program.

Therefore, in the interests of encouraging effective use of the system,
these types of output are disallowed.

Language Details
================

What is the difference between "int \*foo" and "int foo[]"?
-----------------------------------------------------------

In C and C++, declaring a function to take a parameter ``int *foo`` and
``int foo[]`` results in the same type for the parameter. Both are
pointers to integers. In ``ispc``, these are different types. The first
one is a varying pointer to a uniform integer value in memory, while the
second results in a uniform pointer to the start of an array of varying
integer values in memory.

To understand why the first is a varying pointer to a uniform integer,
first recall that types without explicit rate qualifiers (``uniform``,
``varying``, or ``soa<>``) are ``varying`` by default. Second, recall from
the `discussion of pointer types in the ispc User's Guide`_ that pointed-to
types without rate qualifiers are ``uniform`` by default. (This second
rule is discussed further below, in `Why are pointed-to types "uniform" by
default?`_.) The type of ``int *foo`` follows from these.

.. _discussion of pointer types in the ispc User's Guide: ispc.html#pointer-types

Conversely, in a function body, ``int foo[10]`` represents a declaration of
a 10-element array of varying ``int`` values. Since we'd certainly like
to be able to pass such an array to a function that takes an ``int []``
parameter, the natural type for an ``int []`` parameter is a uniform
pointer to varying integer values.

In terms of compatibility with C/C++, it's unfortunate that this
distinction exists, though any other set of rules seems to introduce more
awkwardness than this one. (We're interested to hear ideas to improve
these rules!)

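A compact restatement of the two parameter types (a sketch; the function
names are illustrative):

::

    void byPointer(int *p) {
        // "int *": a varying pointer to uniform int. Each program
        // instance may hold a different address, so dereferencing is a
        // gather.
        int v = *p;
    }

    void byArray(int a[]) {
        // "int []": a uniform pointer to varying int. a[0] is a single
        // gang-wide varying value, so accessing it is a vector load.
        int v = a[0];
    }
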
Why are pointed-to types "uniform" by default?
----------------------------------------------

In ``ispc``, types without rate qualifiers are "varying" by default, but
types pointed to by pointers without rate qualifiers are "uniform" by
default. Why this difference?

::

    int foo;          // no rate qualifier, "varying int".
    uniform int *foo; // pointer type has no rate qualifier, pointed-to does.
                      // "varying pointer to uniform int".
    int *foo;         // neither pointer type nor pointed-to type ("int") have
                      // rate qualifiers. Pointer type is varying by default,
                      // pointed-to is uniform. "varying pointer to uniform int".
    varying int *foo; // varying pointer to varying int

The first rule, having types without rate qualifiers be varying by default,
is a default that keeps the number of "uniform" or "varying" qualifiers in
``ispc`` programs low. Most ``ispc`` programs use mostly "varying"
variables, so this rule allows most variables to be declared without also
requiring rate qualifiers.

On a related note, this rule allows many C/C++ functions to be used to
define equivalent functions in the SPMD execution model that ``ispc``
provides with little or no modification:

::

    // scalar add in C/C++, SPMD/vector add in ispc
    int add(int a, int b) { return a + b; }

This motivation also explains why ``uniform int *foo`` represents a varying
pointer; having pointers be varying by default if they don't have rate
qualifiers similarly helps with porting code from C/C++ to ``ispc``.

The trickier issue is why pointed-to types are "uniform" by default. In our
experience, data in memory that is accessed via pointers is most often
uniform; this generally includes all data that has been allocated and
initialized by the C/C++ application code. In practice, "varying" types are
more generally (but not exclusively) used for local data in ``ispc``
functions. Thus, making the pointed-to type uniform by default leads to
more concise code for the most common cases.

Why am I getting an error about assigning a varying lvalue to a reference type?
--------------------------------------------------------------------------------

Given code like the following:

::

    uniform float a[...];
    int index = ...;
    float &r = a[index];

``ispc`` issues the error "Initializer for reference-type variable "r" must
have a uniform lvalue type." The underlying issue stems from how
references are represented in the code generated by ``ispc``. Recall that
``ispc`` supports both uniform and varying pointer types--a uniform pointer
points to the same location in memory for all program instances in the
gang, while a varying pointer allows each program instance to have its own
pointer value.

References are represented as a pointer in the code generated by ``ispc``,
though this is generally opaque to the user; in ``ispc``, they are
specifically uniform pointers. This design decision was made so that, given
code like this:

::

    extern void func(float &val);
    float foo = ...;
    func(foo);

the reference would be handled efficiently as a single pointer, rather
than unnecessarily being turned into a gang's worth of pointers.

However, an implication of this decision is that it's not possible for
references to refer to completely different things for each of the program
instances (and hence the error that is issued). In cases where a unique
per-program-instance pointer is needed, a varying pointer should be used
instead of a reference.

Interoperability
================

How can I supply an initial execution mask in the call from the application?
----------------------------------------------------------------------------

Recall that when execution transitions from the application code to an
``ispc`` function, all of the program instances are initially executing.
In some cases, it may be desired that only some of them are running, based
on a data-dependent condition computed in the application program. This
situation can easily be handled via an additional parameter from the
application.

As a simple example, consider a case where the application code has an
array of ``float`` values and we'd like the ``ispc`` code to update
just specific values in that array, where which of those values are to be
updated has been determined by the application. In C++ code, we might
have:

::

    int count = ...;
    float *array = new float[count];
    bool *shouldUpdate = new bool[count];
    // initialize array and shouldUpdate
    ispc_func(array, shouldUpdate, count);

Then, the ``ispc`` code could process this update as:

::

    export void ispc_func(uniform float array[], uniform bool update[],
                          uniform int count) {
        foreach (i = 0 ... count) {
            cif (update[i] == true) {
                // ... update array[i] ...
            }
        }
    }

(In this case a "coherent" if statement is likely to be worthwhile if the
``update`` array will tend to have sections that are either all-true or
all-false.)

How can I generate a single binary executable with support for multiple instruction sets?
-----------------------------------------------------------------------------------------

``ispc`` can generate output that supports multiple target instruction
sets, along with code that chooses the most appropriate one at runtime,
when multiple targets are specified with the ``--target`` command-line
argument.

For example, if you run the command:

::

    ispc foo.ispc -o foo.o --target=sse2,sse4-x2,avx-x2

then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
``foo_avx.o``, and ``foo.o``. [#]_ Link all of these into your executable,
and when you call a function in ``foo.ispc`` from your application code,
``ispc`` will determine which instruction sets are supported by the CPU the
code is running on and will call the most appropriate version of the
function available.

.. [#] Similarly, if you choose to generate assembly language output or
   LLVM bitcode output, multiple versions of those files will be created.

In general, the version of the function that runs will be the one for the
most general instruction set that is supported by the system. If you only
compile SSE2 and SSE4 variants and run on a system that supports AVX, for
example, then the SSE4 variant will be executed. If the system isn't able
to run any of the available variants of the function (for example, trying
to run a function that only has SSE4 and AVX variants on a system that only
supports SSE2), then the standard library ``abort()`` function will be
called.

One subtlety is that all non-static global variables (if any) must have the
same size and layout across all of the targets used. For example, if you
have the global variables:

::

    uniform int foo[2*programCount];
    int bar;

and compile to both SSE2 and AVX targets, both of these variables will have
different sizes (the first due to programCount having the value 4 for SSE2
and 8 for AVX, and the second due to ``varying`` types having different
numbers of elements with the two targets--essentially the same issue as the
first). ``ispc`` issues an error in this case.

How can I determine at run-time which vector instruction set's instructions were selected to execute?
-----------------------------------------------------------------------------------------------------

``ispc`` doesn't provide any API that allows querying which vector ISA's
instructions are running when multi-target compilation was used. However,
this can be solved in "user space" by writing a small helper function.
Specifically, if you implement a function like this:

::

    export uniform int isa() {
    #if defined(ISPC_TARGET_SSE2)
        return 0;
    #elif defined(ISPC_TARGET_SSE4)
        return 1;
    #elif defined(ISPC_TARGET_AVX)
        return 2;
    #else
        return -1;
    #endif
    }

and then call it from your application code at runtime, it will return 0,
1, or 2, depending on which target's instructions are running.

The way this works is a little surprising, but it's a useful trick. Of
course the preprocessor ``#if`` checks are all compile-time only
operations. What's actually happening is that the function is compiled
multiple times, once for each target, with the appropriate ``ISPC_TARGET``
preprocessor symbol set. Then, a small dispatch function is generated for
the application to actually call. This dispatch function in turn calls the
appropriate version of the function based on the CPU of the system it's
executing on, which in turn returns the appropriate value.

In a similar fashion, it's possible to find out at run-time the value of
``programCount`` for the target that's actually being used:

::

    export uniform int width() { return programCount; }

Is it possible to inline ispc functions in C/C++ code?
------------------------------------------------------

If you're willing to use the ``clang`` C/C++ compiler that's part of the
LLVM tool suite, then it is possible to inline ``ispc`` code with C/C++
(and conversely, to inline C/C++ calls in ``ispc``). Doing so can provide
performance advantages when calling out to short functions written in the
"other" language. Note that you don't need to use ``clang`` to compile all
of your C/C++ code, but only for the files where you want to be able to
inline. In order to do this, you must have a full installation of LLVM
version 3.0 or later, including the ``clang`` compiler.

The basic approach is to have the various compilers emit LLVM intermediate
representation (IR) code and to then use tools from LLVM to link together
the IR from the compilers and then re-optimize it, which gives the LLVM
optimizer the opportunity to do additional inlining and cross-function
optimizations. If you have source files ``foo.ispc`` and ``foo.cpp``,
first emit LLVM IR:

::

    ispc --emit-llvm -o foo_ispc.bc foo.ispc
    clang -O2 -c -emit-llvm -o foo_cpp.bc foo.cpp

Next, link the two IR files into a single file and run the LLVM optimizer
on the result:

::

    llvm-link foo_ispc.bc foo_cpp.bc -o - | opt -O3 -o foo_opt.bc

And finally, generate a native object file:

::

    llc -filetype=obj foo_opt.bc -o foo.o

This file can in turn be linked in with the rest of your object files when
linking your application.

(Note that if you're using the AVX instruction set, you must provide the
``-mattr=+avx`` flag to ``llc``.)

Why is it illegal to pass "varying" values from C/C++ to ispc functions?
------------------------------------------------------------------------

If any of the types in the parameter list to an exported function is
"varying" (including, recursively, members of structure types, etc.),
then ``ispc`` will issue an error and refuse to compile the function:

::

    % echo "export int add(int x) { return ++x; }" | ispc
    <stdin>:1:12: Error: Illegal to return a "varying" type from exported function "add"
    <stdin>:1:20: Error: Varying parameter "x" is illegal in an exported function.

While there's no fundamental reason why this isn't possible, recall the
definition of "varying" variables: they have one value for each program
instance in the gang. As such, the number of values and amount of storage
required to represent a varying variable depends on the gang size
(i.e. ``programCount``), which can have different values depending on the
compilation target.

``ispc`` therefore prohibits passing "varying" values between the
application and the ``ispc`` program in order to prevent the
application-side code from depending on a particular gang size, thus
encouraging portability to different gang sizes. (A generally desirable
programming practice.)

For cases where the size of the data is actually fixed on the application
side, the value can be passed via a pointer to a short ``uniform`` array,
as follows:

::

    export void add4(uniform int ptr[4]) {
        foreach (i = 0 ... 4)
            ptr[i]++;
    }

On the 4-wide SSE instruction set, this compiles to a single vector add
instruction (and associated move instructions), while it still also
efficiently computes the correct result on 8-wide AVX targets.

Programming Techniques
======================

What primitives are there for communicating between SPMD program instances?
---------------------------------------------------------------------------

The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library
routines provide a variety of mechanisms for the running program instances
to communicate values to each other during execution. Note that there's no
need to synchronize the program instances before communicating between
them, due to the synchronized execution model of gangs of program instances
in ``ispc``.

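A sketch of the three routines (the values shown assume a 4-wide gang; the
variable names are illustrative):

::

    int x = programIndex;     // (0, 1, 2, 3)
    int b = broadcast(x, 2);  // (2, 2, 2, 2): instance 2's value, to all
    int r = rotate(x, 1);     // (1, 2, 3, 0): values shifted across the gang
    int s = shuffle(x, programCount - 1 - programIndex);
                              // (3, 2, 1, 0): an arbitrary permutation
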
How can a gang of program instances generate variable amounts of output efficiently?
------------------------------------------------------------------------------------

It's not unusual to have a gang of program instances where each program
instance generates a variable amount of output (perhaps some generate no
output, some generate one output value, some generate many output values,
and so forth), and where one would like to have the output densely packed
in an output array. The ``exclusive_scan_add()`` function from the
standard library is quite useful in this situation.

Consider the following function:

::

    uniform int func(uniform float outArray[], ...) {
        int numOut = ...;         // figure out how many values to output
        float outLocal[MAX_OUT];  // staging area

        // each program instance in the gang puts its results in
        // outLocal[0], ..., outLocal[numOut-1]

        int startOffset = exclusive_scan_add(numOut);
        for (int i = 0; i < numOut; ++i)
            outArray[startOffset + i] = outLocal[i];
        return reduce_add(numOut);
    }

Here, each program instance has computed a number, ``numOut``, of values to
output, and has stored them in the ``outLocal`` array. Assume that four
program instances are running and that the first one wants to output one
value, the second two values, and the third and fourth three values each.
In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
to the four program instances, respectively.

The first program instance will then write its one result to
``outArray[0]``, the second will write its two values to ``outArray[1]``
and ``outArray[2]``, and so forth. The ``reduce_add()`` call at the end
returns the total number of values that all of the program instances have
written to the array.

FIXME: add discussion of foreach_active as an option here once that's in

Is it possible to use ispc for explicit vector programming?
-----------------------------------------------------------

The typical model for programming in ``ispc`` is an *implicit* parallel
model, where one writes a program that is apparently doing scalar
computation on values and the program is then vectorized to run in parallel
across the SIMD lanes of a processor. However, ``ispc`` also has some
support for explicit vector unit programming, where the vectorization is
explicit. Some computations may be more effectively described in the
explicit model rather than the implicit model.

This support is provided via ``uniform`` instances of short vector types.
Specifically, if this short program

::

    export uniform float<8> madd(uniform float<8> a, uniform float<8> b,
                                 uniform float<8> c) {
        return a + b * c;
    }

is compiled with the AVX target, ``ispc`` generates the following assembly:

::

    _madd:
        vmulps  %ymm2, %ymm1, %ymm1
        vaddps  %ymm0, %ymm1, %ymm0
        ret

(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
``addps`` instructions are generated, and so forth.)

Note that ``ispc`` doesn't currently support control flow based on
``uniform`` short vector types; it is thus not possible to write code like:

::

    export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
        uniform int<8> sum = 0;
        while (a++ < b)
            ++sum;
    }

How can I debug my ispc programs using Valgrind?
------------------------------------------------

The `valgrind`_ tool is an extremely useful memory checker for Linux and
OS X; it detects a range of memory errors, including accessing memory after
it has been freed, accessing memory beyond the end of an array, accessing
uninitialized stack variables, and so forth.
In general, applications that use ``ispc`` code run with ``valgrind``
without modification, and ``valgrind`` will detect the same range of memory
errors in ``ispc`` code that it does in C/C++ code.

.. _valgrind: http://valgrind.org

One issue to be aware of is that until recently, ``valgrind`` only
supported the SSE2 vector instructions; if you are using a version of
``valgrind`` older than the 3.7.0 release (5 November 2011), you should
compile your ``ispc`` programs with ``--target=sse2`` before running them
through ``valgrind``. (Note that if no target is specified, then ``ispc``
chooses a target based on the capabilities of the system you're running
``ispc`` on.) If you run an ``ispc`` program that uses instructions that
``valgrind`` doesn't support, you'll see an error message like:

::

    vex amd64->IR: unhandled instruction bytes: 0xC5 0xFA 0x10 0x0 0xC5 0xFA 0x11 0x84
    ==46059== valgrind: Unrecognised instruction at address 0x100002707.

The just-released valgrind 3.7.0 adds support for the SSE4.2 instruction
set; if you're using that version (and your system supports SSE4.2), then
you can use ``--target=sse4`` when compiling to run with ``valgrind``.

Note that ``valgrind`` does not yet support programs that use the AVX
instruction set.

foreach statements generate more complex assembly than I'd expect; what's going on?
-----------------------------------------------------------------------------------

Given a simple ``foreach`` loop like the following:

::

    void foo(uniform float a[], uniform int count) {
        foreach (i = 0 ... count)
            a[i] *= 2;
    }

the ``ispc`` compiler generates approximately 40 instructions--why isn't
the generated code simpler?

There are two main components to the code: one handles
``programCount``-sized chunks of elements of the array, and the other
handles any excess elements at the end of the array that don't completely
fill a gang. The code for the main loop is essentially what one would
expect: a vector of values is loaded from the array, the multiply is done,
and the result is stored.

::

    LBB0_2: ## %foreach_full_body
        movslq  %edx, %rdx
        vmovups (%rdi,%rdx), %ymm1
        vmulps  %ymm0, %ymm1, %ymm1
        vmovups %ymm1, (%rdi,%rdx)
        addl    $32, %edx
        addl    $8, %eax
        cmpl    %ecx, %eax
        jl  LBB0_2

Then, there is a sequence of instructions that handles any additional
elements at the end of the array. (These instructions don't execute if
there aren't any left-over values to process, but they do lengthen the
amount of generated code.)

::

    ## BB#4: ## %partial_inner_only
        vmovd   %eax, %xmm0
        vinsertf128 $1, %xmm0, %ymm0, %ymm0
        vpermilps   $0, %ymm0, %ymm0 ## ymm0 = ymm0[0,0,0,0,4,4,4,4]
        vextractf128    $1, %ymm0, %xmm3
        vmovd   %esi, %xmm2
        vmovaps LCPI0_1(%rip), %ymm1
        vextractf128    $1, %ymm1, %xmm4
        vpaddd  %xmm4, %xmm3, %xmm3
        # ....
        vmulps  LCPI0_0(%rip), %ymm1, %ymm1
        vmaskmovps  %ymm1, %ymm0, (%rdi,%rax)

If you know that the number of elements to be processed will always be an
exact multiple of the gang size (8, 16, and so forth), then adding a simple
assignment to ``count`` like the one below gives the compiler enough
information to be able to eliminate the code for the additional array
elements.

::

    void foo(uniform float a[], uniform int count) {
        // This assignment doesn't change the value of count
        // if it's a multiple of 16, but it gives the compiler
        // insight into this fact, allowing for simpler code to
        // be generated for the foreach loop.
        count = (count & ~(16-1));
        foreach (i = 0 ... count)
            a[i] *= 2;
    }

With this new version of ``foo()``, only the code for the first loop above
is generated.

How do I launch an individual task for each active program instance?
--------------------------------------------------------------------

Recall from the `discussion of "launch" in the ispc User's Guide`_ that a
``launch`` statement launches a single task corresponding to a single gang
of executing program instances, where the indices of the active program
instances are the same as were active when the ``launch`` statement
executed.

.. _discussion of "launch" in the ispc User's Guide: ispc.html#task-parallelism-launch-and-sync-statements

In some situations, it's desirable to be able to launch an individual task
for each executing program instance. For example, we might be performing
an iterative computation where a subset of the program instances determine
that an item they are responsible for requires additional processing.

::

    bool itemNeedsMoreProcessing(int);
    int itemNum = ...;
    if (itemNeedsMoreProcessing(itemNum)) {
        // do additional work
    }

For performance reasons, it may be desirable to apply an entire gang's
worth of computation to each item that needs additional processing;
there may be available parallelism in this computation such that we'd like
to process each of the items with SPMD computation.

In this case, the ``foreach_active`` and ``unmasked`` constructs can be
applied together to accomplish this goal.

::

    // do additional work
    task void doWork(uniform int index);

    foreach_active (index) {
        unmasked {
            launch doWork(extract(itemNum, index));
        }
    }

Recall that the body of the ``foreach_active`` loop runs once for each
active program instance, with each active program instance's
``programIndex`` value available in ``index`` in the above. In the loop,
we can re-establish an "all on" execution mask, enabling execution in all
of the program instances in the gang, such that execution in ``doWork()``
starts with all instances running. (Alternatively, the ``unmasked`` block
could be in the definition of ``doWork()``.)

5207
docs/ispc.rst
Normal file
File diff suppressed because it is too large

3262
docs/ispc.txt
File diff suppressed because it is too large

179
docs/news.rst
Normal file
@@ -0,0 +1,179 @@
=========
ispc News
=========

ispc 1.9.1 is Released
----------------------

An ``ispc`` release with a new native AVX512 target for future Xeon CPUs and
improvements for debugging. The release is based on a patched version of the
LLVM 3.8 backend.

For more details, please check the `Release Notes`_.

.. _Release Notes: https://github.com/ispc/ispc/blob/master/docs/ReleaseNotes.txt

ispc 1.9.0 is Released
----------------------

An ``ispc`` release with AVX512 (KNL flavor) support and a number of bug fixes,
based on the fresh LLVM 3.8 backend.

For more details, please check the `Release Notes`_.

.. _Release Notes: https://github.com/ispc/ispc/blob/master/docs/ReleaseNotes.txt

ispc 1.8.2 is Released
----------------------

An update of ``ispc`` with several important stability fixes and experimental
AVX512 support has been released. Binaries are based on LLVM 3.6.1; binaries
with native AVX512 support are based on LLVM 3.7 (r238198).

For more details, please check the `Release Notes`_.

.. _Release Notes: https://github.com/ispc/ispc/blob/master/docs/ReleaseNotes.txt

ispc 1.8.1 is Released
----------------------

A minor update of ``ispc`` with several important stability fixes has been
released. The problem with auto-dispatch on Linux is fixed (it affected only
pre-built binaries), and the problem with -O2 -g is also fixed. There are
several improvements in Xeon Phi support. As with 1.8.0, all binaries are
based on LLVM 3.5.

ispc 1.8.0 is Released
----------------------

A major new version of ``ispc``, which introduces experimental support for the
NVPTX target, brings numerous improvements to our KNC (Xeon Phi) support,
introduces debugging support on Windows, and fixes several bugs. We also ship
an experimental build for the Sony PlayStation 4 target in this release.
Binaries for all platforms are based on LLVM 3.5.

ispc 1.7.0 is Released
----------------------

A major new version of ``ispc`` with several language and library extensions
and fixes in debug info support. Binaries for all platforms are based on a
patched version of LLVM 3.4. There are also performance improvements beyond
the switchover to LLVM 3.4.

ispc 1.6.0 is Released
----------------------

A major update of ``ispc`` has been released. The main focus is on improved
performance and stability. Several new targets were added. There are also
a number of language and library extensions. The released binaries are based
on a patched LLVM 3.3 on Linux and MacOS and LLVM 3.4rc3 on Windows. Please
refer to the Release Notes for the complete set of changes.

ispc 1.5.0 is Released
----------------------

A major update of ``ispc`` has been released with several new targets
available and a bunch of performance and stability fixes. The released
binaries are built with a patched version of LLVM 3.3. Please refer to the
Release Notes for the complete set of changes.

ispc 1.4.4 is Released
----------------------

A minor update of ``ispc`` has been released with several stability
improvements. The released binaries are built with a patched version of LLVM
3.3. Starting with this release, we also distribute 32-bit Linux binaries.

ispc 1.4.3 is Released
----------------------

A minor update of ``ispc`` has been released with several stability
improvements. All tests and examples now properly compile and execute on
native targets on Unix platforms (Linux and MacOS).
The released binaries are built with a patched version of LLVM 3.3.

ispc 1.4.2 is Released
----------------------

A minor update of ``ispc`` has been released with a stability fix for AVX2
(Haswell), a fix for the Win32 platform, and performance improvements on Xeon
Phi. As usual, it's available on all supported platforms (Windows, Linux and
MacOS). This version supports LLVM 3.1, 3.2, 3.3 and 3.4, but we now
recommend avoiding 3.1, as it's known to contain a number of stability
problems, and we are planning to deprecate its support soon.
The released binaries are built with LLVM 3.3.

ispc 1.4.1 is Released
----------------------

A major new version of ``ispc`` has been released with stability and
performance improvements on all supported platforms (Windows, Linux and
MacOS). This version supports LLVM 3.1, 3.2, 3.3 and 3.4. The released
binaries are built with LLVM 3.2.

ispc 1.3.0 is Released
----------------------

A major new version of ``ispc`` has been released. In addition to a number
of new language features, this release notably features initial support for
compiling to the Intel Xeon Phi (Many Integrated Core) architecture.

ispc 1.2.1 is Released
----------------------

This is a bugfix release, fixing approximately 20 bugs in the system and
improving error handling and error reporting. New functionality includes
very efficient float/half conversion routines, thanks to Fabian
Giesen. See the `1.2.1 release notes`_ for details.

.. _1.2.1 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt

ispc 1.2.0 is Released
----------------------

A new major release was posted on March 20, 2012. This release includes
significant new functionality for cleanly handling "structure of arrays"
(SoA) data layout and a new model for how uniform and varying are handled
with structure types.

Paper on ispc To Appear in InPar 2012
-------------------------------------

A technical paper on ``ispc``, `ispc: A SPMD Compiler for High-Performance
CPU Programming`_, by Matt Pharr and William R. Mark, has been accepted to
the `InPar 2012`_ conference. This paper describes a number of the design
features and key characteristics of the ``ispc`` implementation.

(© 2012 IEEE. Personal use of this material is permitted. Permission from
IEEE must be obtained for all other uses, in any current or future media,
including reprinting/republishing this material for advertising or
promotional purposes, creating new collective works, for resale or
redistribution to servers or lists, or reuse of any copyrighted component
of this work in other works.)

.. _ispc\: A SPMD Compiler for High-Performance CPU Programming: https://github.com/downloads/ispc/ispc/ispc_inpar_2012.pdf
.. _InPar 2012: http://innovativeparallel.org/

ispc 1.1.4 is Released
----------------------

On February 4, 2012, the 1.1.4 release of ``ispc`` was posted; new features
include ``new`` and ``delete`` for dynamic memory allocation in ``ispc``
programs, "local" atomic operations in the standard library, and a new
scalar compilation target. See the `1.1.4 release notes`_ for details.

.. _1.1.4 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt

ispc 1.1.3 is Released
----------------------

With this release, the language now supports "switch" statements, with the
same semantics and syntax as in C.

This release includes fixes for two important performance-related issues:
the quality of code generated for "foreach" statements has been
substantially improved, and a performance regression with code for "gathers"
that was introduced in v1.1.2 has been fixed.

Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.
docs/perf.rst (new file, 85 lines)
@@ -0,0 +1,85 @@
===========
Performance
===========

The SPMD programming model provided by ``ispc`` makes it easy to harness the
computational power available in SIMD vector units on modern CPUs, while
its basis in C makes it easy for programmers to adopt and use
productively. This page summarizes the performance of ``ispc`` with the
workloads in the ``examples/`` directory of the ``ispc`` distribution.

These results were measured on an Apple iMac with a 4-core 3.4GHz
Intel® Core i7 processor using the Intel® AVX instruction set. The basis
for comparison is a reference C++ implementation compiled with gcc 4.2.1,
the version distributed with OS X 10.7.2. (The reference implementation is
also included in the ``examples/`` directory.)

.. list-table:: Performance of ``ispc`` with a variety of the workloads
   from the ``examples/`` directory of the ``ispc`` distribution, compared
   to a reference C++ implementation compiled with gcc 4.2.1.
   :header-rows: 1

   * - Workload
     - ``ispc``, 1 core
     - ``ispc``, 4 cores
   * - `AOBench`_ (512 x 512 resolution)
     - 6.19x
     - 28.06x
   * - `Binomial Options`_ (128k options)
     - 7.94x
     - 33.43x
   * - `Black-Scholes Options`_ (128k options)
     - 8.45x
     - 32.48x
   * - `Deferred Shading`_ (1280p)
     - 5.02x
     - 23.06x
   * - `Mandelbrot Set`_
     - 6.21x
     - 20.28x
   * - `Perlin Noise Function`_
     - 5.37x
     - n/a
   * - `Ray Tracer`_ (Sponza dataset)
     - 4.31x
     - 20.29x
   * - `3D Stencil`_
     - 4.05x
     - 15.53x
   * - `Volume Rendering`_
     - 3.60x
     - 17.53x

.. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench
.. _Binomial Options: https://github.com/ispc/ispc/tree/master/examples/options
.. _Black-Scholes Options: https://github.com/ispc/ispc/tree/master/examples/options
.. _Deferred Shading: https://github.com/ispc/ispc/tree/master/examples/deferred
.. _Mandelbrot Set: https://github.com/ispc/ispc/tree/master/examples/mandelbrot_tasks
.. _Ray Tracer: https://github.com/ispc/ispc/tree/master/examples/rt
.. _Perlin Noise Function: https://github.com/ispc/ispc/tree/master/examples/noise
.. _3D Stencil: https://github.com/ispc/ispc/tree/master/examples/stencil
.. _Volume Rendering: https://github.com/ispc/ispc/tree/master/examples/volume_rendering

The following table shows speedups for a number of the examples on a
2.40GHz, 40-core Intel® Xeon E7-8870 system with the Intel® SSE4
instruction set, running Microsoft Windows Server 2008 Enterprise. Here,
the serial C/C++ baseline code was compiled with MSVC 2010.

.. list-table:: Performance of ``ispc`` with a variety of the workloads
   from the ``examples/`` directory of the ``ispc`` distribution, on a
   system with 40 CPU cores.
   :header-rows: 1

   * - Workload
     - ``ispc``, 40 cores
   * - AOBench (2048 x 2048 resolution)
     - 182.36x
   * - Binomial Options (2m options)
     - 63.85x
   * - Black-Scholes Options (2m options)
     - 83.97x
   * - Ray Tracer (Sponza dataset)
     - 195.67x
   * - Volume Rendering
     - 243.18x
docs/perfguide.rst (new file, 829 lines)
@@ -0,0 +1,829 @@
==============================================
Intel® SPMD Program Compiler Performance Guide
==============================================

The SPMD programming model provided by ``ispc`` naturally delivers
excellent performance for many workloads thanks to efficient use of CPU
SIMD vector hardware. This guide provides more details about how to get
the most out of ``ispc`` in practice.

* `Key Concepts`_

  + `Efficient Iteration With "foreach"`_
  + `Improving Control Flow Coherence With "foreach_tiled"`_
  + `Using Coherent Control Flow Constructs`_
  + `Use "uniform" Whenever Appropriate`_
  + `Use "Structure of Arrays" Layout When Possible`_

* `Tips and Techniques`_

  + `Understanding Gather and Scatter`_
  + `Avoid 64-bit Addressing Calculations When Possible`_
  + `Avoid Computation With 8 and 16-bit Integer Types`_
  + `Implementing Reductions Efficiently`_
  + `Using "foreach_active" Effectively`_
  + `Using Low-level Vector Tricks`_
  + `The "Fast math" Option`_
  + `"inline" Aggressively`_
  + `Avoid The System Math Library`_
  + `Declare Variables In The Scope Where They're Used`_
  + `Instrumenting ISPC Programs To Understand Runtime Behavior`_
  + `Choosing A Target Vector Width`_

* `Disclaimer and Legal Information`_

* `Optimization Notice`_

Key Concepts
============

This section describes the five most important concepts to understand and
keep in mind when writing high-performance ``ispc`` programs. It assumes
good familiarity with the topics covered in the ``ispc`` `Users Guide`_.

.. _Users Guide: ispc.html
Efficient Iteration With "foreach"
----------------------------------

The ``foreach`` parallel iteration construct is semantically equivalent to
a regular ``for()`` loop, though it offers meaningful performance benefits.
(See the `documentation on "foreach" in the Users Guide`_ for a review of
its syntax and semantics.) As an example, consider this simple function
that iterates over some number of elements in an array, doing computation
on each one:

.. _documentation on "foreach" in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled

::

    export void foo(uniform int a[], uniform int count) {
        for (int i = programIndex; i < count; i += programCount) {
            // do some computation on a[i]
        }
    }

Depending on the specifics of the computation being performed, the code
generated for this function could likely be improved by modifying the code
so that the loop only goes as far through the data as is possible to pack
an entire gang of program instances with computation each time through the
loop. Doing so enables the ``ispc`` compiler to generate more efficient
code for cases where it knows that the execution mask is "all on". Then,
an ``if`` statement at the end handles processing the ragged extra bits of
data that didn't fully fill a gang.

::

    export void foo(uniform int a[], uniform int count) {
        // First, just loop up to the point where all program instances
        // in the gang will be active at the loop iteration start
        uniform int countBase = count & ~(programCount-1);
        for (uniform int i = 0; i < countBase; i += programCount) {
            int index = i + programIndex;
            // do some computation on a[index]
        }
        // Now handle the ragged extra bits at the end; the varying test
        // masks off the program instances past the end of the array
        int index = countBase + programIndex;
        if (index < count) {
            // do some computation on a[index]
        }
    }

While the performance of the above code will likely be better than the
first version of the function, the loop body code has been duplicated (or
has been forced to move into a separate utility function).

Using the ``foreach`` looping construct as below provides all of the
performance benefits of the second version of this function, with the
compactness of the first.

::

    export void foo(uniform int a[], uniform int count) {
        foreach (i = 0 ... count) {
            // do some computation on a[i]
        }
    }
Improving Control Flow Coherence With "foreach_tiled"
-----------------------------------------------------

Depending on the computation being performed, ``foreach_tiled`` may give
better performance than ``foreach``. (See the `documentation in the Users
Guide`_ for the syntax and semantics of ``foreach_tiled``.) Given a
multi-dimensional iteration like:

.. _documentation in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled

::

    foreach (i = 0 ... width, j = 0 ... height) {
        // do computation on element (i,j)
    }

if the ``foreach`` statement is used, elements in the gang of program
instances will be mapped to values of ``i`` and ``j`` by taking spans of
``programCount`` elements across ``i`` with a single value of ``j``. For
example, the ``foreach`` statement above roughly corresponds to:

::

    for (uniform int j = 0; j < height; ++j)
        for (int i = 0; i < width; i += programCount) {
            // do computation
        }

When a multi-dimensional domain is being iterated over, the
``foreach_tiled`` statement maps program instances to data in a way that
tries to select square n-dimensional segments of the domain. For example,
on a compilation target with 8-wide gangs of program instances, it
generates code that iterates over the domain the same way as the following
code (though more efficiently):

::

    for (int j = programIndex/4; j < height; j += 2)
        for (int i = programIndex%4; i < width; i += 4) {
            // do computation
        }

Thus, each gang of program instances operates on a 2x4 tile of the domain.
With higher-dimensional iteration and different gang sizes, a similar
mapping is performed--e.g. for 2D iteration with a 16-wide gang size, 4x4
tiles are iterated over; for 4D iteration with an 8-wide gang, 1x2x2x2
tiles are processed, and so forth.

The performance benefit of ``foreach_tiled`` comes from iterating over
*compact* regions of the domain (while ``foreach`` iterates over the
domain in a way that generally allows linear memory access). Processing
compact regions of the domain has two benefits.

First, it's often the case that the control flow coherence of the program
instances in the gang is improved; if data-dependent control flow decisions
are related to the values of the data in the domain being processed, and if
the data values have some coherence, iterating over compact regions will
improve control flow coherence.

Second, processing compact regions may mean that the data accessed by
program instances in the gang is more coherent, leading to performance
benefits from better cache hit rates.

As a concrete example, for the ray tracer example in the ``ispc``
distribution (in the ``examples/rt`` directory), performance is 20% better
when the pixels are iterated over using ``foreach_tiled`` than ``foreach``,
because more coherent regions of the scene are accessed by the set of rays
in the gang of program instances.
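Concretely, switching the earlier loop to ``foreach_tiled`` is a one-line
change; the sketch below visits the same (i,j) elements, just in tile
order:

::

    foreach_tiled (i = 0 ... width, j = 0 ... height) {
        // do computation on element (i,j); the gang now maps to a
        // compact 2D tile of the domain rather than a 1D span
    }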
Using Coherent Control Flow Constructs
--------------------------------------

Recall from the `SPMD-on-SIMD Execution Model section`_ of the ``ispc``
Users Guide that ``if`` statements with a ``uniform`` test compile to more
efficient code than ``if`` statements with varying tests. The coherent
``cif`` statement can provide many of the benefits of ``if`` with a uniform
test in the case where the test is actually varying.

.. _SPMD-on-SIMD Execution Model section: ispc.html#the-spmd-on-simd-execution-model

In this case, the code the compiler generates for the ``cif``
test is along the lines of the following pseudo-code:

::

    bool expr = /* evaluate cif condition */
    if (all(expr)) {
        // run "true" case of if test only
    } else if (!any(expr)) {
        // run "false" case of if test only
    } else {
        // run both true and false cases, updating mask appropriately
    }

For ``if`` statements where the different running SPMD program instances
don't have coherent values for the boolean ``if`` test, using ``cif``
introduces some additional overhead from the ``all`` and ``any`` tests, as
well as the corresponding branches. For cases where the program
instances often do compute the same boolean value, this overhead is
worthwhile. If the control flow is in fact usually incoherent, this
overhead only costs performance.

In a similar fashion, ``ispc`` provides ``cfor``, ``cwhile``, and ``cdo``
statements. These statements are semantically the same as the
corresponding non-"c"-prefixed statements.
|
||||
----------------------------------
|
||||
|
||||
For any variable that will always have the same value across all of the
|
||||
program instances in a gang, declare the variable with the ``uniform``
|
||||
qualifier. Doing so enables the ``ispc`` compiler to emit better code in
|
||||
many different ways.
|
||||
|
||||
As a simple example, consider a ``for`` loop that always does the same
|
||||
number of iterations:
|
||||
|
||||
::
|
||||
|
||||
for (int i = 0; i < 10; ++i)
|
||||
// do something ten times
|
||||
|
||||
If this is written with ``i`` as a ``varying`` variable, as above, there's
|
||||
additional overhead in the code generated for the loop as the compiler
|
||||
emits instructions to handle the possibility of not all program instances
|
||||
following the same control flow path (as might be the case if the loop
|
||||
limit, 10, was itself a ``varying`` value.)
|
||||
|
||||
If the above loop is instead written with ``i`` ``uniform``, as:
|
||||
|
||||
::
|
||||
|
||||
for (uniform int i = 0; i < 10; ++i)
|
||||
// do something ten times
|
||||
|
||||
Then better code can be generated (and the loop possibly unrolled).
|
||||
|
||||
In some cases, the compiler may be able to detect simple cases like these,
|
||||
but it's always best to provide the compiler with as much help as possible
|
||||
to understand the actual form of your computation.
|
||||
|
||||
|
||||
Use "Structure of Arrays" Layout When Possible
|
||||
----------------------------------------------
|
||||
|
||||
In general, memory access performance (for both reads and writes) is best
|
||||
when the running program instances access a contiguous region of memory; in
|
||||
this case efficient vector load and store instructions can often be used
|
||||
rather than gathers and scatters. As an example of this issue, consider an
|
||||
array of a simple point datatype laid out and accessed in conventional
|
||||
"array of structures" (AOS) layout:
|
||||
|
||||
::
|
||||
|
||||
struct Point { float x, y, z; };
|
||||
uniform Point pts[...];
|
||||
float v = pts[programIndex].x;
|
||||
|
||||
In the above code, the access to ``pts[programIndex].x`` accesses
|
||||
non-sequential memory locations, due to the ``y`` and ``z`` values between
|
||||
the desired ``x`` values in memory. A "gather" is required to get the
|
||||
value of ``v``, with a corresponding decrease in performance.
|
||||
|
||||
If ``Point`` was defined as a "structure of arrays" (SOA) type, the access
|
||||
can be much more efficient:
|
||||
|
||||
::
|
||||
|
||||
struct Point8 { float x[8], y[8], z[8]; };
|
||||
uniform Point8 pts8[...];
|
||||
int majorIndex = programIndex / 8;
|
||||
int minorIndex = programIndex % 8;
|
||||
float v = pts8[majorIndex].x[minorIndex];
|
||||
|
||||
In this case, each ``Point8`` has 8 ``x`` values contiguous in memory
|
||||
before 8 ``y`` values and then 8 ``z`` values. If the gang size is 8 or
|
||||
less, the access for ``v`` will have the same value of ``majorIndex`` for
|
||||
all program instances and will access consecutive elements of the ``x[8]``
|
||||
array with a vector load. (For larger gang sizes, two 8-wide vector loads
|
||||
would be issues, which is also quite efficient.)
|
||||
|
||||
However, the syntax in the above code is messy; accessing SOA data in this
|
||||
fashion is much less elegant than the corresponding code for accessing the
|
||||
data with AOS layout. The ``soa`` qualifier in ``ispc`` can be used to
|
||||
cause the corresponding transformation to be made to the ``Point`` type,
|
||||
while preserving the clean syntax for data access that comes with AOS
|
||||
layout:
|
||||
|
||||
::
|
||||
|
||||
soa<8> Point pts[...];
|
||||
float v = pts[programIndex].x;
|
||||
|
||||
Thanks to having SOA layout a first-class concept in the language's type
|
||||
system, it's easy to write functions that convert data between the
|
||||
layouts. For example, the ``aos_to_soa`` function below converts ``count``
|
||||
elements of the given ``Point`` type from AOS to 8-wide SOA layout. (It
|
||||
assumes that the caller has pre-allocated sufficient space in the
|
||||
``pts_soa`` output array.
|
||||
|
||||
::
|
||||
|
||||
void aos_to_soa(uniform Point pts_aos[], uniform int count,
|
||||
soa<8> pts_soa[]) {
|
||||
foreach (i = 0 ... count)
|
||||
pts_soa[i] = pts_aos[i];
|
||||
}
|
||||
|
||||
Analogously, a function could be written to convert back from SOA to AOS if
|
||||
needed.
|
||||
|
||||
|
||||
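A sketch of that reverse conversion, under the same assumptions (the
``Point`` type from above and a caller that has pre-allocated space in
``pts_aos``):

::

    void soa_to_aos(soa<8> Point pts_soa[], uniform int count,
                    uniform Point pts_aos[]) {
        foreach (i = 0 ... count)
            pts_aos[i] = pts_soa[i];
    }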
Tips and Techniques
===================

This section introduces a number of additional techniques that are worth
keeping in mind when writing ``ispc`` programs.
Understanding Gather and Scatter
--------------------------------

Memory reads and writes from the program instances in a gang that access
irregular memory locations (rather than a consecutive set of locations, or
a single location) can be relatively inefficient. As an example, consider
the "simple" array indexing calculation below:

::

    int i = ...;
    uniform float x[10] = { ... };
    float f = x[i];

Since the index ``i`` is a varying value, the program instances in the gang
will in general be reading different locations in the array ``x``. Because
current CPUs don't have a "gather" instruction, the ``ispc`` compiler has to
serialize these memory reads, performing a separate memory load for each
running program instance and packing the results into ``f``. (The analogous
case happens for a write into ``x[i]``.)

In many cases, gathers like these are unavoidable; the program instances
just need to access incoherent memory locations. However, if the array
index ``i`` actually has the same value for all of the program instances,
or if it represents an access to a consecutive set of array locations, much
more efficient load and store instructions can be generated instead of
gathers and scatters, respectively.

In many cases, the ``ispc`` compiler is able to deduce that the memory
locations accessed by a varying index are either all the same or a linear
sequence of locations. For example, given:

::

    uniform int x = ...;
    int y = x;
    return array[y];

the compiler is able to determine that all of the program instances are
loading from the same location, even though ``y`` is not a ``uniform``
variable. In this case, the compiler will transform this load to a regular
vector load, rather than a general gather.

Sometimes the running program instances will access a linear sequence of
memory locations; this happens most frequently when array indexing is done
based on the built-in ``programIndex`` variable. In many of these cases,
the compiler is also able to detect this case and then do a vector load.
For example, given:

::

    for (int i = programIndex; i < count; i += programCount)
        // process array[i];

regular vector loads and stores are issued for accesses to ``array[i]``.

Both of the above are cases where the compiler can determine the pattern
of locations accessed statically, at compile time. Often this
determination can't be made at compile time, even though the indices
frequently do turn out to be equal at run time. The ``reduce_equal()``
function from the standard library can be used in this case; it checks
whether the given value is the same across all of the running program
instances, returning true and the ``uniform`` value if so.

The following function shows the use of ``reduce_equal()`` to check for an
equal index at execution time and then either do a scalar load and
broadcast, or a general gather.

::

    uniform float array[...] = { ... };
    float value;
    int i = ...;
    uniform int ui;
    if (reduce_equal(i, &ui) == true)
        value = array[ui];   // scalar load + broadcast
    else
        value = array[i];    // gather

For a simple case like the one above, the overhead of doing the
``reduce_equal()`` check is likely not worthwhile compared to just always
doing a gather. In more complex cases, where a number of accesses are done
based on the index, it can be worth doing. See the
``examples/volume_rendering`` example in the ``ispc`` distribution for the
use of this technique in an instance where it is beneficial to performance.
Understanding Memory Read Coalescing
------------------------------------

XXXX todo
Avoid 64-bit Addressing Calculations When Possible
--------------------------------------------------

Even when compiling to a 64-bit architecture target, ``ispc`` does many of
the addressing calculations in 32-bit precision by default--this behavior
can be overridden with the ``--addressing=64`` command-line argument. This
option should only be used if it's necessary to be able to address over 4GB
of memory in the ``ispc`` code, as it essentially doubles the cost of
memory addressing calculations in the generated code.
Avoid Computation With 8 and 16-bit Integer Types
-------------------------------------------------

The code generated for 8- and 16-bit integer types is generally not as
efficient as the code generated for 32-bit integer types. It is generally
worthwhile to use 32-bit integer types for intermediate computations, even
if the final result will be stored in a smaller integer type.
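As a sketch of this pattern (with hypothetical function and array names),
the arithmetic below is done in 32-bit ``int``, with a single narrowing
conversion at the final store:

::

    export void average(uniform unsigned int8 a[], uniform unsigned int8 b[],
                        uniform unsigned int8 avg[], uniform int count) {
        foreach (i = 0 ... count) {
            // Widen to 32-bit for the intermediate computation...
            int sum = (int)a[i] + (int)b[i];
            // ...and narrow back to 8 bits only at the end.
            avg[i] = (unsigned int8)(sum / 2);
        }
    }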
Implementing Reductions Efficiently
-----------------------------------

It's often necessary to compute a reduction over a data set--for example,
one might want to add all of the values in an array, compute their minimum,
etc. ``ispc`` provides a few capabilities that make it easy to efficiently
compute reductions like these. However, it's important to use these
capabilities appropriately for best results.

As an example, consider the task of computing the sum of all of the values
in an array. In C code, we might have:

::

    /* C implementation of a sum reduction */
    float sum(const float array[], int count) {
        float sum = 0;
        for (int i = 0; i < count; ++i)
            sum += array[i];
        return sum;
    }

Exactly this computation could also be expressed as a purely uniform
computation in ``ispc``, though without any benefit from vectorization:

::

    /* inefficient ispc implementation of a sum reduction */
    uniform float sum(const uniform float array[], uniform int count) {
        uniform float sum = 0;
        for (uniform int i = 0; i < count; ++i)
            sum += array[i];
        return sum;
    }

As a first attempt at a parallel version, one might use the ``reduce_add()``
function from the ``ispc`` standard library; it takes a ``varying`` value
and returns the sum of that value across all of the active program
instances.

::

    /* inefficient ispc implementation of a sum reduction */
    uniform float sum(const uniform float array[], uniform int count) {
        uniform float sum = 0;
        foreach (i = 0 ... count)
            sum += reduce_add(array[i]);
        return sum;
    }

This implementation loads a gang's worth of values from the array, one for
each of the program instances, and then uses ``reduce_add()`` to reduce
across the program instances and update the sum. Unfortunately this
approach loses most of the benefit from vectorization, as it does more work
on the cross-program-instance ``reduce_add()`` call than it saves from the
vector load of values.

The most efficient approach is to do the reduction in two phases: rather
than using a ``uniform`` variable to store the sum, we maintain a varying
value, such that each program instance is effectively computing a local
partial sum on the subset of array values that it has loaded from the
array. When the loop over array elements concludes, a single call to
``reduce_add()`` computes the final reduction across each of the program
instances' elements of ``sum``. This approach effectively compiles to a
single vector load and a single vector add for each loop iteration's worth
of values--very efficient code in the end.

::

    /* good ispc implementation of a sum reduction */
    uniform float sum(const uniform float array[], uniform int count) {
        float sum = 0;
        foreach (i = 0 ... count)
            sum += array[i];
        return reduce_add(sum);
    }
Using "foreach_active" Effectively
|
||||
----------------------------------
|
||||
|
||||
For high-performance code,
|
||||
|
||||
For example, consider this segment of code, from the introduction of
|
||||
``foreach_active`` in the ispc User's Guide:
|
||||
|
||||
::
|
||||
|
||||
uniform float array[...] = { ... };
|
||||
int index = ...;
|
||||
foreach_active (i) {
|
||||
++array[index];
|
||||
}
|
||||
|
||||
Here, ``index`` was assumed to possibly have the same value for multiple
|
||||
program instances, so the updates to ``array[index]`` are serialized by the
|
||||
``foreach_active`` statement in order to not have undefined results when
|
||||
``index`` values do collide.
|
||||
|
||||
The code generated by the compiler can be improved in this case by making
|
||||
it clear that only a single element of the array is accessed by
|
||||
``array[index]`` and that thus a general gather or scatter isn't required.
|
||||
Specifically, by using the ``extract()`` function from the standard library
|
||||
to extract the current program instance's value of ``index`` into a
|
||||
``uniform`` variable and then using that to index into ``array``, as below,
|
||||
more efficient code is generated.
|
||||
|
||||
::
|
||||
|
||||
foreach_active (instanceNum) {
|
||||
uniform int unifIndex = extract(index, instanceNum);
|
||||
++array[unifIndex];
|
||||
}
|
||||
|
||||
|
||||
Using Low-level Vector Tricks
-----------------------------

Many low-level Intel® SSE and AVX coding constructs can be implemented in
``ispc`` code. The ``ispc`` standard library functions ``intbits()`` and
``floatbits()`` are often useful in this context. Recall that
``intbits()`` takes a ``float`` value and returns it as an integer, where
the bits of the integer are the same as the bit representation in memory of
the ``float``. (In other words, it does *not* perform a floating-point to
integer value conversion.) ``floatbits()``, then, performs the inverse
computation.

As an example of the use of these functions, the following code efficiently
reverses the sign of the given values.

::

    float flipsign(float a) {
        unsigned int i = intbits(a);
        i ^= 0x80000000;
        return floatbits(i);
    }

This code compiles down to a single XOR instruction.
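In the same spirit, masking off the sign bit gives an absolute value; the
following is a sketch for illustration only (the standard library's
``abs()`` would normally be preferred):

::

    float absval(float a) {
        unsigned int i = intbits(a);
        i &= 0x7fffffff;      // clear the sign bit
        return floatbits(i);
    }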
The "Fast math" Option
|
||||
----------------------
|
||||
|
||||
``ispc`` has a ``--opt=fast-math`` command-line flag that enables a number of
|
||||
optimizations that may be undesirable in code where numerical precision is
|
||||
critically important. For many graphics applications, for example, the
|
||||
approximations introduced may be acceptable, however. The following two
|
||||
optimizations are performed when ``--opt=fast-math`` is used. By default, the
|
||||
``--opt=fast-math`` flag is off.
|
||||
|
||||
* Expressions like ``x / y``, where ``y`` is a compile-time constant, are
|
||||
transformed to ``x * (1./y)``, where the inverse value of ``y`` is
|
||||
precomputed at compile time.
|
||||
|
||||
* Expressions like ``x / y``, where ``y`` is not a compile-time constant,
|
||||
are transformed to ``x * rcp(y)``, where ``rcp()`` maps to the
|
||||
approximate reciprocal instruction from the ``ispc`` standard library.
|
||||
|
||||
|
||||
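If the reduced precision is acceptable only in specific hot spots, a
lighter-weight alternative to enabling the flag globally is to call the
standard library's ``rcp()`` directly there; a minimal sketch with a
hypothetical helper:

::

    float scaleByInverse(float x, float d) {
        // Approximate reciprocal in this one place only; the rest of
        // the program keeps full-precision division.
        return x * rcp(d);
    }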
"inline" Aggressively
|
||||
---------------------
|
||||
|
||||
Inlining functions aggressively is generally beneficial for performance
|
||||
with ``ispc``. Definitely use the ``inline`` qualifier for any short
|
||||
functions (a few lines long), and experiment with it for longer functions.
|
||||
|
||||
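For instance, a short helper like the following (a hypothetical example)
is a good candidate for ``inline``:

::

    // A few-line helper; inlining it eliminates the function call
    // overhead at each use.
    inline float lerp(float t, float a, float b) {
        return (1. - t) * a + t * b;
    }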
Avoid The System Math Library
-----------------------------

The default math library that ``ispc`` uses for transcendentals and the
like has higher error than the system's math library, though it is much
more efficient, both because it is vectorized across the program instances
and because its functions can be inlined in the final code. (It generally
has errors in the range of 10ulps, while the system math library generally
has no more than 1ulp of error for transcendentals.)

If the ``--math-lib=system`` command-line option is used when compiling an
``ispc`` program, then calls to the system math library will be generated
instead. This option should only be used if the higher precision is
absolutely required, as the performance impact of using it can be
significant.
Declare Variables In The Scope Where They're Used
-------------------------------------------------

Performance is slightly improved by declaring variables at the same block
scope where they are first used. For example, if the lifetime of ``foo``
is only within the scope of the ``if`` clause, write the code like this:

::

    float func() {
        ....
        if (x < y) {
            float foo;
            ... use foo ...
        }
    }

Try not to write the code as:

::

    float func() {
        float foo;
        ....
        if (x < y) {
            ... use foo ...
        }
    }

Following the first form can reduce the number of masked store instructions
that the compiler needs to generate.
Instrumenting ISPC Programs To Understand Runtime Behavior
----------------------------------------------------------

``ispc`` has an optional instrumentation feature that can help you
understand performance issues. If a program is compiled using the
``--instrument`` flag, the compiler emits calls to a function with the
following signature at various points in the program (for example, at
interesting points in the control flow, or when scatters or gathers
happen):

::

    extern "C" {
        void ISPCInstrument(const char *fn, const char *note,
                            int line, uint64_t mask);
    }

This function is passed the file name of the ``ispc`` file running, a short
note indicating what is happening, the line number in the source file, and
the current mask of active program instances in the gang. You must provide
an implementation of this function and link it in with your application.
For example, when the ``ispc`` program runs, this function might be called
as follows:

::

    ISPCInstrument("foo.ispc", "function entry", 55, 0xfull);

This call indicates that the currently executing program has just
entered the function defined at line 55 of the file ``foo.ispc``, with a
mask of all lanes currently executing (assuming a four-wide gang size
target machine).
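Since this callback lives on the application side, it is written in C++
rather than ``ispc``. The following is only a minimal sketch of an
aggregating implementation, with hypothetical bookkeeping; it is not the
implementation shipped with the distribution:

::

    // instrument_sketch.cpp -- hypothetical aggregating implementation
    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <string>

    struct SiteStats {
        int64_t calls = 0;       // times this call site was hit
        int64_t activeLanes = 0; // total active lanes across all calls
    };

    static std::map<std::string, SiteStats> gStats;

    extern "C" void ISPCInstrument(const char *fn, const char *note,
                                   int line, uint64_t mask) {
        char key[512];
        snprintf(key, sizeof(key), "%s(%04d) - %s", fn, line, note);
        SiteStats &s = gStats[key];
        ++s.calls;
        for (; mask != 0; mask &= mask - 1)  // count set bits in the mask
            ++s.activeLanes;
    }

A report along the lines of the one shown below could then be printed at
program exit by walking ``gStats``.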
For a fuller example of the utility of this functionality, see
``examples/aobench_instrumented`` in the ``ispc`` distribution. This
example includes an implementation of the ``ISPCInstrument()`` function
that collects aggregate data about the program's execution behavior.

When running this example, you will want to direct the ``ao`` executable
to generate a low-resolution image, because the instrumentation adds
substantial execution overhead. For example:

::

    % ./ao 1 32 32

After the ``ao`` program exits, a summary report along the following lines
will be printed. In the first few lines, you can see how many times a few
functions were called, and the average percentage of SIMD lanes that were
active upon function entry.

::

    ao.ispc(0067) - function entry: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
    ao.ispc(0067) - return: uniform control flow: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
    ao.ispc(0071) - function entry: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
    ao.ispc(0075) - return: uniform control flow: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
    ao.ispc(0079) - function entry: 10072 calls (0 / 0.00% all off!), 45.09% active lanes
    ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
    ...
Choosing A Target Vector Width
------------------------------

By default, ``ispc`` compiles to the natural vector width of the target
instruction set. For example, for SSE2 and SSE4 it compiles four-wide,
and for AVX it compiles 8-wide. For some programs, higher performance may
be seen if the program is compiled to a doubled vector width--8-wide for
SSE and 16-wide for AVX.

For workloads that don't require many registers, this method can lead to
significantly more efficient execution thanks to greater instruction-level
parallelism and amortization of various overhead over more program
instances. For other workloads, it may lead to a slowdown due to higher
register pressure; trying both approaches for key kernels may be
worthwhile.

This option is only available for the SSE2, SSE4 and AVX targets.
It is selected with the ``--target=sse2-x2``, ``--target=sse4-x2`` and
``--target=avx-x2`` options, respectively.
Disclaimer and Legal Information
================================

INFORMATION IN THIS DOCUMENT IS PROVIDED IN CONNECTION WITH INTEL(R) PRODUCTS.
NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL
PROPERTY RIGHTS IS GRANTED BY THIS DOCUMENT. EXCEPT AS PROVIDED IN INTEL'S TERMS
AND CONDITIONS OF SALE FOR SUCH PRODUCTS, INTEL ASSUMES NO LIABILITY WHATSOEVER,
AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY, RELATING TO SALE AND/OR USE
OF INTEL PRODUCTS INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A
PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT
OR OTHER INTELLECTUAL PROPERTY RIGHT.

UNLESS OTHERWISE AGREED IN WRITING BY INTEL, THE INTEL PRODUCTS ARE NOT DESIGNED
NOR INTENDED FOR ANY APPLICATION IN WHICH THE FAILURE OF THE INTEL PRODUCT COULD
CREATE A SITUATION WHERE PERSONAL INJURY OR DEATH MAY OCCUR.

Intel may make changes to specifications and product descriptions at any time,
without notice. Designers must not rely on the absence or characteristics of any
features or instructions marked "reserved" or "undefined." Intel reserves these
for future definition and shall have no responsibility whatsoever for conflicts
or incompatibilities arising from future changes to them. The information here
is subject to change without notice. Do not finalize a design with this
information.

The products described in this document may contain design defects or errors
known as errata which may cause the product to deviate from published
specifications. Current characterized errata are available on request.

Contact your local Intel sales office or your distributor to obtain the latest
specifications and before placing your product order.

Copies of documents which have an order number and are referenced in this
document, or other Intel literature, may be obtained by calling 1-800-548-4725,
or by visiting Intel's Web Site.

Intel processor numbers are not a measure of performance. Processor numbers
differentiate features within each processor family, not across different
processor families. See http://www.intel.com/products/processor_number for
details.

BunnyPeople, Celeron, Celeron Inside, Centrino, Centrino Atom,
Centrino Atom Inside, Centrino Inside, Centrino logo, Core Inside, FlashFile,
i960, InstantIP, Intel, Intel logo, Intel386, Intel486, IntelDX2, IntelDX4,
IntelSX2, Intel Atom, Intel Atom Inside, Intel Core, Intel Inside,
Intel Inside logo, Intel. Leap ahead., Intel. Leap ahead. logo, Intel NetBurst,
Intel NetMerge, Intel NetStructure, Intel SingleDriver, Intel SpeedStep,
Intel StrataFlash, Intel Viiv, Intel vPro, Intel XScale, Itanium,
Itanium Inside, MCS, MMX, Oplus, OverDrive, PDCharm, Pentium, Pentium Inside,
skoool, Sound Mark, The Journey Inside, Viiv Inside, vPro Inside, VTune, Xeon,
and Xeon Inside are trademarks of Intel Corporation in the U.S. and other
countries.

* Other names and brands may be claimed as the property of others.

Copyright(C) 2011-2016, Intel Corporation. All rights reserved.


Optimization Notice
===================

Intel compilers, associated libraries and associated development tools may
include or utilize options that optimize for instruction sets that are
available in both Intel and non-Intel microprocessors (for example SIMD
instruction sets), but do not optimize equally for non-Intel
microprocessors. In addition, certain compiler options for Intel
compilers, including some that are not specific to Intel
micro-architecture, are reserved for Intel microprocessors. For a detailed
description of Intel compiler options, including the instruction sets and
specific microprocessors they implicate, please refer to the "Intel
Compiler User and Reference Guides" under "Compiler Options." Many library
routines that are part of Intel compiler products are more highly optimized
for Intel microprocessors than for other microprocessors. While the
compilers and libraries in Intel compiler products offer optimizations for
both Intel and Intel-compatible microprocessors, depending on the options
you select, your code and other factors, you likely will get extra
performance on Intel microprocessors.

Intel compilers, associated libraries and associated development tools may
or may not optimize to the same degree for non-Intel microprocessors for
optimizations that are not unique to Intel microprocessors. These
optimizations include Intel® Streaming SIMD Extensions 2 (Intel® SSE2),
Intel® Streaming SIMD Extensions 3 (Intel® SSE3), and Supplemental
Streaming SIMD Extensions 3 (Intel SSSE3) instruction sets and other
optimizations. Intel does not guarantee the availability, functionality,
or effectiveness of any optimization on microprocessors not manufactured by
Intel. Microprocessor-dependent optimizations in this product are intended
for use with Intel microprocessors.

While Intel believes our compilers and libraries are excellent choices to
assist in obtaining the best performance on Intel and non-Intel
microprocessors, Intel recommends that you evaluate other compilers and
libraries to determine which best meet your requirements. We hope to win
your business by striving to offer the best performance of any compiler or
library; please let us know if you find we do not.
docs/template-news.txt (new file, 66 lines)
@@ -0,0 +1,66 @@
%(head_prefix)s
%(head)s
<script type="text/javascript">

var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-1486404-4']);
_gaq.push(['_trackPageview']);

(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();

</script>
%(stylesheet)s
%(body_prefix)s
<div id="wrap">
<div id="wrap2">
<div id="header">
<h1 id="logo">Intel SPMD Program Compiler</h1>
<div id="slogan">An open-source compiler for high-performance SIMD programming on
the CPU</div>
</div>
<div id="nav">
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li id="selected"><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li><a href="documentation.html">Documentation</a></li>
<li><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
<div id="content-wrap">
<div id="sidebar">
<div class="widgetspace">
<h1>Resources</h1>
<ul class="menu">
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
<li><a href="http://groups.google.com/group/ispc-users/">ispc
users mailing list</a></li>
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>
%(body_pre_docinfo)s
%(docinfo)s
<div id="content">
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> © 2011-2016 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>
</div>
</div>
%(body_suffix)s
docs/template-perf.txt (new file, 66 lines)
@@ -0,0 +1,66 @@
%(head_prefix)s
%(head)s
<script type="text/javascript">

var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-1486404-4']);
_gaq.push(['_trackPageview']);

(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();

</script>
%(stylesheet)s
%(body_prefix)s
<div id="wrap">
<div id="wrap2">
<div id="header">
<h1 id="logo">Intel SPMD Program Compiler</h1>
<div id="slogan">An open-source compiler for high-performance SIMD programming on
the CPU</div>
</div>
<div id="nav">
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li><a href="documentation.html">Documentation</a></li>
<li id="selected"><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
<div id="content-wrap">
<div id="sidebar">
<div class="widgetspace">
<h1>Resources</h1>
<ul class="menu">
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
<li><a href="http://groups.google.com/group/ispc-users/">ispc
users mailing list</a></li>
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>
%(body_pre_docinfo)s
%(docinfo)s
<div id="content">
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> © 2011-2016 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>
</div>
</div>
%(body_suffix)s
docs/template.txt (new file, 66 lines)
@@ -0,0 +1,66 @@
%(head_prefix)s
%(head)s
<script type="text/javascript">

var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-1486404-4']);
_gaq.push(['_trackPageview']);

(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();

</script>
%(stylesheet)s
%(body_prefix)s
<div id="wrap">
<div id="wrap2">
<div id="header">
<h1 id="logo">Intel SPMD Program Compiler</h1>
<div id="slogan">An open-source compiler for high-performance SIMD programming on
the CPU</div>
</div>
<div id="nav">
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li id="selected"><a href="documentation.html">Documentation</a></li>
<li><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
<div id="content-wrap">
<div id="sidebar">
<div class="widgetspace">
<h1>Resources</h1>
<ul class="menu">
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
<li><a href="http://groups.google.com/group/ispc-users/">ispc
users mailing list</a></li>
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>
%(body_pre_docinfo)s
%(docinfo)s
<div id="content">
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> © 2011-2016 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>
</div>
</div>
%(body_suffix)s
doxygen.cfg (11 lines changed)
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
 
-PROJECT_NUMBER = 1.0.11
+PROJECT_NUMBER = 1.9.2dev
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
@@ -581,10 +581,12 @@ WARN_LOGFILE =
 # directories like "/usr/src/myproject". Separate the files or directories
 # with spaces.
 
-INPUT = builtins.h \
+INPUT = ast.h \
+        builtins.h \
         ctx.h \
         decl.h \
         expr.h \
+        func.h \
         ispc.h \
         llvmutil.h \
         module.h \
@@ -593,10 +595,13 @@ INPUT = builtins.h \
         sym.h \
         type.h \
         util.h \
+        ast.cpp \
         builtins.cpp \
+        cbackend.cpp \
         ctx.cpp \
         decl.cpp \
         expr.cpp \
+        func.cpp \
         ispc.cpp \
         llvmutil.cpp \
         main.cpp \
@@ -608,7 +613,7 @@ INPUT = builtins.h \
         util.cpp \
         parse.yy \
         lex.ll \
-        builtins-c.c
+        builtins/builtins.c
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
@@ -39,9 +39,6 @@ example implementation of this function that counts the number of times the
callback is made and records some statistics about control flow coherence
is provided in the instrument.cpp file.

*** Note: on Linux, this example currently hits an assertion in LLVM during
*** compilation

Deferred
========
@@ -76,6 +73,14 @@ This directory includes three implementations of the algorithm:
light culling and shading.

GMRES
=====

An implementation of the generalized minimal residual method for solving
sparse matrix equations.
(http://en.wikipedia.org/wiki/Generalized_minimal_residual_method)

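For reference, GMRES chooses each iterate from a growing Krylov subspace so as to minimize the residual norm; in standard textbook notation (this is the usual definition of the method, not taken from the example's source):

    x_m = x_0 + \arg\min_{z \in \mathcal{K}_m} \| b - A(x_0 + z) \|_2,
    \qquad \mathcal{K}_m = \operatorname{span}\{ r_0,\, A r_0,\, \ldots,\, A^{m-1} r_0 \},
    \qquad r_0 = b - A x_0.
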
Mandelbrot
==========

@@ -110,6 +115,13 @@ This program implements both the Black-Scholes and Binomial options pricing
models in both ispc and regular serial C++ code.

Perfbench
=========

This runs a number of microbenchmarks to measure system performance and
code generation quality.

RT
==

@@ -134,6 +146,11 @@ This is a simple "hello world" type program that shows a ~10 line
application program calling out to a ~5 line ispc program to do a simple
computation.

Sort
====
This is a bucket sort of 32 bit unsigned integers.
By default 1000000 random elements get sorted.
Call ./sort N in order to sort N elements instead.
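A minimal standalone C++ sketch of the same idea (illustrative only; the example itself is an ispc implementation): scatter keys into buckets by their top bits, sort each bucket, and concatenate.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Bucket sort for 32-bit unsigned keys: the top 8 bits pick the bucket,
    // so concatenating the sorted buckets yields a fully sorted array.
    static void bucketSort(std::vector<uint32_t> &keys) {
        std::vector<std::vector<uint32_t>> buckets(256);
        for (uint32_t k : keys)
            buckets[k >> 24].push_back(k);   // O(n) scatter by high byte
        size_t out = 0;
        for (auto &b : buckets) {
            std::sort(b.begin(), b.end());   // small per-bucket sorts
            for (uint32_t k : b)
                keys[out++] = k;
        }
    }
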
Volume
======

@@ -1,39 +1,8 @@

ARCH = $(shell uname)
EXAMPLE=ao
CPP_SRC=ao.cpp ao_serial.cpp
ISPC_SRC=ao.ispc
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16
ISPC_ARM_TARGETS=neon

TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4,avx --arch=x86-64

ISPC_OBJS=objs/ao_ispc.o objs/ao_ispc_sse2.o objs/ao_ispc_sse4.o \
objs/ao_ispc_avx.o
OBJS=objs/ao.o objs/ao_serial.o $(ISPC_OBJS) $(TASK_OBJ)

default: ao

.PHONY: dirs clean

dirs:
/bin/mkdir -p objs/

clean:
/bin/rm -rf objs *~ ao

ao: dirs $(OBJS) $(TASK_OBJ)
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)

objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@

objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@

objs/ao.o: objs/ao_ispc.h

objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
include ../common.mk

@@ -60,7 +60,7 @@ using namespace ispc;

extern void ao_serial(int w, int h, int nsubsamples, float image[]);

static unsigned int test_iterations;
static unsigned int test_iterations[] = {3, 7, 1};
static unsigned int width, height;
static unsigned char *img;
static float *fimg;
@@ -106,16 +106,20 @@ savePPM(const char *fname, int w, int h)

int main(int argc, char **argv)
{
if (argc != 4) {
if (argc < 3) {
printf ("%s\n", argv[0]);
printf ("Usage: ao [num test iterations] [width] [height]\n");
printf ("Usage: ao [width] [height] [ispc iterations] [tasks iterations] [serial iterations]\n");
getchar();
exit(-1);
}
else {
test_iterations = atoi(argv[1]);
width = atoi (argv[2]);
height = atoi (argv[3]);
if (argc == 6) {
for (int i = 0; i < 3; i++) {
test_iterations[i] = atoi(argv[3 + i]);
}
}
width = atoi (argv[1]);
height = atoi (argv[2]);
}

// Allocate space for output images
@@ -127,18 +131,19 @@ int main(int argc, char **argv)
// time for any of them.
//
double minTimeISPC = 1e30;
for (unsigned int i = 0; i < test_iterations; i++) {
for (unsigned int i = 0; i < test_iterations[0]; i++) {
memset((void *)fimg, 0, sizeof(float) * width * height * 3);
assert(NSUBSAMPLES == 2);

reset_and_start_timer();
ao_ispc(width, height, NSUBSAMPLES, fimg);
double t = get_elapsed_mcycles();
printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", t);
minTimeISPC = std::min(minTimeISPC, t);
}

// Report results and save image
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n",
printf("[aobench ispc]:\t\t\t[%.3f] million cycles (%d x %d image)\n",
minTimeISPC, width, height);
savePPM("ao-ispc.ppm", width, height);

@@ -147,18 +152,19 @@ int main(int argc, char **argv)
// minimum time for any of them.
//
double minTimeISPCTasks = 1e30;
for (unsigned int i = 0; i < test_iterations; i++) {
for (unsigned int i = 0; i < test_iterations[1]; i++) {
memset((void *)fimg, 0, sizeof(float) * width * height * 3);
assert(NSUBSAMPLES == 2);

reset_and_start_timer();
ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
double t = get_elapsed_mcycles();
printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", t);
minTimeISPCTasks = std::min(minTimeISPCTasks, t);
}

// Report results and save image
printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n",
printf("[aobench ispc + tasks]:\t\t[%.3f] million cycles (%d x %d image)\n",
minTimeISPCTasks, width, height);
savePPM("ao-ispc-tasks.ppm", width, height);

@@ -167,16 +173,17 @@ int main(int argc, char **argv)
// minimum time.
//
double minTimeSerial = 1e30;
for (unsigned int i = 0; i < test_iterations; i++) {
for (unsigned int i = 0; i < test_iterations[2]; i++) {
memset((void *)fimg, 0, sizeof(float) * width * height * 3);
reset_and_start_timer();
ao_serial(width, height, NSUBSAMPLES, fimg);
double t = get_elapsed_mcycles();
printf("@time of serial run:\t\t\t\t[%.3f] million cycles\n", t);
minTimeSerial = std::min(minTimeSerial, t);
}

// Report more results, save another image...
printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial,
printf("[aobench serial]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeSerial,
width, height);
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);

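The loops above follow the usual benchmarking idiom: run each variant several times and report the minimum, so a single noisy run cannot skew the result. A self-contained sketch of that idiom, using std::chrono in place of the examples' reset_and_start_timer()/get_elapsed_mcycles() helpers (those come from the examples' timing header and are not reproduced here):

    #include <algorithm>
    #include <chrono>
    #include <cstdio>
    #include <functional>

    // Run 'fn' 'iters' times and return the fastest wall-clock time in
    // milliseconds; taking the minimum over repeated runs filters out
    // scheduler and cache noise, mirroring the loops in ao.cpp.
    static double minTimeMs(unsigned iters, const std::function<void()> &fn) {
        double best = 1e30;
        for (unsigned i = 0; i < iters; ++i) {
            auto start = std::chrono::steady_clock::now();
            fn();
            std::chrono::duration<double, std::milli> t =
                std::chrono::steady_clock::now() - start;
            best = std::min(best, t.count());
        }
        return best;
    }

    int main() {
        double t = minTimeMs(3, [] { /* kernel under test */ });
        std::printf("best of 3 runs: [%.3f] ms\n", t);
        return 0;
    }
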
@@ -50,7 +50,6 @@ struct Isect {
struct Sphere {
vec center;
float radius;

};

struct Plane {
@@ -75,16 +74,15 @@ static inline vec vcross(vec v0, vec v1) {
return ret;
}

static inline void vnormalize(reference vec v) {
static inline void vnormalize(vec &v) {
float len2 = dot(v, v);
float invlen = rsqrt(len2);
v *= invlen;
}

static inline void
ray_plane_intersect(reference Isect isect, reference Ray ray,
reference Plane plane) {
static void
ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n);

@@ -104,8 +102,7 @@ ray_plane_intersect(reference Isect isect, reference Ray ray,

static inline void
ray_sphere_intersect(reference Isect isect, reference Ray ray,
reference Sphere sphere) {
ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
vec rs = ray.org - sphere.center;

float B = dot(rs, ray.dir);
@@ -126,8 +123,8 @@ ray_sphere_intersect(reference Isect isect, reference Ray ray,
}

static inline void
orthoBasis(reference vec basis[3], vec n) {
static void
orthoBasis(vec basis[3], vec n) {
basis[2] = n;
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;

@@ -149,9 +146,9 @@ orthoBasis(reference vec basis[3], vec n) {
}

static inline float
ambient_occlusion(reference Isect isect, reference Plane plane,
reference Sphere spheres[3], reference RNGState rngstate) {
static float
ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
RNGState &rngstate) {
float eps = 0.0001f;
vec p, n;
vec basis[3];
@@ -168,8 +165,8 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
Ray ray;
Isect occIsect;

float theta = sqrt(frandom(rngstate));
float phi = 2.0f * M_PI * frandom(rngstate);
float theta = sqrt(frandom(&rngstate));
float phi = 2.0f * M_PI * frandom(&rngstate);
float x = cos(phi) * theta;
float y = sin(phi) * theta;
float z = sqrt(1.0 - theta * theta);
@@ -205,113 +202,53 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
*/
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
uniform int h, uniform int nsubsamples,
reference uniform float image[]) {
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static Sphere spheres[3] = {
uniform float image[]) {
static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static uniform Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;

seed_rng(rngstate, y0);
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
float invSamples = 1.f / nsubsamples;

// Compute the mapping between the 'programCount'-wide program
// instances running in parallel and samples in the image.
//
// For now, we'll always take four samples per pixel, so start by
// initializing du and dv with offsets into subpixel samples. We'll
// take care of further updating du and dv for the case where we're
// doing more than 4 program instances in parallel shortly.
uniform float uSteps[4] = { 0, 1, 0, 1 };
uniform float vSteps[4] = { 0, 0, 1, 1 };
float du = uSteps[programIndex % 4] / nsubsamples;
float dv = vSteps[programIndex % 4] / nsubsamples;
foreach_tiled(y = y0 ... y1, x = 0 ... w,
u = 0 ... nsubsamples, v = 0 ... nsubsamples) {
float du = (float)u * invSamples, dv = (float)v * invSamples;

// Now handle the case where we are able to do more than one pixel's
// worth of work at once. nx records the number of pixels in the x
// direction we do per iteration and ny the number in y.
uniform int nx = 1, ny = 1;
// Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f;
Ray ray;
Isect isect;

// FIXME: We actually need ny to be 1 regardless of the decomposition,
// since the task decomposition is one scanline high.
ray.org = 0.f;

if (programCount == 8) {
// Do two pixels at once in the x direction
nx = 2;
if (programIndex >= 4)
// And shift the offsets for the second pixel's worth of work
++du;
}
else if (programCount == 16) {
nx = 4;
ny = 1;
if (programIndex >= 4 && programIndex < 8)
++du;
if (programIndex >= 8 && programIndex < 12)
du += 2;
if (programIndex >= 12)
du += 3;
}
// Poor man's perspective projection
ray.dir.x = px;
ray.dir.y = py;
ray.dir.z = -1.0;
vnormalize(ray.dir);

// Now loop over all of the pixels, stepping in x and y as calculated
// above. (Assumes that ny divides y and nx divides x...)
for (uniform int y = y0; y < y1; y += ny) {
for (uniform int x = 0; x < w; x += nx) {
// Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f;
Ray ray;
Isect isect;
isect.t = 1.0e+17;
isect.hit = 0;

ray.org = 0.f;
for (uniform int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(isect, ray, spheres[snum]);
ray_plane_intersect(isect, ray, plane);

// Poor man's perspective projection
ray.dir.x = px;
ray.dir.y = py;
ray.dir.z = -1.0;
vnormalize(ray.dir);
// Note use of 'coherent' if statement; the set of rays we
// trace will often all hit or all miss the scene
cif (isect.hit) {
ret = ambient_occlusion(isect, plane, spheres, rngstate);
ret *= invSamples * invSamples;

isect.t = 1.0e+17;
isect.hit = 0;

for (uniform int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(isect, ray, spheres[snum]);
ray_plane_intersect(isect, ray, plane);

// Note use of 'coherent' if statement; the set of rays we
// trace will often all hit or all miss the scene
cif (isect.hit)
ret = ambient_occlusion(isect, plane, spheres, rngstate);

// This is a little grungy; we have results for
// programCount-worth of values. Because we're doing 2x2
// subsamples, we need to peel them off in groups of four,
// average the four values for each pixel, and update the
// output image.
//
// Store the varying value to a uniform array of the same size.
// See the discussion about communication among program
// instances in the ispc user's manual for more discussion on
// this idiom.
uniform float retArray[programCount];
retArray[programIndex] = ret;

// offset to the first pixel in the image
uniform int offset = 3 * (y * w + x);
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
// Get the four sample values for this pixel
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
retArray[p+3];

// Normalize by number of samples taken
sumret /= nsubsamples * nsubsamples;

// Store result in the image
image[offset+0] = sumret;
image[offset+1] = sumret;
image[offset+2] = sumret;
}
int offset = 3 * (y * w + x);
atomic_add_local(&image[offset], ret);
atomic_add_local(&image[offset+1], ret);
atomic_add_local(&image[offset+2], ret);
}
}
}
@@ -331,5 +268,5 @@ static void task ao_task(uniform int width, uniform int height,

export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
launch[h] < ao_task(w, h, nsubsamples, image) >;
launch[h] ao_task(w, h, nsubsamples, image);
}

@@ -18,159 +18,17 @@
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>aobench</RootNamespace>
<ISPC_file>ao</ISPC_file>
<default_targets>sse2,sse4,avx1-i32x8</default_targets>
</PropertyGroup>
<Import Project="..\common.props" />
<ItemGroup>
<ClCompile Include="ao.cpp" />
<ClCompile Include="ao_serial.cpp" />
<ClCompile Include="../tasksys.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="ao.ispc">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>aobench</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ExecutablePath);$(ProjectDir)..\..</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

@@ -1,5 +1,5 @@

CXX=g++ -m64
CXX=clang++ -m64
CXXFLAGS=-Iobjs/ -g3 -Wall
ISPC=ispc
ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2
@@ -14,13 +14,13 @@ dirs:
clean:
/bin/rm -rf objs *~ ao

ao: dirs objs/ao.o objs/instrument.o objs/ao_ispc.o
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread
ao: objs/ao.o objs/instrument.o objs/ao_instrumented_ispc.o ../tasksys.cpp
$(CXX) $(CXXFLAGS) -o $@ $^ -lm -lpthread

objs/%.o: %.cpp
objs/%.o: %.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@

objs/ao.o: objs/ao_ispc.h
objs/ao.o: objs/ao_instrumented_ispc.h

objs/%_ispc.h objs/%_ispc.o: %.ispc
objs/%_ispc.h objs/%_ispc.o: %.ispc dirs
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h

@@ -35,6 +35,8 @@
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
// preventing MSVC fopen() deprecation complaints
#define _CRT_SECURE_NO_DEPRECATE
#endif

#include <stdio.h>
@@ -50,7 +52,7 @@
#include <algorithm>
#include <sys/types.h>

#include "ao_ispc.h"
#include "ao_instrumented_ispc.h"
using namespace ispc;

#include "instrument.h"

@@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) {
return ret;
}

static inline void vnormalize(reference vec v) {
static inline void vnormalize(vec &v) {
float len2 = dot(v, v);
float invlen = rsqrt(len2);
v *= invlen;
@@ -83,8 +83,7 @@ static inline void vnormalize(reference vec v) {

static inline void
ray_plane_intersect(reference Isect isect, reference Ray ray,
reference Plane plane) {
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n);

@@ -104,8 +103,7 @@ ray_plane_intersect(reference Isect isect, reference Ray ray,

static inline void
ray_sphere_intersect(reference Isect isect, reference Ray ray,
reference Sphere sphere) {
ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
vec rs = ray.org - sphere.center;

float B = dot(rs, ray.dir);
@@ -127,7 +125,7 @@ ray_sphere_intersect(reference Isect isect, reference Ray ray,

static inline void
orthoBasis(reference vec basis[3], vec n) {
orthoBasis(vec basis[3], vec n) {
basis[2] = n;
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;

@@ -150,8 +148,8 @@ orthoBasis(reference vec basis[3], vec n) {

static inline float
ambient_occlusion(reference Isect isect, reference Plane plane,
reference Sphere spheres[3], reference RNGState rngstate) {
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
RNGState &rngstate) {
float eps = 0.0001f;
vec p, n;
vec basis[3];
@@ -168,8 +166,8 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
Ray ray;
Isect occIsect;

float theta = sqrt(frandom(rngstate));
float phi = 2.0f * M_PI * frandom(rngstate);
float theta = sqrt(frandom(&rngstate));
float phi = 2.0f * M_PI * frandom(&rngstate);
float x = cos(phi) * theta;
float y = sin(phi) * theta;
float z = sqrt(1.0 - theta * theta);
@@ -203,8 +201,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
/* Compute the image for the scanlines from [y0,y1), for an overall image
of width w and height h.
*/
void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
uniform int nsubsamples, reference uniform float image[]) {
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
uniform int h, uniform int nsubsamples,
uniform float image[]) {
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -212,7 +211,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;

seed_rng(rngstate, y0);
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));

// Compute the mapping between the 'programCount'-wide program
// instances running in parallel and samples in the image.
@@ -231,6 +230,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
// direction we do per iteration and ny the number in y.
uniform int nx = 1, ny = 1;

// FIXME: We actually need ny to be 1 regardless of the decomposition,
// since the task decomposition is one scanline high.

if (programCount == 8) {
// Do two pixels at once in the x direction
nx = 2;
@@ -239,19 +241,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
++du;
}
else if (programCount == 16) {
// Two at once in both x and y
nx = ny = 2;
if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
nx = 4;
ny = 1;
if (programIndex >= 4 && programIndex < 8)
++du;
if (programIndex >= 8)
++dv;
if (programIndex >= 8 && programIndex < 12)
du += 2;
if (programIndex >= 12)
du += 3;
}

// Now loop over all of the pixels, stepping in x and y as calculated
// above. (Assumes that ny divides y and nx divides x...)
for (uniform int y = y0; y < y1; y += ny) {
for (uniform int x = 0; x < w; x += nx) {
// Figur out x,y pixel in NDC
// Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f;
@@ -293,7 +297,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,

// offset to the first pixel in the image
uniform int offset = 3 * (y * w + x);
for (uniform int p = 0; p < programCount; p += 4, ++offset) {
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
// Get the four sample values for this pixel
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
retArray[p+3];
@@ -315,3 +319,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
ao_scanlines(0, h, w, h, nsubsamples, image);
}

static void task ao_task(uniform int width, uniform int height,
uniform int nsubsamples, uniform float image[]) {
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
}

export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
launch[h] ao_task(w, h, nsubsamples, image);
}

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
@@ -18,152 +18,18 @@
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="ao.cpp" />
<ClCompile Include="instrument.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="ao.ispc">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --instrument --target=sse2
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --instrument --target=sse2
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --instrument --target=sse2
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --instrument --target=sse2
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>aobench_instrumented</RootNamespace>
<ISPC_file>ao_instrumented</ISPC_file>
<default_targets>sse2</default_targets>
<flags>--instrument</flags>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
<Import Project="..\common.props" />
<ItemGroup>
<ClCompile Include="ao.cpp" />
<ClCompile Include="instrument.cpp" />
<ClCompile Include="../tasksys.cpp" />
</ItemGroup>
</Project>

@@ -34,6 +34,8 @@
#include "instrument.h"
#include <stdio.h>
#include <assert.h>
#include <iomanip>
#include <sstream>
#include <string>
#include <map>

@@ -46,7 +48,7 @@ struct CallInfo {

static std::map<std::string, CallInfo> callInfo;

int countbits(int i) {
int countbits(uint64_t i) {
int ret = 0;
while (i) {
if (i & 0x1)
@@ -60,14 +62,13 @@ int countbits(int i) {
// Callback function that ispc compiler emits calls to when --instrument
// command-line flag is given while compiling.
void
ISPCInstrument(const char *fn, const char *note, int line, int mask) {
char sline[16];
sprintf(sline, "%04d", line);
std::string s = std::string(fn) + std::string("(") + std::string(sline) +
std::string(") - ") + std::string(note);
ISPCInstrument(const char *fn, const char *note, int line, uint64_t mask) {
std::stringstream s;
s << fn << "(" << std::setfill('0') << std::setw(4) << line << ") - "
<< note;

// Find or create a CallInfo instance for this callsite.
CallInfo &ci = callInfo[s];
CallInfo &ci = callInfo[s.str()];

// And update its statistics...
++ci.count;

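Putting the pieces of this hunk together: the callback builds a "file(line) - note" key, looks up a per-callsite record, and folds in the 64-bit lane mask. A condensed, self-contained sketch of that bookkeeping (the laneTotal field is an assumption for illustration; only ++ci.count is visible in the diff):

    #include <cstdint>
    #include <iomanip>
    #include <map>
    #include <sstream>
    #include <string>

    // Hypothetical per-callsite record; the real CallInfo in instrument.cpp
    // may track more fields.
    struct CallInfo {
        int count = 0;       // times this callsite was reached
        int laneTotal = 0;   // sum of active lanes, for average occupancy
    };

    static std::map<std::string, CallInfo> callInfo;

    // Count set bits in the execution mask (one bit per program instance),
    // mirroring countbits(uint64_t) from the diff.
    static int countbits(uint64_t m) {
        int n = 0;
        for (; m; m >>= 1)
            n += (int)(m & 1);
        return n;
    }

    extern "C" void
    ISPCInstrument(const char *fn, const char *note, int line, uint64_t mask) {
        std::stringstream s;   // key: "file(0042) - note"
        s << fn << "(" << std::setfill('0') << std::setw(4) << line
          << ") - " << note;
        CallInfo &ci = callInfo[s.str()];
        ++ci.count;
        ci.laneTotal += countbits(mask);
    }
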
@@ -28,7 +28,7 @@
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef INSTRUMENT_H
@@ -36,8 +36,8 @@

#include <stdint.h>

extern "C" {
void ISPCInstrument(const char *fn, const char *note, int line, int mask);
extern "C" {
void ISPCInstrument(const char *fn, const char *note, int line, uint64_t mask);
}

void ISPCPrintInstrument();

120
examples/common.mk
Normal file
@@ -0,0 +1,120 @@

TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=objs/tasksys.o

CXX=clang++
CXXFLAGS+=-Iobjs/ -O2
CC=clang
CCFLAGS+=-Iobjs/ -O2

LIBS=-lm $(TASK_LIB) -lstdc++
ISPC=ispc
ISPC_FLAGS+=-O2
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)

ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)

ifeq ($(ARCH),x86)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o)
COMMA=,
ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS)))
#$(info multi-target detected: $(ISPC_IA_TARGETS))
ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o)
endif
ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o)
endif
ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o)
endif
ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o)
endif
ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o)
endif
ifneq (,$(findstring avx512knl,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx512knl.o)
endif
ifneq (,$(findstring avx512skx,$(ISPC_IA_TARGETS)))
ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx512skx.o)
endif
endif
ISPC_TARGETS=$(ISPC_IA_TARGETS)
ARCH_BIT:=$(shell getconf LONG_BIT)
ifeq ($(ARCH_BIT),32)
ISPC_FLAGS += --arch=x86
CXXFLAGS += -m32
CCFLAGS += -m32
else
ISPC_FLAGS += --arch=x86-64
CXXFLAGS += -m64
CCFLAGS += -m64
endif
else ifeq ($(ARCH),arm)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o))
ISPC_TARGETS=$(ISPC_ARM_TARGETS)
else
$(error Unknown architecture $(ARCH) from uname -m)
endif

CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))
CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))
OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)

default: $(EXAMPLE)

all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar

.PHONY: dirs clean

dirs:
/bin/mkdir -p objs/

objs/%.cpp objs/%.o objs/%.h: dirs

clean:
/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test

$(EXAMPLE): $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)

objs/%.o: %.cpp dirs $(ISPC_HEADER)
$(CXX) $< $(CXXFLAGS) -c -o $@

objs/%.o: %.c dirs $(ISPC_HEADER)
$(CC) $< $(CCFLAGS) -c -o $@

objs/%.o: ../%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@

objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs

objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o objs/%_ispc_avx512knl.o objs/%_ispc_avx512skx.o : %.ispc dirs
$(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h

objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h

objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp
$(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@

$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)

objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC)
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h

objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
$(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@

$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)

objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1

$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
176
examples/common.props
Normal file
@@ -0,0 +1,176 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v110</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v110</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v110</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v110</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<PropertyGroup Label="User">
<ISPC_compiler Condition=" '$(ISPC_compiler)' == '' ">ispc</ISPC_compiler>
<Target_str Condition=" '$(Target_str)' == '' ">$(default_targets)</Target_str>
<Target_out>$(ISPC_file).obj</Target_out>
<Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('sse2')))">$(Target_out);$(ISPC_file)_sse2.obj</Target_out>
<Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('sse4')))">$(Target_out);$(ISPC_file)_sse4.obj</Target_out>
<Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx1-')))">$(Target_out);$(ISPC_file)_avx.obj</Target_out>
<Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx1.1')))">$(Target_out);$(ISPC_file)_avx11.obj</Target_out>
<Target_out Condition="($(Target_str.Contains(',')) And $(Target_str.Contains('avx2')))">$(Target_out);$(ISPC_file)_avx2.obj</Target_out>
</PropertyGroup>
<ItemGroup>
<CustomBuild Include='$(ISPC_file).ispc'>
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) -g $(flags)</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(ISPC_compiler) -O0 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) -g $(flags)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Target_out)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Target_out)</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) $(flags)</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) $(flags)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Target_out)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Target_out)</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
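The shared property sheet above drives a single parameterized ispc invocation: `-h %(Filename)_ispc.h` asks ispc to emit a C/C++ header declaring every `export` function, and a comma-separated `--target` list makes ispc write one object file per ISA plus a generic dispatch object, which is exactly what the `Target_out` accumulation enumerates. A minimal sketch of a kernel compiled this way; `scale_buffer` is a hypothetical name, not this example's actual kernel:

    // kernels.ispc -- building with
    //   ispc -O2 kernels.ispc -o kernels.obj -h kernels_ispc.h --target=sse2,sse4-x2,avx-x2
    // produces kernels.obj (runtime-dispatch stubs) plus kernels_sse2.obj,
    // kernels_sse4.obj, and kernels_avx.obj, matching the Target_out list.
    export void scale_buffer(uniform float buf[], uniform int count,
                             uniform float factor) {
        // foreach spreads iterations across the SIMD lanes of whichever
        // target the dispatcher selects at run time.
        foreach (i = 0 ... count) {
            buf[i] *= factor;
        }
    }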
@@ -1,38 +1,9 @@

ARCH = $(shell uname)
EXAMPLE=deferred_shading
CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
ISPC_SRC=kernels.ispc
ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16,avx512knl-i32x16,avx512skx-i32x16
ISPC_ARM_TARGETS=neon
ISPC_FLAGS=--opt=fast-math

TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64 --math-lib=fast

OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/kernels_ispc_sse2.o \
	objs/kernels_ispc_sse4.o objs/kernels_ispc_avx.o \
	objs/dynamic_c.o objs/dynamic_cilk.o

default: deferred_shading

.PHONY: dirs clean
.PRECIOUS: objs/kernels_ispc.h

dirs:
	/bin/mkdir -p objs/

clean:
	/bin/rm -rf objs *~ deferred_shading

deferred_shading: dirs $(OBJS) $(TASK_OBJ)
	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)

objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
	$(CXX) $< $(CXXFLAGS) -c -o $@

objs/%.o: ../%.cpp
	$(CXX) $< $(CXXFLAGS) -c -o $@

objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
include ../common.mk

@@ -204,6 +204,7 @@ void WriteFrame(const char *filename, const InputData *input,
    fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
            input->header.framebufferHeight);
    fwrite(framebufferAOS, imageBytes, 1, out);
    fclose(out);

    lAlignedFree(framebufferAOS);
}

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -21,133 +21,11 @@
  <PropertyGroup Label="Globals">
    <ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
    <Keyword>Win32Proj</Keyword>
    <RootNamespace>mandelbrot</RootNamespace>
    <RootNamespace>deferred</RootNamespace>
    <ISPC_file>kernels</ISPC_file>
    <default_targets>sse2,sse4-x2,avx1-x2</default_targets>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
  </ItemDefinitionGroup>
  <Import Project="..\common.props" />
  <ItemGroup>
    <ClCompile Include="common.cpp" />
    <ClCompile Include="dynamic_c.cpp" />
@@ -155,24 +33,4 @@
    <ClCompile Include="main.cpp" />
    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="kernels.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
</Project>

@@ -35,35 +35,35 @@

struct InputDataArrays
{
    uniform float zBuffer[];
    uniform unsigned int16 normalEncoded_x[]; // half float
    uniform unsigned int16 normalEncoded_y[]; // half float
    uniform unsigned int16 specularAmount[]; // half float
    uniform unsigned int16 specularPower[]; // half float
    uniform unsigned int8 albedo_x[]; // unorm8
    uniform unsigned int8 albedo_y[]; // unorm8
    uniform unsigned int8 albedo_z[]; // unorm8
    uniform float lightPositionView_x[];
    uniform float lightPositionView_y[];
    uniform float lightPositionView_z[];
    uniform float lightAttenuationBegin[];
    uniform float lightColor_x[];
    uniform float lightColor_y[];
    uniform float lightColor_z[];
    uniform float lightAttenuationEnd[];
    float *zBuffer;
    unsigned int16 *normalEncoded_x; // half float
    unsigned int16 *normalEncoded_y; // half float
    unsigned int16 *specularAmount; // half float
    unsigned int16 *specularPower; // half float
    unsigned int8 *albedo_x; // unorm8
    unsigned int8 *albedo_y; // unorm8
    unsigned int8 *albedo_z; // unorm8
    float *lightPositionView_x;
    float *lightPositionView_y;
    float *lightPositionView_z;
    float *lightAttenuationBegin;
    float *lightColor_x;
    float *lightColor_y;
    float *lightColor_z;
    float *lightAttenuationEnd;
};

struct InputHeader
{
    uniform float cameraProj[4][4];
    uniform float cameraNear;
    uniform float cameraFar;
    float cameraProj[4][4];
    float cameraNear;
    float cameraFar;

    uniform int32 framebufferWidth;
    uniform int32 framebufferHeight;
    uniform int32 numLights;
    uniform int32 inputDataChunkSize;
    uniform int32 inputDataArrayOffsets[idaNum];
    int32 framebufferWidth;
    int32 framebufferHeight;
    int32 numLights;
    int32 inputDataChunkSize;
    int32 inputDataArrayOffsets[idaNum];
};

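This hunk migrates the struct members from the older per-member `uniform ... []` unsized-array syntax to plain pointer members. A minimal sketch of the newer style in current ispc; `Buffers` and `sum_z` are hypothetical names, not this example's actual code:

    struct Buffers {
        float *zBuffer;   // pointer member replaces "uniform float zBuffer[]"
        int32 count;
    };

    export uniform float sum_z(uniform Buffers &b) {
        float partial = 0.0f;
        // Each program instance reads its own element of the pointed-to array.
        foreach (i = 0 ... b.count) {
            partial += b.zBuffer[i];
        }
        // Fold the per-lane partial sums into a single uniform result.
        return reduce_add(partial);
    }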
@@ -77,8 +77,7 @@ dot3(float x, float y, float z, float a, float b, float c) {


static inline void
normalize3(float x, float y, float z, reference float ox,
           reference float oy, reference float oz) {
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    float n = rsqrt(x*x + y*y + z*z);
    ox = x * n;
    oy = y * n;
@@ -98,7 +97,6 @@ Float32ToUnorm8(float f) {
}


// tile width must be a multiple of programCount (SIMD size)
static void
ComputeZBounds(
    uniform int32 tileStartX, uniform int32 tileEndX,
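Throughout this file the old `reference` parameter keyword becomes a C++-style `&` reference. A minimal sketch of the new form; `minmax` is a hypothetical helper, not part of this example:

    static void minmax(float a, float b, float &lo, float &hi) {
        // "&" now marks an output parameter where older ispc used "reference".
        lo = min(a, b);
        hi = max(a, b);
    }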
@@ -110,17 +108,17 @@ ComputeZBounds(
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    reference uniform float minZ,
    reference uniform float maxZ
    uniform float &minZ,
    uniform float &maxZ
    )
{
    // Find Z bounds
    float laneMinZ = cameraFar;
    float laneMaxZ = cameraNear;
    for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
        for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
        foreach (x = tileStartX ... tileEndX) {
            // Unproject depth buffer Z value into view space
            float z = zBuffer[(y * gBufferWidth + x) + programIndex];
            float z = zBuffer[y * gBufferWidth + x];
            float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);

            // Work out Z bounds for our samples
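The inner-loop rewrite here is the recurring pattern in this change: an explicit `x += programCount` stride with `programIndex` indexing becomes a `foreach`, which masks the ragged final iteration itself, which is why the "tile width must be a multiple of programCount" comments are being deleted. A minimal standalone sketch of the same idiom; `row_min` is a hypothetical name:

    static uniform float row_min(uniform float zBuffer[],
                                 uniform int32 start, uniform int32 end) {
        float laneMin = 1e30f;
        // foreach partitions [start, end) across the program instances and
        // masks off the leftover elements on the last iteration, so the
        // range length need not be a multiple of programCount.
        foreach (x = start ... end) {
            laneMin = min(laneMin, zBuffer[x]);
        }
        // Fold the per-lane partial minima into a single uniform result.
        return reduce_min(laneMin);
    }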
@@ -136,8 +134,6 @@ ComputeZBounds(
}


// tile width must be a multiple of programCount (SIMD size)
// numLights must currently be a multiple of programCount (SIMD size)
export uniform int32
IntersectLightsWithTileMinMax(
    uniform int32 tileStartX, uniform int32 tileEndX,
@@ -156,51 +152,33 @@ IntersectLightsWithTileMinMax(
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    reference uniform int32 tileLightIndices[]
    uniform int32 tileLightIndices[]
    )
{
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;

    // Parallelize across frustum planes.
    // We really only have four side planes here, but write the code to
    // handle programCount > 4 robustly
    uniform float frustumPlanes_xy[programCount];
    uniform float frustumPlanes_z[programCount];
    uniform float frustumPlanes_xy[4] = {
        -(cameraProj_11 * gBufferScale_x),
        (cameraProj_11 * gBufferScale_x),
        (cameraProj_22 * gBufferScale_y),
        -(cameraProj_22 * gBufferScale_y) };
    uniform float frustumPlanes_z[4] = {
        tileEndX - gBufferScale_x,
        -tileStartX + gBufferScale_x,
        tileEndY - gBufferScale_y,
        -tileStartY + gBufferScale_y };

    // TODO: If programIndex < 4 here? Don't care about masking off the
    // rest but if interleaving ("x2" modes) the other lanes should ideally
    // not be emitted...
    {
        // This one is totally constant over the whole screen... worth pulling it up at all?
        float frustumPlanes_xy_v;
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_11 * gBufferScale_x));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2, (cameraProj_22 * gBufferScale_y));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));

        float frustumPlanes_z_v;
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileEndX - gBufferScale_x);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 2, tileEndY - gBufferScale_y);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);

        // Normalize
        float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
                           frustumPlanes_z_v * frustumPlanes_z_v);
        frustumPlanes_xy_v *= norm;
        frustumPlanes_z_v *= norm;

        // Save out for uniform use later
        frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
        frustumPlanes_z[programIndex] = frustumPlanes_z_v;
    for (uniform int i = 0; i < 4; ++i) {
        uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                   frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
    }

    uniform int32 tileNumLights = 0;

    for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights;
         baseLightIndex += programCount) {
        int32 lightIndex = baseLightIndex + programIndex;
    foreach (lightIndex = 0 ... numLights) {
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
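The frustum-plane setup above replaces the lane-by-lane `insert()` construction, which implicitly assumed `programCount >= 4`, with direct initialization of `uniform` arrays normalized in a small scalar loop. A minimal sketch of the new idiom; `normalize_planes` is a hypothetical name:

    // Plane equations stored as (xy, z) coefficient pairs; a uniform array
    // initializer replaces the old insert()/programIndex round trip.
    static void normalize_planes(uniform float xy[4], uniform float z[4]) {
        for (uniform int i = 0; i < 4; ++i) {
            uniform float norm = rsqrt(xy[i] * xy[i] + z[i] * z[i]);
            xy[i] *= norm;
            z[i] *= norm;
        }
    }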
@@ -215,32 +193,31 @@ IntersectLightsWithTileMinMax(
        // don't actually need to mask the rest of this function - this is
        // just a greedy early-out. Could also structure all of this as
        // nested if() statements, but this is a bit easier to read
        if (!any(inFrustum))
            continue;
        if (any(inFrustum)) {
            float light_positionView_x = light_positionView_x_array[lightIndex];
            float light_positionView_y = light_positionView_y_array[lightIndex];

        float light_positionView_x = light_positionView_x_array[lightIndex];
        float light_positionView_y = light_positionView_y_array[lightIndex];
            d = light_positionView_z * frustumPlanes_z[0] +
                light_positionView_x * frustumPlanes_xy[0];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);

        d = light_positionView_z * frustumPlanes_z[0] +
            light_positionView_x * frustumPlanes_xy[0];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[1] +
                light_positionView_x * frustumPlanes_xy[1];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);

        d = light_positionView_z * frustumPlanes_z[1] +
            light_positionView_x * frustumPlanes_xy[1];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
            d = light_positionView_z * frustumPlanes_z[2] +
                light_positionView_y * frustumPlanes_xy[2];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);

        d = light_positionView_z * frustumPlanes_z[2] +
            light_positionView_y * frustumPlanes_xy[2];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);

            d = light_positionView_z * frustumPlanes_z[3] +
                light_positionView_y * frustumPlanes_xy[3];
            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        d = light_positionView_z * frustumPlanes_z[3] +
            light_positionView_y * frustumPlanes_xy[3];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);

        // Pack and store intersecting lights
        cif (inFrustum) {
            tileNumLights += packed_store_active(tileLightIndices, tileNumLights,
                                                 lightIndex);
            // Pack and store intersecting lights
            cif (inFrustum) {
                tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
                                                     lightIndex);
            }
        }
    }

@@ -248,8 +225,6 @@ IntersectLightsWithTileMinMax(
}


// tile width must be a multiple of programCount (SIMD size)
// numLights must currently be a multiple of programCount (SIMD size)
static uniform int32
IntersectLightsWithTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
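With the loop now a `foreach`, the greedy `continue` early-out is restructured as a nested `if (any(inFrustum))` block, and `packed_store_active` switches from an (array, offset, value) calling convention to taking the destination address directly: it compacts the value from each active lane into consecutive slots and returns the number of lanes stored. A minimal standalone sketch of the new form; `collect_even` is a hypothetical name:

    export uniform int32 collect_even(uniform int32 vals[], uniform int32 count,
                                      uniform int32 out[]) {
        uniform int32 numOut = 0;
        foreach (i = 0 ... count) {
            int32 v = vals[i];
            if ((v & 1) == 0) {
                // Writes v from each active lane, packed, starting at
                // &out[numOut]; the return value is how many lanes stored.
                numOut += packed_store_active(&out[numOut], v);
            }
        }
        return numOut;
    }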
@@ -268,7 +243,7 @@ IntersectLightsWithTile(
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    reference uniform int32 tileLightIndices[]
    uniform int32 tileLightIndices[]
    )
{
    uniform float minZ, maxZ;
@@ -287,32 +262,31 @@ IntersectLightsWithTile(
}


// tile width must be a multiple of programCount (SIMD size)
export void
ShadeTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    reference uniform InputDataArrays inputData,
    uniform InputDataArrays &inputData,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    // Light list
    reference uniform int32 tileLightIndices[],
    uniform int32 tileLightIndices[],
    uniform int32 tileNumLights,
    // UI
    uniform bool visualizeLightCount,
    // Output
    reference uniform unsigned int8 framebuffer_r[],
    reference uniform unsigned int8 framebuffer_g[],
    reference uniform unsigned int8 framebuffer_b[]
    uniform unsigned int8 framebuffer_r[],
    uniform unsigned int8 framebuffer_g[],
    uniform unsigned int8 framebuffer_b[]
    )
{
    if (tileNumLights == 0 || visualizeLightCount) {
        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
                int32 framebufferIndex = (y * gBufferWidth + x) + programIndex;
            foreach (x = tileStartX ... tileEndX) {
                int32 framebufferIndex = (y * gBufferWidth + x);
                framebuffer_r[framebufferIndex] = c;
                framebuffer_g[framebufferIndex] = c;
                framebuffer_b[framebufferIndex] = c;
@@ -325,9 +299,8 @@ ShadeTile(
        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
            uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);

            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
                uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
                int32 gBufferOffset = gBufferOffsetBase + programIndex;
            foreach (x = tileStartX ... tileEndX) {
                int32 gBufferOffset = y * gBufferWidth + x;

                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
@@ -337,7 +310,7 @@ ShadeTile(

                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
                float positionScreen_x = (0.5f + (float)(x + programIndex)) *
                float positionScreen_x = (0.5f + (float)(x)) *
                    twoOverGBufferWidth - 1.0f;

                // Unproject depth buffer Z value into view space
@@ -354,8 +327,8 @@ ShadeTile(

                // Reconstruct normal from G-buffer
                float surface_normal_x, surface_normal_y, surface_normal_z;
                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
                float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);

                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
                float m = sqrt(4.0f * f - 1.0f);
@@ -366,9 +339,9 @@ ShadeTile(

                // Load other G-buffer parameters
                float surface_specularAmount =
                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
                    half_to_float(inputData.specularAmount[gBufferOffset]);
                float surface_specularPower =
                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
                    half_to_float(inputData.specularPower[gBufferOffset]);
                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
@@ -478,13 +451,13 @@ ShadeTile(

task void
RenderTile(uniform int num_groups_x, uniform int num_groups_y,
           reference uniform InputHeader inputHeader,
           reference uniform InputDataArrays inputData,
           uniform InputHeader &inputHeader,
           uniform InputDataArrays &inputData,
           uniform int visualizeLightCount,
           // Output
           reference uniform unsigned int8 framebuffer_r[],
           reference uniform unsigned int8 framebuffer_g[],
           reference uniform unsigned int8 framebuffer_b[]) {
           uniform unsigned int8 framebuffer_r[],
           uniform unsigned int8 framebuffer_g[],
           uniform unsigned int8 framebuffer_b[]) {
    uniform int32 group_y = taskIndex / num_groups_x;
    uniform int32 group_x = taskIndex % num_groups_x;
    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
@@ -526,13 +499,13 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,


export void
RenderStatic(reference uniform InputHeader inputHeader,
             reference uniform InputDataArrays inputData,
RenderStatic(uniform InputHeader &inputHeader,
             uniform InputDataArrays &inputData,
             uniform int visualizeLightCount,
             // Output
             reference uniform unsigned int8 framebuffer_r[],
             reference uniform unsigned int8 framebuffer_g[],
             reference uniform unsigned int8 framebuffer_b[]) {
             uniform unsigned int8 framebuffer_r[],
             uniform unsigned int8 framebuffer_g[],
             uniform unsigned int8 framebuffer_b[]) {
    uniform int num_groups_x = (inputHeader.framebufferWidth +
                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    uniform int num_groups_y = (inputHeader.framebufferHeight +
@@ -541,9 +514,9 @@ RenderStatic(reference uniform InputHeader inputHeader,

    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
    // by MIN_TILE_HEIGHT pixels.
    launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
                                    inputHeader, inputData, visualizeLightCount,
                                    framebuffer_r, framebuffer_g, framebuffer_b) >;
    launch[num_groups] RenderTile(num_groups_x, num_groups_y,
                                  inputHeader, inputData, visualizeLightCount,
                                  framebuffer_r, framebuffer_g, framebuffer_b);
}


@@ -551,7 +524,6 @@ RenderStatic(reference uniform InputHeader inputHeader,
// Routines for dynamic decomposition path

// This computes the z min/max range for a whole row worth of tiles.
// The tile width must be a multiple of programCount (SIMD size)
export void
ComputeZBoundsRow(
    uniform int32 tileY,
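The task-launch syntax was updated as well: the angle-bracket form `launch[n] < Task(...) >;` becomes an ordinary call expression, `launch[n] Task(...);`. A minimal sketch, assuming a task system such as the example's tasksys.cpp is linked in; `fill_row` and `fill_image` are hypothetical names:

    task void fill_row(uniform float out[], uniform int32 width, uniform float v) {
        // taskIndex identifies which of the launched tasks this instance is.
        foreach (x = 0 ... width) {
            out[taskIndex * width + x] = v;
        }
    }

    export void fill_image(uniform float out[], uniform int32 width,
                           uniform int32 height, uniform float v) {
        // One task per row; the newer syntax drops the angle brackets.
        // Launched tasks are synchronized implicitly before this exported
        // function returns.
        launch[height] fill_row(out, width, v);
    }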
@@ -564,8 +536,8 @@ ComputeZBoundsRow(
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    reference uniform float minZArray[],
    reference uniform float maxZArray[]
    uniform float minZArray[],
    uniform float maxZArray[]
    )
{
    for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
@@ -596,47 +568,35 @@ SplitTileMinMax(
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    reference uniform int32 lightIndices[],
    uniform int32 lightIndices[],
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Outputs
    // TODO: ISPC doesn't currently like multidimensional arrays so we'll do the
    // indexing math ourselves
    reference uniform int32 subtileIndices[],
    uniform int32 subtileIndices[],
    uniform int32 subtileIndicesPitch,
    reference uniform int32 subtileNumLights[]
    uniform int32 subtileNumLights[]
    )
{
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;

    // Parallelize across frustum planes
    // Only have 2 frustum split planes here so may not be worth it, but
    // we'll do it for now for consistency
    uniform float frustumPlanes_xy[programCount];
    uniform float frustumPlanes_z[programCount];

    // This one is totally constant over the whole screen... worth pulling it up at all?
    float frustumPlanes_xy_v;
    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_22 * gBufferScale_y));

    float frustumPlanes_z_v;
    frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
    frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
    uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
                                          (cameraProj_22 * gBufferScale_y) };
    uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
                                         tileMidY - gBufferScale_y };

    // Normalize
    float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
                       frustumPlanes_z_v * frustumPlanes_z_v);
    frustumPlanes_xy_v *= norm;
    frustumPlanes_z_v *= norm;

    // Save out for uniform use later
    frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
    frustumPlanes_z[programIndex] = frustumPlanes_z_v;
    uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
                                    frustumPlanes_z[0] * frustumPlanes_z[0]),
                              rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
                                    frustumPlanes_z[1] * frustumPlanes_z[1]) };
    frustumPlanes_xy[0] *= norm[0];
    frustumPlanes_xy[1] *= norm[1];
    frustumPlanes_z[0] *= norm[0];
    frustumPlanes_z[1] *= norm[1];

    // Initialize
    uniform int32 subtileLightOffset[4];
@@ -645,12 +605,7 @@ SplitTileMinMax(
    subtileLightOffset[2] = 2 * subtileIndicesPitch;
    subtileLightOffset[3] = 3 * subtileIndicesPitch;

    for (int32 i = programIndex; i < numLights; i += programCount) {
        // TODO: ISPC says gather required here when it actually
        // isn't... this could be fixed by nesting an if() within a
        // uniform loop, but I'm not totally sure if that's a win
        // overall. For now we'll just eat the perf cost for cleanliness
        // since the below are real gathers anyway.
    foreach (i = 0 ... numLights) {
        int32 lightIndex = lightIndices[i];

        float light_positionView_x = light_positionView_x_array[lightIndex];
@@ -693,21 +648,21 @@ SplitTileMinMax(
        // Pack and store intersecting lights
        // TODO: Experiment with a loop here instead
        cif (inFrustum[0])
            subtileLightOffset[0] += packed_store_active(subtileIndices,
                                                         subtileLightOffset[0],
                                                         lightIndex);
            subtileLightOffset[0] +=
                packed_store_active(&subtileIndices[subtileLightOffset[0]],
                                    lightIndex);
        cif (inFrustum[1])
            subtileLightOffset[1] += packed_store_active(subtileIndices,
                                                         subtileLightOffset[1],
                                                         lightIndex);
            subtileLightOffset[1] +=
                packed_store_active(&subtileIndices[subtileLightOffset[1]],
                                    lightIndex);
        cif (inFrustum[2])
            subtileLightOffset[2] += packed_store_active(subtileIndices,
                                                         subtileLightOffset[2],
                                                         lightIndex);
            subtileLightOffset[2] +=
                packed_store_active(&subtileIndices[subtileLightOffset[2]],
                                    lightIndex);
        cif (inFrustum[3])
            subtileLightOffset[3] += packed_store_active(subtileIndices,
                                                         subtileLightOffset[3],
                                                         lightIndex);
            subtileLightOffset[3] +=
                packed_store_active(&subtileIndices[subtileLightOffset[3]],
                                    lightIndex);
    }

    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;

Some files were not shown because too many files have changed in this diff.