Merge branch 'master' of https://github.com/ispc/ispc

2012-10-24 13:49:04 -04:00
parent d665e2e85b 172a189c6f
commit 7c16292cb7
1391 changed files with 202705 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,15 @@
+*.pyc
+*~
+depend
+ispc
+ispc_test
+objs
+docs/doxygen
+docs/*.html
+tests*/*cpp
+tests*/*run
+examples/*/*.png
+examples/*/*.ppm
+examples/*/objs/*
+
+
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -0,0 +1,143 @@
+Copyright (c) 2010-2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+===========================================================================
+Copyrights and Licenses for Third Party Software Distributed with
+The Intel(r) SPMD Program Compiler
+===========================================================================
+
+ISPC incorporates code from the Syrah library, which is covered by the
+following license:
+
+Copyright (c) 2009, Stanford University, and authors listed below.
+All rights reserved.
+
+Original authors:
+  Solomon Boulos
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+Neither the name of Stanford University nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+---------------------------------------------------------------------------
+
+Binary distributions of ISPC are linked with the LLVM libraries, which are
+covered by the following license:
+
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+---------------------------------------------------------------------------
+
+ispc's code to convert to and from half-precision floats is based on James
+Tursa's code, which is covered by the following license:
+
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted provided that the following conditions are 
+met:
+
+   * Redistributions of source code must retain the above copyright 
+     notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above copyright 
+     notice, this list of conditions and the following disclaimer in 
+     the documentation and/or other materials provided with the distribution
+      
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+POSSIBILITY OF SUCH DAMAGE.
--- a/Makefile
+++ b/Makefile
@@ -0,0 +1,172 @@
+#
+# ispc Makefile
+#
+
+# If you have your own special version of llvm and/or clang, change
+# these variables to match.
+LLVM_CONFIG=$(shell which llvm-config)
+CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
+
+# Add llvm bin to the path so any scripts run will go to the right llvm-config
+LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
+export PATH:=$(LLVM_BIN):$(PATH)
+
+ARCH_OS = $(shell uname)
+ifeq ($(ARCH_OS), Darwin)
+	ARCH_OS2 = "OSX"
+else
+	ARCH_OS2 = $(shell uname -o)
+endif
+ARCH_TYPE = $(shell arch)
+
+LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs)
+
+CLANG=clang
+CLANG_LIBS = -lclangFrontend -lclangDriver \
+             -lclangSerialization -lclangParse -lclangSema \
+             -lclangAnalysis -lclangAST -lclangLex -lclangBasic
+ifneq ($(shell $(LLVM_CONFIG) --version), 3.0)
+  CLANG_LIBS += -lclangEdit
+endif
+
+ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
+	-lpthread
+
+ifeq ($(ARCH_OS),Linux)
+	ISPC_LIBS += -ldl
+endif
+
+ifeq ($(ARCH_OS2),Msys)
+	ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
+endif
+
+LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
+LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
+LLVM_VERSION_DEF=-D$(LLVM_VERSION)
+
+BUILD_DATE=$(shell date +%Y%m%d)
+BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
+
+CXX=g++
+CPP=cpp
+OPT=-O2
+CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE)  \
+	-Wall $(LLVM_VERSION_DEF) \
+	-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
+
+LDFLAGS=
+ifeq ($(ARCH_OS),Linux)
+  # try to link everything statically under Linux (including libstdc++) so
+  # that the binaries we generate will be portable across distributions...
+#    LDFLAGS=-static
+endif
+
+LEX=flex
+YACC=bison -d -v -t
+
+###########################################################################
+
+CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
+	ispc.cpp llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp \
+	type.cpp util.cpp
+HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
+	opt.h stmt.h sym.h type.h util.h
+TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
+	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
+BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
+	builtins/dispatch.ll
+# Object files for the builtins: one for each target/dispatch .ll file plus
+# the 32-bit and 64-bit builds of builtins/builtins.c.  All of them are
+# compiled from generated C++ sources in objs/ by the objs/%.o: objs/%.cpp
+# rule, so they must be listed as .o files here (listing the .cpp files
+# would instead hand them to the link command to be compiled there, without
+# $(CXXFLAGS)).
+BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
+	builtins-c-32.o builtins-c-64.o
+BISON_SRC=parse.yy
+FLEX_SRC=lex.ll
+
+OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
+	stdlib_generic_ispc.o stdlib_x86_ispc.o \
+	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
+
+default: ispc
+
+.PHONY: dirs clean depend doxygen print_llvm_src llvm_check
+.PRECIOUS: objs/builtins-%.cpp
+
+# Regenerate the dependency information for the C++ sources.  The sed
+# expression prefixes each generated rule's target with objs/ so that it
+# matches where the object files are actually built.  The configured C++
+# compiler is used so that overriding CXX is honored here as well.
+depend: llvm_check $(CXX_SRC) $(HEADERS)
+	@echo Updating dependencies
+	@$(CXX) -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend
+
+-include depend
+
+dirs:
+	@echo Creating objs/ directory
+	@/bin/mkdir -p objs
+
+# Verify that the llvm-config selected by LLVM_CONFIG can actually be run,
+# and stop the build with an explanatory message if it can't.
+llvm_check:
+	@$(LLVM_CONFIG) --version > /dev/null || \
+	(echo; \
+	 echo "******************************************"; \
+	 echo "ERROR: llvm-config not found in your PATH";  \
+	 echo "******************************************"; \
+	 echo; exit 1)
+
+# Report which LLVM version and library directory the build is using, as
+# reported by the llvm-config selected by LLVM_CONFIG.
+print_llvm_src: llvm_check
+	@echo Using LLVM `$(LLVM_CONFIG) --version` from `$(LLVM_CONFIG) --libdir`
+
+clean:
+	/bin/rm -rf objs ispc
+
+doxygen:
+	/bin/rm -rf docs/doxygen
+	doxygen doxygen.cfg
+
+ispc: print_llvm_src dirs $(OBJS)
+	@echo Creating ispc executable
+	@$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
+
+objs/%.o: %.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+objs/cbackend.o: cbackend.cpp
+	@echo Compiling $<
+	@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
+
+objs/%.o: objs/%.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+objs/parse.cc: parse.yy
+	@echo Running bison on $<
+	@$(YACC) -o $@ $<
+
+objs/parse.o: objs/parse.cc $(HEADERS)
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+objs/lex.cpp: lex.ll 
+	@echo Running flex on $<
+	@$(LEX) -o $@ $<
+
+objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
+	@echo Creating C++ source from builtins definition file $<
+	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | python bitcode2cpp.py $< > $@
+
+objs/builtins-c-32.cpp: builtins/builtins.c
+	@echo Creating C++ source from builtins definition file $<
+	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-32 > $@
+
+objs/builtins-c-64.cpp: builtins/builtins.c
+	@echo Creating C++ source from builtins definition file $<
+	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-64 > $@
+
+objs/stdlib_generic_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $< for generic
+	@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+		python stdlib2cpp.py generic > $@
+
+objs/stdlib_x86_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $< for x86
+	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
+		python stdlib2cpp.py x86 > $@
--- a/README.rst
+++ b/README.rst
@@ -0,0 +1,90 @@
+==============================
+Intel(r) SPMD Program Compiler
+==============================
+
+``ispc`` is a compiler for a variant of the C programming language, with
+extensions for `single program, multiple data
+<http://en.wikipedia.org/wiki/SPMD>`_ programming.  Under the SPMD model,
+the programmer writes a program that generally appears to be a regular
+serial program, though the execution model is actually that a number of
+*program instances* execute in parallel on the hardware.
+
+Overview
+--------
+
+``ispc`` compiles a C-based SPMD programming language to run on the SIMD
+units of CPUs; it frequently provides a 3x or more speedup on CPUs with
+4-wide vector SSE units and 5x-6x on CPUs with 8-wide AVX vector units,
+without any of the difficulty of writing intrinsics code.  Parallelization
+across multiple cores is also supported by ``ispc``, making it
+possible to write programs that achieve performance improvement that scales
+by both number of cores and vector unit size.
+
+There are a few key principles in the design of ``ispc``:
+
+  * To build a small set of extensions to the C language that
+    would deliver excellent performance to performance-oriented
+    programmers who want to run SPMD programs on the CPU.
+
+  * To provide a thin abstraction layer between the programmer
+    and the hardware--in particular, to have an execution and
+    data model where the programmer can cleanly reason about the
+    mapping of their source program to compiled assembly language
+    and the underlying hardware.
+
+  * To make it possible to harness the computational power of SIMD
+    vector units without the extremely low-programmer-productivity
+    activity of directly writing intrinsics.
+
+  * To explore opportunities from close coupling between C/C++
+    application code and SPMD ``ispc`` code running on the
+    same processor--to have lightweight function calls between
+    the two languages and to share data directly via pointers without
+    copying or reformatting.
+
+``ispc`` is an open source compiler with the BSD license.  It uses the
+remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
+code generation and optimization and is `hosted on
+github <http://github.com/ispc/ispc/>`_. It supports Windows, Mac, and
+Linux, with both x86 and x86-64 targets.  It currently supports the SSE2,
+SSE4, AVX1, and AVX2 instruction sets.
+
+Features
+--------
+
+``ispc`` provides a number of key features to developers:
+
+  * Familiarity as an extension of the C programming
+    language: ``ispc`` supports familiar C syntax and
+    programming idioms, while adding the ability to write SPMD
+    programs.
+
+  * High-quality SIMD code generation: the performance
+    of code generated by ``ispc`` is often close to that of
+    hand-written intrinsics code.
+
+  * Ease of adoption with existing software
+    systems: functions written in ``ispc`` directly
+    interoperate with application functions written in C/C++ and
+    with application data structures.
+            
+  * Portability across over a decade of CPU
+    generations: ``ispc`` has targets for SSE2, SSE4, AVX1,
+    and AVX2.
+
+  * Portability across operating systems: Microsoft
+    Windows, Mac OS X, and Linux are all supported
+    by ``ispc``.
+
+  * Debugging with standard tools: ``ispc``
+    programs can be debugged with standard debuggers (OS X and
+    Linux only).
+
+Additional Resources
+--------------------
+
+Prebuilt ``ispc`` binaries for Windows, OS X and Linux can be downloaded
+from the `ispc downloads page <http://ispc.github.com/downloads.html>`_.
+See also additional
+`documentation <http://ispc.github.com/documentation.html>`_ and additional
+`performance information <http://ispc.github.com/perf.html>`_.
--- a/ast.cpp
+++ b/ast.cpp
@@ -0,0 +1,483 @@
+/*
+  Copyright (c) 2011-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file ast.cpp
+
+    @brief General functionality related to abstract syntax trees and
+    traversal of them.
+ */
+
+#include "ast.h"
+#include "expr.h"
+#include "func.h"
+#include "stmt.h"
+#include "sym.h"
+#include "util.h"
+
+///////////////////////////////////////////////////////////////////////////
+// ASTNode
+
+// Out-of-line definition of the ASTNode destructor; there is nothing for
+// it to clean up.
+ASTNode::~ASTNode() { }
+
+
+///////////////////////////////////////////////////////////////////////////
+// AST
+
+/** Creates a new Function from the given symbol and body code and appends
+    it to the AST's collection of functions.  If the symbol is NULL, the
+    call is silently ignored.
+ */
+void
+AST::AddFunction(Symbol *sym, Stmt *code) {
+    if (sym != NULL) {
+        Function *function = new Function(sym, code);
+        functions.push_back(function);
+    }
+}
+
+
+/** Calls GenerateIR() on each of the functions stored in the AST, in
+    index order.
+ */
+void
+AST::GenerateIR() {
+    unsigned int index = 0;
+    while (index < functions.size()) {
+        functions[index]->GenerateIR();
+        ++index;
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+/** Recursively traverses the AST rooted at the given node.
+
+    If preFunc is non-NULL, it is called on each node before that node's
+    children are visited; if it returns false, the node's children are
+    skipped and the node is returned as is (postFunc is not called for
+    it).  Otherwise each child of the node is walked in turn and the
+    node's pointer to the child is replaced with the result of walking
+    it.  Finally, if postFunc is non-NULL, it is called on the node and
+    its return value is returned in place of the node.  The data pointer
+    is handed to both callbacks unchanged.  A NULL node is returned
+    immediately without calling either callback.
+ */
+ASTNode *
+WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
+        void *data) {
+    if (node == NULL)
+        return node;
+
+    // Give the pre-callback the first look at the node; a false return
+    // value means that we shouldn't descend into its children.
+    if (preFunc != NULL && preFunc(node, data) == false)
+        return node;
+
+    if (dynamic_cast<Stmt *>(node) != NULL) {
+        ////////////////////////////////////////////////////////////////////
+        // Statements: figure out which kind we have and walk its children.
+        if (ExprStmt *exprStmt = dynamic_cast<ExprStmt *>(node)) {
+            exprStmt->expr =
+                (Expr *)WalkAST(exprStmt->expr, preFunc, postFunc, data);
+        }
+        else if (DeclStmt *declStmt = dynamic_cast<DeclStmt *>(node)) {
+            // Only the initializer expressions of the declared variables
+            // are children here.
+            for (unsigned int i = 0; i < declStmt->vars.size(); ++i) {
+                declStmt->vars[i].init =
+                    (Expr *)WalkAST(declStmt->vars[i].init, preFunc, postFunc,
+                                    data);
+            }
+        }
+        else if (IfStmt *ifStmt = dynamic_cast<IfStmt *>(node)) {
+            ifStmt->test =
+                (Expr *)WalkAST(ifStmt->test, preFunc, postFunc, data);
+            ifStmt->trueStmts =
+                (Stmt *)WalkAST(ifStmt->trueStmts, preFunc, postFunc, data);
+            ifStmt->falseStmts =
+                (Stmt *)WalkAST(ifStmt->falseStmts, preFunc, postFunc, data);
+        }
+        else if (DoStmt *doStmt = dynamic_cast<DoStmt *>(node)) {
+            doStmt->testExpr =
+                (Expr *)WalkAST(doStmt->testExpr, preFunc, postFunc, data);
+            doStmt->bodyStmts =
+                (Stmt *)WalkAST(doStmt->bodyStmts, preFunc, postFunc, data);
+        }
+        else if (ForStmt *forStmt = dynamic_cast<ForStmt *>(node)) {
+            forStmt->init =
+                (Stmt *)WalkAST(forStmt->init, preFunc, postFunc, data);
+            forStmt->test =
+                (Expr *)WalkAST(forStmt->test, preFunc, postFunc, data);
+            forStmt->step =
+                (Stmt *)WalkAST(forStmt->step, preFunc, postFunc, data);
+            forStmt->stmts =
+                (Stmt *)WalkAST(forStmt->stmts, preFunc, postFunc, data);
+        }
+        else if (ForeachStmt *foreachStmt = dynamic_cast<ForeachStmt *>(node)) {
+            // All of the start expressions, then all of the end
+            // expressions, then the loop body.
+            for (unsigned int i = 0; i < foreachStmt->startExprs.size(); ++i) {
+                foreachStmt->startExprs[i] =
+                    (Expr *)WalkAST(foreachStmt->startExprs[i], preFunc,
+                                    postFunc, data);
+            }
+            for (unsigned int i = 0; i < foreachStmt->endExprs.size(); ++i) {
+                foreachStmt->endExprs[i] =
+                    (Expr *)WalkAST(foreachStmt->endExprs[i], preFunc,
+                                    postFunc, data);
+            }
+            foreachStmt->stmts =
+                (Stmt *)WalkAST(foreachStmt->stmts, preFunc, postFunc, data);
+        }
+        else if (ForeachActiveStmt *foreachActive =
+                 dynamic_cast<ForeachActiveStmt *>(node)) {
+            foreachActive->stmts =
+                (Stmt *)WalkAST(foreachActive->stmts, preFunc, postFunc, data);
+        }
+        else if (ForeachUniqueStmt *foreachUnique =
+                 dynamic_cast<ForeachUniqueStmt *>(node)) {
+            foreachUnique->expr =
+                (Expr *)WalkAST(foreachUnique->expr, preFunc, postFunc, data);
+            foreachUnique->stmts =
+                (Stmt *)WalkAST(foreachUnique->stmts, preFunc, postFunc, data);
+        }
+        else if (CaseStmt *caseStmt = dynamic_cast<CaseStmt *>(node)) {
+            caseStmt->stmts =
+                (Stmt *)WalkAST(caseStmt->stmts, preFunc, postFunc, data);
+        }
+        else if (DefaultStmt *defaultStmt = dynamic_cast<DefaultStmt *>(node)) {
+            defaultStmt->stmts =
+                (Stmt *)WalkAST(defaultStmt->stmts, preFunc, postFunc, data);
+        }
+        else if (SwitchStmt *switchStmt = dynamic_cast<SwitchStmt *>(node)) {
+            switchStmt->expr =
+                (Expr *)WalkAST(switchStmt->expr, preFunc, postFunc, data);
+            switchStmt->stmts =
+                (Stmt *)WalkAST(switchStmt->stmts, preFunc, postFunc, data);
+        }
+        else if (dynamic_cast<BreakStmt *>(node) != NULL ||
+                 dynamic_cast<ContinueStmt *>(node) != NULL ||
+                 dynamic_cast<GotoStmt *>(node) != NULL) {
+            // These statements have no children to walk.
+        }
+        else if (LabeledStmt *labeledStmt = dynamic_cast<LabeledStmt *>(node)) {
+            labeledStmt->stmt =
+                (Stmt *)WalkAST(labeledStmt->stmt, preFunc, postFunc, data);
+        }
+        else if (ReturnStmt *returnStmt = dynamic_cast<ReturnStmt *>(node)) {
+            returnStmt->expr =
+                (Expr *)WalkAST(returnStmt->expr, preFunc, postFunc, data);
+        }
+        else if (StmtList *stmtList = dynamic_cast<StmtList *>(node)) {
+            for (unsigned int i = 0; i < stmtList->stmts.size(); ++i) {
+                stmtList->stmts[i] =
+                    (Stmt *)WalkAST(stmtList->stmts[i], preFunc, postFunc,
+                                    data);
+            }
+        }
+        else if (PrintStmt *printStmt = dynamic_cast<PrintStmt *>(node)) {
+            printStmt->values =
+                (Expr *)WalkAST(printStmt->values, preFunc, postFunc, data);
+        }
+        else if (AssertStmt *assertStmt = dynamic_cast<AssertStmt *>(node)) {
+            assertStmt->expr =
+                (Expr *)WalkAST(assertStmt->expr, preFunc, postFunc, data);
+        }
+        else if (DeleteStmt *deleteStmt = dynamic_cast<DeleteStmt *>(node)) {
+            deleteStmt->expr =
+                (Expr *)WalkAST(deleteStmt->expr, preFunc, postFunc, data);
+        }
+        else if (UnmaskedStmt *unmaskedStmt =
+                 dynamic_cast<UnmaskedStmt *>(node)) {
+            unmaskedStmt->stmts =
+                (Stmt *)WalkAST(unmaskedStmt->stmts, preFunc, postFunc, data);
+        }
+        else {
+            FATAL("Unhandled statement type in WalkAST()");
+        }
+    }
+    else {
+        ////////////////////////////////////////////////////////////////////
+        // Expressions: anything that isn't a statement must be one.
+        Assert(dynamic_cast<Expr *>(node) != NULL);
+
+        if (UnaryExpr *unaryExpr = dynamic_cast<UnaryExpr *>(node)) {
+            unaryExpr->expr =
+                (Expr *)WalkAST(unaryExpr->expr, preFunc, postFunc, data);
+        }
+        else if (BinaryExpr *binaryExpr = dynamic_cast<BinaryExpr *>(node)) {
+            binaryExpr->arg0 =
+                (Expr *)WalkAST(binaryExpr->arg0, preFunc, postFunc, data);
+            binaryExpr->arg1 =
+                (Expr *)WalkAST(binaryExpr->arg1, preFunc, postFunc, data);
+        }
+        else if (AssignExpr *assignExpr = dynamic_cast<AssignExpr *>(node)) {
+            assignExpr->lvalue =
+                (Expr *)WalkAST(assignExpr->lvalue, preFunc, postFunc, data);
+            assignExpr->rvalue =
+                (Expr *)WalkAST(assignExpr->rvalue, preFunc, postFunc, data);
+        }
+        else if (SelectExpr *selectExpr = dynamic_cast<SelectExpr *>(node)) {
+            selectExpr->test =
+                (Expr *)WalkAST(selectExpr->test, preFunc, postFunc, data);
+            selectExpr->expr1 =
+                (Expr *)WalkAST(selectExpr->expr1, preFunc, postFunc, data);
+            selectExpr->expr2 =
+                (Expr *)WalkAST(selectExpr->expr2, preFunc, postFunc, data);
+        }
+        else if (ExprList *exprList = dynamic_cast<ExprList *>(node)) {
+            for (unsigned int i = 0; i < exprList->exprs.size(); ++i) {
+                exprList->exprs[i] =
+                    (Expr *)WalkAST(exprList->exprs[i], preFunc, postFunc,
+                                    data);
+            }
+        }
+        else if (FunctionCallExpr *callExpr =
+                 dynamic_cast<FunctionCallExpr *>(node)) {
+            callExpr->func =
+                (Expr *)WalkAST(callExpr->func, preFunc, postFunc, data);
+            callExpr->args =
+                (ExprList *)WalkAST(callExpr->args, preFunc, postFunc, data);
+            callExpr->launchCountExpr =
+                (Expr *)WalkAST(callExpr->launchCountExpr, preFunc, postFunc,
+                                data);
+        }
+        else if (IndexExpr *indexExpr = dynamic_cast<IndexExpr *>(node)) {
+            indexExpr->baseExpr =
+                (Expr *)WalkAST(indexExpr->baseExpr, preFunc, postFunc, data);
+            indexExpr->index =
+                (Expr *)WalkAST(indexExpr->index, preFunc, postFunc, data);
+        }
+        else if (MemberExpr *memberExpr = dynamic_cast<MemberExpr *>(node)) {
+            memberExpr->expr =
+                (Expr *)WalkAST(memberExpr->expr, preFunc, postFunc, data);
+        }
+        else if (TypeCastExpr *castExpr = dynamic_cast<TypeCastExpr *>(node)) {
+            castExpr->expr =
+                (Expr *)WalkAST(castExpr->expr, preFunc, postFunc, data);
+        }
+        else if (ReferenceExpr *refExpr = dynamic_cast<ReferenceExpr *>(node)) {
+            refExpr->expr =
+                (Expr *)WalkAST(refExpr->expr, preFunc, postFunc, data);
+        }
+        else if (PtrDerefExpr *ptrDeref = dynamic_cast<PtrDerefExpr *>(node)) {
+            ptrDeref->expr =
+                (Expr *)WalkAST(ptrDeref->expr, preFunc, postFunc, data);
+        }
+        else if (RefDerefExpr *refDeref = dynamic_cast<RefDerefExpr *>(node)) {
+            refDeref->expr =
+                (Expr *)WalkAST(refDeref->expr, preFunc, postFunc, data);
+        }
+        else if (SizeOfExpr *sizeOfExpr = dynamic_cast<SizeOfExpr *>(node)) {
+            sizeOfExpr->expr =
+                (Expr *)WalkAST(sizeOfExpr->expr, preFunc, postFunc, data);
+        }
+        else if (AddressOfExpr *addrOfExpr =
+                 dynamic_cast<AddressOfExpr *>(node)) {
+            addrOfExpr->expr =
+                (Expr *)WalkAST(addrOfExpr->expr, preFunc, postFunc, data);
+        }
+        else if (NewExpr *newExpr = dynamic_cast<NewExpr *>(node)) {
+            newExpr->countExpr =
+                (Expr *)WalkAST(newExpr->countExpr, preFunc, postFunc, data);
+            newExpr->initExpr =
+                (Expr *)WalkAST(newExpr->initExpr, preFunc, postFunc, data);
+        }
+        else if (dynamic_cast<SymbolExpr *>(node) != NULL ||
+                 dynamic_cast<ConstExpr *>(node) != NULL ||
+                 dynamic_cast<FunctionSymbolExpr *>(node) != NULL ||
+                 dynamic_cast<SyncExpr *>(node) != NULL ||
+                 dynamic_cast<NullPointerExpr *>(node) != NULL) {
+            // These expressions have no children to walk.
+        }
+        else {
+            FATAL("Unhandled expression type in WalkAST().");
+        }
+    }
+
+    // With the children taken care of, the post-callback (if there is
+    // one) gets to decide which node is returned to the caller.
+    if (postFunc == NULL)
+        return node;
+    return postFunc(node, data);
+}
+
+
+/** WalkAST() post-callback that returns whatever the node's Optimize()
+    method returns.  The user data pointer isn't used.
+ */
+static ASTNode *
+lOptimizeNode(ASTNode *node, void * /* data */) {
+    ASTNode *optimized = node->Optimize();
+    return optimized;
+}
+
+
+/** Walks the AST rooted at the given node, calling Optimize() on each
+    node after its children have been walked, and returns the resulting
+    root node.
+ */
+ASTNode *
+Optimize(ASTNode *root) {
+    ASTNode *optimizedRoot = WalkAST(root, NULL, lOptimizeNode, NULL);
+    return optimizedRoot;
+}
+
+
+/** Convenience overload of Optimize() for expressions; forwards to the
+    ASTNode variant and casts the result back to an Expr pointer.
+ */
+Expr *
+Optimize(Expr *expr) {
+    ASTNode *result = Optimize(static_cast<ASTNode *>(expr));
+    return static_cast<Expr *>(result);
+}
+
+
+/** Convenience overload of Optimize() for statements; forwards to the
+    ASTNode variant and casts the result back to a Stmt pointer.
+ */
+Stmt *
+Optimize(Stmt *stmt) {
+    ASTNode *result = Optimize(static_cast<ASTNode *>(stmt));
+    return static_cast<Stmt *>(result);
+}
+
+
+/** WalkAST() post-callback that returns whatever the node's TypeCheck()
+    method returns.  The user data pointer isn't used.
+ */
+static ASTNode *
+lTypeCheckNode(ASTNode *node, void * /* data */) {
+    ASTNode *checked = node->TypeCheck();
+    return checked;
+}
+
+
+/** Walks the AST rooted at the given node, calling TypeCheck() on each
+    node after its children have been walked, and returns the resulting
+    root node.
+ */
+ASTNode *
+TypeCheck(ASTNode *root) {
+    ASTNode *checkedRoot = WalkAST(root, NULL, lTypeCheckNode, NULL);
+    return checkedRoot;
+}
+
+
+/** Convenience overload of TypeCheck() for expressions; forwards to the
+    ASTNode variant and casts the result back to an Expr pointer.
+ */
+Expr *
+TypeCheck(Expr *expr) {
+    ASTNode *result = TypeCheck(static_cast<ASTNode *>(expr));
+    return static_cast<Expr *>(result);
+}
+
+
+/** Convenience overload of TypeCheck() for statements; forwards to the
+    ASTNode variant and casts the result back to a Stmt pointer.
+ */
+Stmt *
+TypeCheck(Stmt *stmt) {
+    ASTNode *result = TypeCheck(static_cast<ASTNode *>(stmt));
+    return static_cast<Stmt *>(result);
+}
+
+
+/** State threaded through WalkAST() by EstimateCost(): the cost summed up
+    so far and the number of foreach statements the traversal is currently
+    inside of.  Both start out at zero.
+ */
+struct CostData {
+    CostData() : cost(0), foreachDepth(0) { }
+
+    int cost;
+    int foreachDepth;
+};
+
+
+/** WalkAST() pre-callback for cost estimation.  Entering a ForeachStmt
+    increases the foreach nesting depth; the node's own cost estimate is
+    only added to the total while that depth is zero, so a foreach
+    statement and everything inside of it contribute nothing.  Always
+    returns true so that the traversal continues into the children.
+ */
+static bool
+lCostCallbackPre(ASTNode *node, void *d) {
+    CostData *costData = static_cast<CostData *>(d);
+
+    const bool enteringForeach = (dynamic_cast<ForeachStmt *>(node) != NULL);
+    if (enteringForeach)
+        costData->foreachDepth += 1;
+
+    if (costData->foreachDepth == 0) {
+        const int nodeCost = node->EstimateCost();
+        costData->cost += nodeCost;
+    }
+    return true;
+}
+
+
+/** WalkAST() post-callback for cost estimation: undoes the foreach depth
+    increment made by lCostCallbackPre() when leaving a ForeachStmt.  The
+    node itself is returned unchanged.
+ */
+static ASTNode *
+lCostCallbackPost(ASTNode *node, void *d) {
+    CostData *costData = static_cast<CostData *>(d);
+
+    const bool leavingForeach = (dynamic_cast<ForeachStmt *>(node) != NULL);
+    if (leavingForeach)
+        costData->foreachDepth -= 1;
+    return node;
+}
+
+
+/** Returns the sum of the EstimateCost() values of the nodes in the AST
+    rooted at the given node, leaving out foreach statements and all of
+    the nodes inside of them.
+ */
+int
+EstimateCost(ASTNode *root) {
+    CostData accumulated;
+    WalkAST(root, lCostCallbackPre, lCostCallbackPost, &accumulated);
+    return accumulated.cost;
+}
+
+
+/** Records through okPtr that the code being checked isn't safe, and
+    returns false so that WalkAST() doesn't descend below the current
+    node.
+ */
+static inline bool
+lNotSafe(bool *okPtr) {
+    *okPtr = false;
+    return false;
+}
+
+
+/** WalkAST() pre-callback that checks whether it's safe to run the code
+    for the given node if the execution mask happens to be all off.  The
+    data pointer must point to a bool; it is set to false when something
+    that isn't safe is found and is otherwise left untouched.  The return
+    value tells WalkAST() whether to continue on to the node's children.
+ */
+static bool
+lCheckAllOffSafety(ASTNode *node, void *data) {
+    bool *okPtr = static_cast<bool *>(data);
+
+    if (FunctionCallExpr *callExpr = dynamic_cast<FunctionCallExpr *>(node)) {
+        if (callExpr->func == NULL)
+            // There's no callee to inspect; stop descending here, but
+            // leave the result as it is.
+            return false;
+
+        // The callee's type may be a pointer type; if so, the function
+        // type is taken from the pointer's base type.
+        const Type *calleeType = callExpr->func->GetType();
+        if (const PointerType *ptrType = CastType<PointerType>(calleeType))
+            calleeType = ptrType->GetBaseType();
+        const FunctionType *funcType = CastType<FunctionType>(calleeType);
+        Assert(funcType != NULL);
+
+        if (funcType->isSafe == false)
+            return lNotSafe(okPtr);
+    }
+
+    if (dynamic_cast<AssertStmt *>(node) != NULL) {
+        // Running an assert for a varying test would be fine, but we
+        // don't want to be checking an assert on a uniform variable when
+        // all of the lanes are off.
+        return lNotSafe(okPtr);
+    }
+
+    if (dynamic_cast<NewExpr *>(node) != NULL ||
+        dynamic_cast<DeleteStmt *>(node) != NULL) {
+        // The uniform variants of these definitely shouldn't run with the
+        // mask all off, and for the varying variants it's worth skipping
+        // the overhead of executing them in that case.
+        return lNotSafe(okPtr);
+    }
+
+    if (dynamic_cast<ForeachStmt *>(node) != NULL ||
+        dynamic_cast<ForeachActiveStmt *>(node) != NULL ||
+        dynamic_cast<ForeachUniqueStmt *>(node) != NULL ||
+        dynamic_cast<UnmaskedStmt *>(node) != NULL) {
+        // None of the foreach statements should be run with an all-off
+        // mask either: they can re-establish an 'all on' mask, which
+        // would be pretty unintuitive.  (More generally, allowing foreach
+        // in the presence of any non-uniform control flow is possibly a
+        // little strange...)
+        //
+        // Along the same lines, the foreach_unique implementation has as
+        // a precondition that the mask isn't all off going into it, so
+        // that's enforced here as well.
+        return lNotSafe(okPtr);
+    }
+
+    IndexExpr *indexExpr = dynamic_cast<IndexExpr *>(node);
+    if (indexExpr != NULL && indexExpr->baseExpr != NULL) {
+        const Type *baseType = indexExpr->baseExpr->GetType();
+        if (baseType == NULL)
+            return true;
+        if (CastType<ReferenceType>(baseType) != NULL)
+            baseType = baseType->GetReferenceTarget();
+
+        // Indexing with anything other than a constant -> not safe
+        ConstExpr *constIndex = dynamic_cast<ConstExpr *>(indexExpr->index);
+        if (constIndex == NULL)
+            return lNotSafe(okPtr);
+
+        // pointer[index] -> we can't be sure -> not safe
+        if (CastType<PointerType>(baseType) != NULL)
+            return lNotSafe(okPtr);
+
+        const SequentialType *seqType = CastType<SequentialType>(baseType);
+        Assert(seqType != NULL);
+        const int nElements = seqType->GetElementCount();
+        if (nElements == 0)
+            // Unsized array, so we can't be sure -> not safe
+            return lNotSafe(okPtr);
+
+        // Every one of the constant index values must be in bounds.
+        int32_t indices[ISPC_MAX_NVEC];
+        const int count = constIndex->AsInt32(indices);
+        for (int i = 0; i < count; ++i) {
+            const bool inBounds = (indices[i] >= 0 && indices[i] < nElements);
+            if (!inBounds)
+                return lNotSafe(okPtr);
+        }
+        return true;
+    }
+
+    // Member expressions that have their dereferenceExpr flag set -> not
+    // safe
+    MemberExpr *memberExpr = dynamic_cast<MemberExpr *>(node);
+    if (memberExpr != NULL && memberExpr->dereferenceExpr)
+        return lNotSafe(okPtr);
+
+    // Pointer dereferences -> not safe
+    if (dynamic_cast<PtrDerefExpr *>(node) != NULL)
+        return lNotSafe(okPtr);
+
+    return true;
+}
+
+
+/** Returns true unless lCheckAllOffSafety() flags a node in the AST
+    rooted at the given node as being unsafe to run with the execution
+    mask all off.
+ */
+bool
+SafeToRunWithMaskAllOff(ASTNode *root) {
+    bool isSafe = true;
+    WalkAST(root, lCheckAllOffSafety, NULL, &isSafe);
+    return isSafe;
+}
--- a/ast.h
+++ b/ast.h
@@ -0,0 +1,150 @@
+/*
+  Copyright (c) 2011-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
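+/** @file ast.h
+    @brief Declarations of the ASTNode and AST classes and of the utility
+           functions for walking, optimizing, and type-checking ASTs.
+*/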
+
+#ifndef ISPC_AST_H
+#define ISPC_AST_H 1
+
+#include "ispc.h"
+#include <vector>
+
+/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
+
+    This class defines a basic interface that all abstract syntax tree
+    (AST) nodes must implement.  The base classes for both expressions
+    (Expr) and statements (Stmt) inherit from this class.
+*/
+class ASTNode {
+public:
+    /** Constructs a node, recording the given source position in
+        ASTNode::pos. */
+    ASTNode(SourcePos p) : pos(p) { }
+    virtual ~ASTNode();
+
+    /** The Optimize() method should perform any appropriate early-stage
+        optimizations on the node (e.g. constant folding).  This method
+        will be called after the node's children have already been
+        optimized, and the caller will store the returned ASTNode * in
+        place of the original node.  This method should return NULL if an
+        error is encountered during optimization. */
+    virtual ASTNode *Optimize() = 0;
+
+    /** Type checking should be performed by the node when this method is
+        called.  In the event of an error, a NULL value may be returned.
+        As with ASTNode::Optimize(), the caller should store the returned
+        pointer in place of the original ASTNode *. */
+    virtual ASTNode *TypeCheck() = 0;
+
+    /** Estimate the execution cost of the node (not including the cost of
+        the children).  The value returned should be based on the COST_*
+        enumerant values defined in ispc.h. */
+    virtual int EstimateCost() const = 0;
+
+    /** All AST nodes must track the file position where they are
+        defined. */
+    SourcePos pos;
+};
+
+
+/** Simple representation of the abstract syntax trees for all of the
+    functions declared in a compilation unit.
+ */
+class AST {
+public:
+    /** Add the AST for a function described by the given declaration
+        information and source code. */
+    void AddFunction(Symbol *sym, Stmt *code);
+
+    /** Generate LLVM IR for all of the functions into the current
+        module. */
+    void GenerateIR();
+
+private:
+    /** Function objects held by this AST.  NOTE(review): presumably one
+        entry is appended per AddFunction() call; confirm against the
+        definition in ast.cpp. */
+    std::vector<Function *> functions;
+};
+
+
+/** Callback function type for the preorder traversal visiting function
+    of the AST walk.
+ */
+typedef bool (* ASTPreCallBackFunc)(ASTNode *node, void *data);
+
+/** Callback function type for the postorder traversal visiting function
+    of the AST walk.
+ */
+typedef ASTNode * (* ASTPostCallBackFunc)(ASTNode *node, void *data);
+
+/** Walk (some portion of) an AST, starting from the given root node.  At
+    each node, if preFunc is non-NULL, call it, passing the given void
+    *data pointer; if the call to preFunc function returns false, then the
+    children of the node aren't visited.  This function then makes
+    recursive calls to WalkAST() to process the node's children; after
+    doing so, calls postFunc, at the node.  The return value from the
+    postFunc call is ignored. */
+extern ASTNode *WalkAST(ASTNode *root, ASTPreCallBackFunc preFunc,
+                        ASTPostCallBackFunc postFunc, void *data);
+
+/** Perform simple optimizations on the AST or portion thereof passed to
+    this function, returning the resulting AST. */
+extern ASTNode *Optimize(ASTNode *root);
+
+/** Convenience version of Optimize() for Expr *s that returns an Expr *
+    (rather than an ASTNode *, which would require the caller to cast back
+    to an Expr *). */ 
+extern Expr *Optimize(Expr *);
+
+/** Convenience version of Optimize() for Stmt *s that returns a Stmt *
+    (rather than an ASTNode *, which would require the caller to cast back
+    to a Stmt *). */ 
+extern Stmt *Optimize(Stmt *);
+
+/** Perform type-checking on the given AST (or portion of one), returning a
+    pointer to the root of the resulting AST. */
+extern ASTNode *TypeCheck(ASTNode *root);
+
+/** Convenience version of TypeCheck() for Expr *s that returns an Expr *. */
+extern Expr *TypeCheck(Expr *);
+
+/** Convenience version of TypeCheck() for Stmt *s that returns a Stmt *. */
+extern Stmt *TypeCheck(Stmt *);
+
+/** Returns an estimate of the execution cost of the tree starting at
+    the given root. */
+extern int EstimateCost(ASTNode *root);
+
+/** Returns true if it would be safe to run the given code with an "all
+    off" mask. */ 
+extern bool SafeToRunWithMaskAllOff(ASTNode *root);
+
+#endif // ISPC_AST_H
--- a/bitcode2cpp.py
+++ b/bitcode2cpp.py
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+
+import sys
+import string
+import re
+import subprocess
+import platform
+import os
+
+length=0
+
+src=str(sys.argv[1])
+
+target = re.sub("builtins/target-", "", src)
+target = re.sub(r"builtins\\target-", "", target)
+target = re.sub("builtins/", "", target)
+target = re.sub(r"builtins\\", "", target)
+target = re.sub("\.ll$", "", target)
+target = re.sub("\.c$", "", target)
+target = re.sub("-", "_", target)
+
+llvm_as="llvm-as"
+if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT") != -1:
+    llvm_as = os.getenv("LLVM_INSTALL_DIR").replace("\\", "/") + "/bin/" + llvm_as
+
+try:
+    as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
+except IOError:
+    sys.stderr.write("Couldn't open " + src)
+    sys.exit(1)
+
+width = 16;
+sys.stdout.write("unsigned char builtins_bitcode_" + target + "[] = {\n")
+
+data = as_out.stdout.read()
+for i in range(0, len(data), 1):
+        sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))
+
+        if i%width == (width-1):
+            sys.stdout.write("\n")
+
+sys.stdout.write("0x00 };\n\n")
+sys.stdout.write("int builtins_bitcode_" + target + "_length = " + str(i+1) + ";\n")
+
+as_out.wait()
+
+sys.exit(as_out.returncode)
--- a/buildall.bat
+++ b/buildall.bat
@@ -0,0 +1,15 @@
+@echo off
+
+REM If LLVM_INSTALL_DIR isn't set globally in your environment, the
+REM default below is used; edit it to match your LLVM install location.
+REM A value that is already set in the environment is left untouched.
+if not defined LLVM_INSTALL_DIR set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
+
+REM Both the LLVM binaries and python need to be in the path
+set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
+
+REM Build the compiler first, then rebuild every examples configuration.
+msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
+
+msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
+msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild
+msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild
+msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Debug /t:rebuild
--- a/buildispc.bat
+++ b/buildispc.bat
@@ -0,0 +1,11 @@
+@echo off
+
+REM If LLVM_INSTALL_DIR isn't set globally in your environment,
+REM it can be set here by uncommenting and editing the lines below:
+REM set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
+REM set LLVM_VERSION=3.2
+
+REM Both the LLVM binaries and python need to be in the path
+set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
+
+msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -0,0 +1,965 @@
+/*
+  Copyright (c) 2010-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file builtins.cpp
+    @brief Definitions of functions related to setting up the standard library 
+           and other builtins.
+*/
+
+#include "builtins.h"
+#include "type.h"
+#include "util.h"
+#include "sym.h"
+#include "expr.h"
+#include "llvmutil.h"
+#include "module.h"
+#include "ctx.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <llvm/LLVMContext.h>
+#if !defined(LLVM_3_0) && !defined(LLVM_3_1)
+  #include <llvm/Attributes.h>
+#endif
+#include <llvm/Module.h>
+#include <llvm/Type.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/Instructions.h>
+#include <llvm/Intrinsics.h>
+#include <llvm/Linker.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/ADT/Triple.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+
+extern int yyparse();
+struct yy_buffer_state;
+extern yy_buffer_state *yy_scan_string(const char *);
+
+
+/** Map an LLVM type to the corresponding ispc type, if there is one.
+
+    LLVM's types carry less information than ispc's (for example, LLVM
+    doesn't distinguish between signed and unsigned integers), so the
+    mapping is under-constrained.  Since this function is only used to
+    generate ispc declarations for functions defined in LLVM bitcode in the
+    builtins-*.ll files, covering the cases below is enough in practice.
+
+    @param t             LLVM type to translate
+    @param intAsUnsigned If true, LLVM integer types (and pointers to
+                         them) map to the unsigned ispc types; otherwise
+                         to the signed ones
+    @return The equivalent ispc type, or NULL if the type isn't one of the
+            handled cases
+ */
+static const Type *
+lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
+    if (t == LLVMTypes::VoidType)
+        return AtomicType::Void;
+
+    // Uniform types
+    if (t == LLVMTypes::BoolType)
+        return AtomicType::UniformBool;
+    if (t == LLVMTypes::Int8Type)
+        return intAsUnsigned ? AtomicType::UniformUInt8 : AtomicType::UniformInt8;
+    if (t == LLVMTypes::Int16Type)
+        return intAsUnsigned ? AtomicType::UniformUInt16 : AtomicType::UniformInt16;
+    if (t == LLVMTypes::Int32Type)
+        return intAsUnsigned ? AtomicType::UniformUInt32 : AtomicType::UniformInt32;
+    if (t == LLVMTypes::FloatType)
+        return AtomicType::UniformFloat;
+    if (t == LLVMTypes::DoubleType)
+        return AtomicType::UniformDouble;
+    if (t == LLVMTypes::Int64Type)
+        return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
+
+    // Varying types.  The mask type is only reported as a varying bool
+    // when it is distinct from the i32 vector type; when the two are the
+    // same LLVM type, the i32 vector case below wins.
+    if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType &&
+        t == LLVMTypes::MaskType)
+        return AtomicType::VaryingBool;
+    if (t == LLVMTypes::Int8VectorType)
+        return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
+    if (t == LLVMTypes::Int16VectorType)
+        return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16;
+    if (t == LLVMTypes::Int32VectorType)
+        return intAsUnsigned ? AtomicType::VaryingUInt32 : AtomicType::VaryingInt32;
+    if (t == LLVMTypes::FloatVectorType)
+        return AtomicType::VaryingFloat;
+    if (t == LLVMTypes::DoubleVectorType)
+        return AtomicType::VaryingDouble;
+    if (t == LLVMTypes::Int64VectorType)
+        return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64;
+
+    // Uniform pointers to uniform types
+    if (t == LLVMTypes::Int8PointerType)
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt8 :
+                                       AtomicType::UniformInt8);
+    if (t == LLVMTypes::Int16PointerType)
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt16 :
+                                       AtomicType::UniformInt16);
+    if (t == LLVMTypes::Int32PointerType)
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt32 :
+                                       AtomicType::UniformInt32);
+    if (t == LLVMTypes::Int64PointerType)
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt64 :
+                                       AtomicType::UniformInt64);
+    if (t == LLVMTypes::FloatPointerType)
+        return PointerType::GetUniform(AtomicType::UniformFloat);
+    if (t == LLVMTypes::DoublePointerType)
+        return PointerType::GetUniform(AtomicType::UniformDouble);
+
+    // Uniform pointers to varying types
+    if (t == LLVMTypes::Int8VectorPointerType)
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt8 :
+                                       AtomicType::VaryingInt8);
+    if (t == LLVMTypes::Int16VectorPointerType)
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt16 :
+                                       AtomicType::VaryingInt16);
+    if (t == LLVMTypes::Int32VectorPointerType)
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt32 :
+                                       AtomicType::VaryingInt32);
+    if (t == LLVMTypes::Int64VectorPointerType)
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt64 :
+                                       AtomicType::VaryingInt64);
+    if (t == LLVMTypes::FloatVectorPointerType)
+        return PointerType::GetUniform(AtomicType::VaryingFloat);
+    if (t == LLVMTypes::DoubleVectorPointerType)
+        return PointerType::GetUniform(AtomicType::VaryingDouble);
+
+    // Nothing matched: not representable.
+    return NULL;
+}
+
+
+/** Create an ispc function symbol with the given name and signature,
+    attach the given LLVM function to it, and add it to the symbol table.
+    The ftype parameter is accepted but not used here.
+ */
+static void
+lCreateSymbol(const std::string &name, const Type *returnType, 
+              llvm::SmallVector<const Type *, 8> &argTypes, 
+              const llvm::FunctionType *ftype, llvm::Function *func, 
+              SymbolTable *symbolTable) {
+    // Builtins have no position in user source; label them "__stdlib".
+    SourcePos stdlibPos;
+    stdlibPos.name = "__stdlib";
+
+    FunctionType *ispcFuncType = new FunctionType(returnType, argTypes,
+                                                  stdlibPos);
+    Debug(stdlibPos, "Created builtin symbol \"%s\" [%s]\n", name.c_str(),
+          ispcFuncType->GetString().c_str());
+
+    Symbol *builtinSym = new Symbol(name, stdlibPos, ispcFuncType);
+    builtinSym->function = func;
+    symbolTable->AddFunction(builtinSym);
+}
+
+
+/** Given an LLVM function declaration, synthesize the equivalent ispc
+    symbol(s) for the function (if possible) and add them to the symbol
+    table.  Only functions whose names are at least three characters long
+    and start with "__" are considered.
+
+    @param func        LLVM function to create ispc symbol(s) for
+    @param symbolTable Symbol table that receives the new symbol(s)
+    @return true on success; false if the name isn't eligible or if the
+            return type or any parameter type has no ispc equivalent
+ */
+static bool
+lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
+    const llvm::FunctionType *ftype = func->getFunctionType();
+    std::string name = func->getName();
+
+    if (name.size() < 3 || name[0] != '_' || name[1] != '_')
+        return false;
+
+    Debug(SourcePos(), "Attempting to create ispc symbol for function \"%s\".",
+          name.c_str());
+
+    // An unfortunate hack: we want this builtin function to have the
+    // signature "int __sext_varying_bool(bool)", but the ispc function
+    // symbol creation code below assumes that any LLVM vector of i32s is a
+    // varying int32.  Here, we need that to be interpreted as a varying
+    // bool, so just have a one-off override for that one...
+    if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") {
+        llvm::SmallVector<const Type *, 8> argTypes;
+        argTypes.push_back(AtomicType::VaryingBool);
+
+        // Go through the same helper as the general case below rather
+        // than duplicating the symbol creation code.
+        lCreateSymbol(name, AtomicType::VaryingInt32, argTypes, ftype, func,
+                      symbolTable);
+        return true;
+    }
+
+    // If the function has any parameters with integer types, we'll make
+    // two Symbols for two overloaded versions of the function, one with
+    // all of the integer types treated as signed integers and one with all
+    // of them treated as unsigned.
+    for (int i = 0; i < 2; ++i) {
+        bool intAsUnsigned = (i == 1);
+
+        const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType(),
+                                                     intAsUnsigned);
+        if (returnType == NULL) {
+            Debug(SourcePos(), "Failed: return type not representable for "
+                  "builtin %s.", name.c_str());
+            // return type not representable in ispc -> not callable from ispc
+            return false;
+        }
+
+        // Iterate over the arguments and try to find their equivalent ispc
+        // types.  Track if any of the arguments has an integer type.
+        bool anyIntArgs = false;
+        llvm::SmallVector<const Type *, 8> argTypes;
+        for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
+            const llvm::Type *llvmArgType = ftype->getParamType(j);
+            const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
+            if (type == NULL) {
+                // j is unsigned, so it must be printed with %u.
+                Debug(SourcePos(), "Failed: type of parameter %u not "
+                      "representable for builtin %s", j, name.c_str());
+                return false;
+            }
+            // A parameter counts as an integer one if its ispc type
+            // changes when the signedness interpretation is flipped.
+            anyIntArgs |= 
+                (Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
+            argTypes.push_back(type);
+        }
+
+        // Always create the symbol the first time through, in particular
+        // so that we get symbols for things with no integer types!
+        if (i == 0 || anyIntArgs == true)
+            lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
+    }
+
+    return true;
+}
+
+
+/** Visit every function in the given LLVM module and try to create the
+    corresponding ispc symbol(s) in the given symbol table.  Functions for
+    which no symbol can be created are silently skipped.
+ */
+static void
+lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
+#if 0
+    // FIXME: handle globals?
+    Assert(module->global_empty());
+#endif
+
+    for (llvm::Module::iterator fi = module->begin(); fi != module->end();
+         ++fi) {
+        llvm::Function *function = fi;
+        // The result is deliberately not checked: a function without an
+        // ispc equivalent just doesn't get a symbol.
+        lCreateISPCSymbol(function, symbolTable);
+    }
+}
+
+
+/** Many of the builtins-*.ll files declare LLVM intrinsics that are then
+    used to implement target-specific functions.  This function walks all
+    of the functions in the module and, for each intrinsic declaration it
+    checks, asserts that the signature in our .ll file matches the
+    signature of the actual intrinsic.
+*/
+static void
+lCheckModuleIntrinsics(llvm::Module *module) {
+    static const char x86Prefix[] = "llvm.x86.";
+    const size_t x86PrefixLength = sizeof(x86Prefix) - 1;
+
+    for (llvm::Module::iterator fi = module->begin(); fi != module->end();
+         ++fi) {
+        llvm::Function *func = fi;
+        if (!func->isIntrinsic())
+            continue;
+
+        // Work around http://llvm.org/bugs/show_bug.cgi?id=10438; only
+        // check the llvm.x86.* intrinsics for now...
+        const std::string funcName = func->getName().str();
+        if (funcName.compare(0, x86PrefixLength, x86Prefix) != 0)
+            continue;
+
+        llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
+        Assert(id != 0);
+
+        // The function's own type is a pointer to its function type, so
+        // wrap the intrinsic's type the same way before comparing.
+        llvm::Type *expectedType = llvm::Intrinsic::getType(*g->ctx, id);
+        expectedType = llvm::PointerType::get(expectedType, 0);
+        Assert(func->getType() == expectedType);
+    }
+}
+
+
+/** Ideally all of the functions listed below would be declared 'internal'
+    in their respective bitcode files, so that they are eliminated from the
+    final output when the user's program doesn't need them.  However, if
+    they are, the LinkModules() call doesn't bring them in, since nothing
+    in the module they're being linked with uses them yet (in LLVM 3.1, at
+    least).
+
+    So instead they aren't internal when first defined, and this function
+    switches them to internal linkage after linking.  This is admittedly a
+    kludge.
+ */
+static void
+lSetInternalFunctions(llvm::Module *module) {
+    const char *names[] = {
+        "__add_float",
+        "__add_int32",
+        "__add_uniform_double",
+        "__add_uniform_int32",
+        "__add_uniform_int64",
+        "__add_varying_double",
+        "__add_varying_int32",
+        "__add_varying_int64",
+        "__all",
+        "__any",
+        "__aos_to_soa3_float",
+        "__aos_to_soa3_float16",
+        "__aos_to_soa3_float4",
+        "__aos_to_soa3_float8",
+        "__aos_to_soa3_int32",
+        "__aos_to_soa4_float",
+        "__aos_to_soa4_float16",
+        "__aos_to_soa4_float4",
+        "__aos_to_soa4_float8",
+        "__aos_to_soa4_int32",
+        "__atomic_add_int32_global",
+        "__atomic_add_int64_global",
+        "__atomic_add_uniform_int32_global",
+        "__atomic_add_uniform_int64_global",
+        "__atomic_and_int32_global",
+        "__atomic_and_int64_global",
+        "__atomic_and_uniform_int32_global",
+        "__atomic_and_uniform_int64_global",
+        "__atomic_compare_exchange_double_global",
+        "__atomic_compare_exchange_float_global",
+        "__atomic_compare_exchange_int32_global",
+        "__atomic_compare_exchange_int64_global",
+        "__atomic_compare_exchange_uniform_double_global",
+        "__atomic_compare_exchange_uniform_float_global",
+        "__atomic_compare_exchange_uniform_int32_global",
+        "__atomic_compare_exchange_uniform_int64_global",
+        "__atomic_max_uniform_int32_global",
+        "__atomic_max_uniform_int64_global",
+        "__atomic_min_uniform_int32_global",
+        "__atomic_min_uniform_int64_global",
+        "__atomic_or_int32_global",
+        "__atomic_or_int64_global",
+        "__atomic_or_uniform_int32_global",
+        "__atomic_or_uniform_int64_global",
+        "__atomic_sub_int32_global",
+        "__atomic_sub_int64_global",
+        "__atomic_sub_uniform_int32_global",
+        "__atomic_sub_uniform_int64_global",
+        "__atomic_swap_double_global",
+        "__atomic_swap_float_global",
+        "__atomic_swap_int32_global",
+        "__atomic_swap_int64_global",
+        "__atomic_swap_uniform_double_global",
+        "__atomic_swap_uniform_float_global",
+        "__atomic_swap_uniform_int32_global",
+        "__atomic_swap_uniform_int64_global",
+        "__atomic_umax_uniform_uint32_global",
+        "__atomic_umax_uniform_uint64_global",
+        "__atomic_umin_uniform_uint32_global",
+        "__atomic_umin_uniform_uint64_global",
+        "__atomic_xor_int32_global",
+        "__atomic_xor_int64_global",
+        "__atomic_xor_uniform_int32_global",
+        "__atomic_xor_uniform_int64_global",
+        "__broadcast_double",
+        "__broadcast_float",
+        "__broadcast_i16",
+        "__broadcast_i32",
+        "__broadcast_i64",
+        "__broadcast_i8",
+        "__ceil_uniform_double",
+        "__ceil_uniform_float",
+        "__ceil_varying_double",
+        "__ceil_varying_float",
+        "__clock",
+        "__count_trailing_zeros_i32",
+        "__count_trailing_zeros_i64",
+        "__count_leading_zeros_i32",
+        "__count_leading_zeros_i64",
+        "__delete_uniform",
+        "__delete_varying",
+        "__do_assert_uniform",
+        "__do_assert_varying",
+        "__do_print",
+        "__doublebits_uniform_int64",
+        "__doublebits_varying_int64",
+        "__exclusive_scan_add_double",
+        "__exclusive_scan_add_float",
+        "__exclusive_scan_add_i32",
+        "__exclusive_scan_add_i64",
+        "__exclusive_scan_and_i32",
+        "__exclusive_scan_and_i64",
+        "__exclusive_scan_or_i32",
+        "__exclusive_scan_or_i64",
+        "__extract_int16",
+        "__extract_int32",
+        "__extract_int64",
+        "__extract_int8",
+        "__fastmath",
+        "__float_to_half_uniform",
+        "__float_to_half_varying",
+        "__floatbits_uniform_int32",
+        "__floatbits_varying_int32",
+        "__floor_uniform_double",
+        "__floor_uniform_float",
+        "__floor_varying_double",
+        "__floor_varying_float",
+        "__get_system_isa",
+        "__half_to_float_uniform",
+        "__half_to_float_varying",
+        "__insert_int16",
+        "__insert_int32",
+        "__insert_int64",
+        "__insert_int8",
+        "__intbits_uniform_double",
+        "__intbits_uniform_float",
+        "__intbits_varying_double",
+        "__intbits_varying_float",
+        "__max_uniform_double",
+        "__max_uniform_float",
+        "__max_uniform_int32",
+        "__max_uniform_int64",
+        "__max_uniform_uint32",
+        "__max_uniform_uint64",
+        "__max_varying_double",
+        "__max_varying_float",
+        "__max_varying_int32",
+        "__max_varying_int64",
+        "__max_varying_uint32",
+        "__max_varying_uint64",
+        "__memory_barrier",
+        "__memcpy32",
+        "__memcpy64",
+        "__memmove32",
+        "__memmove64",
+        "__memset32",
+        "__memset64",
+        "__min_uniform_double",
+        "__min_uniform_float",
+        "__min_uniform_int32",
+        "__min_uniform_int64",
+        "__min_uniform_uint32",
+        "__min_uniform_uint64",
+        "__min_varying_double",
+        "__min_varying_float",
+        "__min_varying_int32",
+        "__min_varying_int64",
+        "__min_varying_uint32",
+        "__min_varying_uint64",
+        "__movmsk",
+        "__new_uniform",
+        "__new_varying32",
+        "__new_varying64",
+        "__none",
+        "__num_cores",
+        "__packed_load_active",
+        "__packed_store_active",
+        "__pause",
+        "__popcnt_int32",
+        "__popcnt_int64",
+        "__prefetch_read_uniform_1",
+        "__prefetch_read_uniform_2",
+        "__prefetch_read_uniform_3",
+        "__prefetch_read_uniform_nt",
+        "__rcp_uniform_float",
+        "__rcp_varying_float",
+        "__rdrand_i16",
+        "__rdrand_i32",
+        "__rdrand_i64",
+        "__reduce_add_double",
+        "__reduce_add_float",
+        "__reduce_add_int32",
+        "__reduce_add_int64",
+        "__reduce_equal_double",
+        "__reduce_equal_float",
+        "__reduce_equal_int32",
+        "__reduce_equal_int64",
+        "__reduce_max_double",
+        "__reduce_max_float",
+        "__reduce_max_int32",
+        "__reduce_max_int64",
+        "__reduce_max_uint32",
+        "__reduce_max_uint64",
+        "__reduce_min_double",
+        "__reduce_min_float",
+        "__reduce_min_int32",
+        "__reduce_min_int64",
+        "__reduce_min_uint32",
+        "__reduce_min_uint64",
+        "__rotate_double",
+        "__rotate_float",
+        "__rotate_i16",
+        "__rotate_i32",
+        "__rotate_i64",
+        "__rotate_i8",
+        "__round_uniform_double",
+        "__round_uniform_float",
+        "__round_varying_double",
+        "__round_varying_float",
+        "__rsqrt_uniform_float",
+        "__rsqrt_varying_float",
+        "__set_system_isa",
+        "__sext_uniform_bool",
+        "__sext_varying_bool",
+        "__shuffle2_double",
+        "__shuffle2_float",
+        "__shuffle2_i16",
+        "__shuffle2_i32",
+        "__shuffle2_i64",
+        "__shuffle2_i8",
+        "__shuffle_double",
+        "__shuffle_float",
+        "__shuffle_i16",
+        "__shuffle_i32",
+        "__shuffle_i64",
+        "__shuffle_i8",
+        "__soa_to_aos3_float",
+        "__soa_to_aos3_float16",
+        "__soa_to_aos3_float4",
+        "__soa_to_aos3_float8",
+        "__soa_to_aos3_int32",
+        "__soa_to_aos4_float",
+        "__soa_to_aos4_float16",
+        "__soa_to_aos4_float4",
+        "__soa_to_aos4_float8",
+        "__soa_to_aos4_int32",
+        "__sqrt_uniform_double",
+        "__sqrt_uniform_float",
+        "__sqrt_varying_double",
+        "__sqrt_varying_float",
+        "__stdlib_acosf",
+        "__stdlib_asinf",
+        "__stdlib_atan",
+        "__stdlib_atan2",
+        "__stdlib_atan2f",
+        "__stdlib_atanf",
+        "__stdlib_cos",
+        "__stdlib_cosf",
+        "__stdlib_exp",
+        "__stdlib_expf",
+        "__stdlib_log",
+        "__stdlib_logf",
+        "__stdlib_pow",
+        "__stdlib_powf",
+        "__stdlib_sin",
+        "__stdlib_sincos",
+        "__stdlib_sincosf",
+        "__stdlib_sinf",
+        "__stdlib_tan",
+        "__stdlib_tanf",
+        "__svml_sin",
+        "__svml_cos",
+        "__svml_sincos",
+        "__svml_tan",
+        "__svml_atan",
+        "__svml_atan2",
+        "__svml_exp",
+        "__svml_log",
+        "__svml_pow",
+        "__undef_uniform",
+        "__undef_varying",
+        "__vec4_add_float",
+        "__vec4_add_int32",
+        "__vselect_float",
+        "__vselect_i32",
+    };
+
+    const int nNames = sizeof(names) / sizeof(names[0]);
+    for (int idx = 0; idx < nNames; ++idx) {
+        llvm::Function *fn = module->getFunction(names[idx]);
+        // Names that aren't in this module, and functions for which
+        // empty() reports true, keep their current linkage.
+        if (fn == NULL || fn->empty())
+            continue;
+        fn->setLinkage(llvm::GlobalValue::InternalLinkage);
+    }
+}
+
+
+/** This utility function takes serialized binary LLVM bitcode and adds its
+    definitions to the given module.  Functions in the bitcode that can be
+    mapped to ispc functions are also added to the symbol table.
+
+    If the bitcode can't be parsed, an error is reported and the module is
+    left untouched.  The caller's bitcode array is not freed here.
+
+    @param bitcode     Binary LLVM bitcode (e.g. the contents of a *.bc file)
+    @param length      Length of the bitcode buffer
+    @param module      Module to link the bitcode into
+    @param symbolTable Symbol table to add definitions to; if NULL, no
+                       symbols are added
+ */
+void
+AddBitcodeToModule(const unsigned char *bitcode, int length,
+                   llvm::Module *module, SymbolTable *symbolTable) {
+    std::string bcErr;
+    llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
+    llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
+    llvm::Module *bcModule = llvm::ParseBitcodeFile(bcBuf, *g->ctx, &bcErr);
+    // ParseBitcodeFile() reads the whole module eagerly and never takes
+    // ownership of the MemoryBuffer, so release the wrapper object here
+    // rather than leaking one per call.
+    delete bcBuf;
+    if (!bcModule) {
+        Error(SourcePos(), "Error parsing stdlib bitcode: %s", bcErr.c_str());
+        return;
+    }
+
+    // FIXME: this feels like a bad idea, but the issue is that when we
+    // set the llvm::Module's target triple in the ispc Module::Module
+    // constructor, we start by calling llvm::sys::getHostTriple() (and
+    // then change the arch if needed).  Somehow that ends up giving us
+    // strings like 'x86_64-apple-darwin11.0.0', while the stuff we
+    // compile to bitcode with clang has module triples like
+    // 'i386-apple-macosx10.7.0'.  And then LLVM issues a warning about
+    // linking together modules with incompatible target triples..
+    llvm::Triple mTriple(m->module->getTargetTriple());
+    llvm::Triple bcTriple(bcModule->getTargetTriple());
+    Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
+           mTriple.getArch() == bcTriple.getArch());
+    Assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
+           mTriple.getVendor() == bcTriple.getVendor());
+    bcModule->setTargetTriple(mTriple.str());
+
+    // This is also suboptimal; LLVM issues a warning about linking
+    // modules with different datalayouts, due to things like
+    // builtins-c.c having the regular IA layout, but the generic
+    // targets having a layout with 16-bit alignment for 16xi1 vectors.
+    // As long as builtins-c.c doesn't have any 16xi1 vector types
+    // (which it shouldn't!), then this override is safe.
+    if (g->target.isa == Target::GENERIC)
+        bcModule->setDataLayout(module->getDataLayout());
+
+    // A link failure is reported but is not fatal here: the steps below
+    // still run on whatever ended up in the destination module.
+    std::string linkError;
+    if (llvm::Linker::LinkModules(module, bcModule,
+                                  llvm::Linker::DestroySource,
+                                  &linkError))
+        Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
+    lSetInternalFunctions(module);
+    if (symbolTable != NULL)
+        lAddModuleSymbols(module, symbolTable);
+    lCheckModuleIntrinsics(module);
+}
+
+
+/** Utility routine that defines a constant int32 with the given value and
+    registers it with both the ispc symbol table and the given LLVM module.
+    If m->diBuilder is non-NULL, a matching debug-info global variable
+    entry is created as well.
+ */
+static void
+lDefineConstantInt(const char *name, int val, llvm::Module *module,
+                   SymbolTable *symbolTable) {
+    Symbol *constSym =
+        new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType(),
+                   SC_STATIC);
+    constSym->constValue = new ConstExpr(constSym->type, val, SourcePos());
+
+    // When debugging symbols are generated, WeakODRLinkage is used in
+    // place of InternalLinkage so that a definition survives even if it's
+    // not used in the module, and the symbol is there in the debugger.
+    llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::InternalLinkage;
+    if (g->generateDebuggingSymbols)
+        linkage = llvm::GlobalValue::WeakODRLinkage;
+
+    llvm::Type *int32Type = LLVMTypes::Int32Type;
+    llvm::Constant *initializer = LLVMInt32(val);
+    constSym->storagePtr = new llvm::GlobalVariable(*module, int32Type, true,
+                                                    linkage, initializer, name);
+    symbolTable->AddVariable(constSym);
+
+    // Nothing more to do unless debug info is being emitted.
+    if (m->diBuilder == NULL)
+        return;
+
+    llvm::DIFile file;
+    llvm::DIType diType = constSym->type->GetDIType(file);
+    Assert(diType.Verify());
+    // FIXME? DWARF says that this (and programIndex below) should
+    // have the DW_AT_artifical attribute.  It's not clear if this
+    // matters for anything though.
+    llvm::DIGlobalVariable var =
+        m->diBuilder->createGlobalVariable(name, file, 0 /* line */, diType,
+                                           true /* static */,
+                                           constSym->storagePtr);
+    Assert(var.Verify());
+}
+
+
+
+/** Gives the already-declared LLVM function with the given name a body
+    that just returns the constant val, marks it always-inline, and adds a
+    corresponding symbol to the symbol table.
+ */
+static void
+lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
+                       SymbolTable *symbolTable) {
+    // ispc-side type for the symbol: built from UniformInt32 and an empty
+    // parameter list.
+    llvm::SmallVector<const Type *, 8> noParams;
+    FunctionType *funcType =
+        new FunctionType(AtomicType::UniformInt32, noParams, SourcePos());
+    Symbol *funcSym = new Symbol(name, SourcePos(), funcType, SC_STATIC);
+
+    llvm::Function *func = module->getFunction(name);
+    Assert(func != NULL); // it should be declared already...
+#if !defined(LLVM_3_0) && !defined(LLVM_3_1)
+    func->addFnAttr(llvm::Attributes::AlwaysInline);
+#else
+    func->addFnAttr(llvm::Attribute::AlwaysInline);
+#endif
+
+    // The whole body is a single "entry" block holding "ret i32 val".
+    llvm::BasicBlock *entryBlock =
+        llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
+    llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), entryBlock);
+
+    funcSym->function = func;
+    symbolTable->AddVariable(funcSym);
+}
+
+
+
+/** Defines the 'programIndex' builtin: a constant varying int32 whose
+    i'th lane holds the value i.  The symbol is added to the symbol table
+    and a matching global is created in the given module; if m->diBuilder
+    is non-NULL a debug-info entry is created too.
+ */
+static void
+lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
+    Symbol *indexSym =
+        new Symbol("programIndex", SourcePos(),
+                   AtomicType::VaryingInt32->GetAsConstType(), SC_STATIC);
+
+    // Only the first g->target.vectorWidth entries are filled in.
+    int laneIds[ISPC_MAX_NVEC];
+    for (int lane = 0; lane < g->target.vectorWidth; ++lane)
+        laneIds[lane] = lane;
+    indexSym->constValue = new ConstExpr(indexSym->type, laneIds, SourcePos());
+
+    // See comment in lDefineConstantInt() for why WeakODRLinkage is used here
+    llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::InternalLinkage;
+    if (g->generateDebuggingSymbols)
+        linkage = llvm::GlobalValue::WeakODRLinkage;
+
+    llvm::Type *vectorType = LLVMTypes::Int32VectorType;
+    llvm::Constant *initializer = LLVMInt32Vector(laneIds);
+    indexSym->storagePtr =
+        new llvm::GlobalVariable(*module, vectorType, true, linkage,
+                                 initializer, indexSym->name.c_str());
+    symbolTable->AddVariable(indexSym);
+
+    // Nothing more to do unless debug info is being emitted.
+    if (m->diBuilder == NULL)
+        return;
+
+    llvm::DIFile file;
+    llvm::DIType diType = indexSym->type->GetDIType(file);
+    Assert(diType.Verify());
+    llvm::DIGlobalVariable var =
+        m->diBuilder->createGlobalVariable(indexSym->name.c_str(), file,
+                                           0 /* line */, diType,
+                                           false /* static */,
+                                           indexSym->storagePtr);
+    Assert(var.Verify());
+}
+
+
+/** Populates the module and symbol table with the builtins: the compiled
+    C builtins bitcode (32- or 64-bit flavor), the bitcode matching the
+    current target ISA and vector width, the programCount/programIndex
+    builtins and the __math_lib / __have_native_* constants.  If
+    includeStdlibISPC is set, the serialized stdlib source is parsed as
+    well.  (The ctx parameter is not referenced in this function body.)
+ */
+void
+DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
+             bool includeStdlibISPC) {
+    // Add the definitions from the compiled builtins-c.c file
+    if (g->target.is32Bit) {
+        extern unsigned char builtins_bitcode_c_32[];
+        extern int builtins_bitcode_c_32_length;
+        AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
+                           module, symbolTable);
+    }
+    else {
+        extern unsigned char builtins_bitcode_c_64[];
+        extern int builtins_bitcode_c_64_length;
+        AddBitcodeToModule(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
+                           module, symbolTable);
+    }
+
+    // Next, pick the target's custom implementations of the various needed
+    // builtin functions (e.g. __masked_store_32(), etc).  The matching
+    // bitcode array is selected first and linked in with a single call
+    // after the switch.
+    const unsigned char *targetBitcode = NULL;
+    int targetBitcodeLength = 0;
+    const int width = g->target.vectorWidth;
+    switch (g->target.isa) {
+    case Target::SSE2: {
+        extern unsigned char builtins_bitcode_sse2[];
+        extern int builtins_bitcode_sse2_length;
+        extern unsigned char builtins_bitcode_sse2_x2[];
+        extern int builtins_bitcode_sse2_x2_length;
+        if (width == 4) {
+            targetBitcode = builtins_bitcode_sse2;
+            targetBitcodeLength = builtins_bitcode_sse2_length;
+        }
+        else if (width == 8) {
+            targetBitcode = builtins_bitcode_sse2_x2;
+            targetBitcodeLength = builtins_bitcode_sse2_x2_length;
+        }
+        else {
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    }
+    case Target::SSE4: {
+        extern unsigned char builtins_bitcode_sse4[];
+        extern int builtins_bitcode_sse4_length;
+        extern unsigned char builtins_bitcode_sse4_x2[];
+        extern int builtins_bitcode_sse4_x2_length;
+        if (width == 4) {
+            targetBitcode = builtins_bitcode_sse4;
+            targetBitcodeLength = builtins_bitcode_sse4_length;
+        }
+        else if (width == 8) {
+            targetBitcode = builtins_bitcode_sse4_x2;
+            targetBitcodeLength = builtins_bitcode_sse4_x2_length;
+        }
+        else {
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    }
+    case Target::AVX: {
+        extern unsigned char builtins_bitcode_avx1[];
+        extern int builtins_bitcode_avx1_length;
+        extern unsigned char builtins_bitcode_avx1_x2[];
+        extern int builtins_bitcode_avx1_x2_length;
+        if (width == 8) {
+            targetBitcode = builtins_bitcode_avx1;
+            targetBitcodeLength = builtins_bitcode_avx1_length;
+        }
+        else if (width == 16) {
+            targetBitcode = builtins_bitcode_avx1_x2;
+            targetBitcodeLength = builtins_bitcode_avx1_x2_length;
+        }
+        else {
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    }
+    case Target::AVX11: {
+        extern unsigned char builtins_bitcode_avx11[];
+        extern int builtins_bitcode_avx11_length;
+        extern unsigned char builtins_bitcode_avx11_x2[];
+        extern int builtins_bitcode_avx11_x2_length;
+        if (width == 8) {
+            targetBitcode = builtins_bitcode_avx11;
+            targetBitcodeLength = builtins_bitcode_avx11_length;
+        }
+        else if (width == 16) {
+            targetBitcode = builtins_bitcode_avx11_x2;
+            targetBitcodeLength = builtins_bitcode_avx11_x2_length;
+        }
+        else {
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    }
+    case Target::AVX2: {
+        extern unsigned char builtins_bitcode_avx2[];
+        extern int builtins_bitcode_avx2_length;
+        extern unsigned char builtins_bitcode_avx2_x2[];
+        extern int builtins_bitcode_avx2_x2_length;
+        if (width == 8) {
+            targetBitcode = builtins_bitcode_avx2;
+            targetBitcodeLength = builtins_bitcode_avx2_length;
+        }
+        else if (width == 16) {
+            targetBitcode = builtins_bitcode_avx2_x2;
+            targetBitcodeLength = builtins_bitcode_avx2_x2_length;
+        }
+        else {
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    }
+    case Target::GENERIC: {
+        extern unsigned char builtins_bitcode_generic_4[];
+        extern int builtins_bitcode_generic_4_length;
+        extern unsigned char builtins_bitcode_generic_8[];
+        extern int builtins_bitcode_generic_8_length;
+        extern unsigned char builtins_bitcode_generic_16[];
+        extern int builtins_bitcode_generic_16_length;
+        extern unsigned char builtins_bitcode_generic_32[];
+        extern int builtins_bitcode_generic_32_length;
+        extern unsigned char builtins_bitcode_generic_64[];
+        extern int builtins_bitcode_generic_64_length;
+        extern unsigned char builtins_bitcode_generic_1[];
+        extern int builtins_bitcode_generic_1_length;
+        if (width == 4) {
+            targetBitcode = builtins_bitcode_generic_4;
+            targetBitcodeLength = builtins_bitcode_generic_4_length;
+        }
+        else if (width == 8) {
+            targetBitcode = builtins_bitcode_generic_8;
+            targetBitcodeLength = builtins_bitcode_generic_8_length;
+        }
+        else if (width == 16) {
+            targetBitcode = builtins_bitcode_generic_16;
+            targetBitcodeLength = builtins_bitcode_generic_16_length;
+        }
+        else if (width == 32) {
+            targetBitcode = builtins_bitcode_generic_32;
+            targetBitcodeLength = builtins_bitcode_generic_32_length;
+        }
+        else if (width == 64) {
+            targetBitcode = builtins_bitcode_generic_64;
+            targetBitcodeLength = builtins_bitcode_generic_64_length;
+        }
+        else if (width == 1) {
+            targetBitcode = builtins_bitcode_generic_1;
+            targetBitcodeLength = builtins_bitcode_generic_1_length;
+        }
+        else {
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    }
+    default:
+        FATAL("logic error");
+    }
+
+    // targetBitcode is still NULL only if one of the FATAL() paths above
+    // was taken and returned; in that case nothing is linked in.
+    if (targetBitcode != NULL)
+        AddBitcodeToModule(targetBitcode, targetBitcodeLength, module,
+                           symbolTable);
+
+    // define the 'programCount' builtin variable
+    lDefineConstantInt("programCount", g->target.vectorWidth, module, symbolTable);
+
+    // define the 'programIndex' builtin
+    lDefineProgramIndex(module, symbolTable);
+
+    // Define __math_lib stuff.  This is used by stdlib.ispc, for example, to
+    // figure out which math routines to end up calling...
+    lDefineConstantInt("__math_lib", (int)g->mathLib, module, symbolTable);
+    lDefineConstantInt("__math_lib_ispc", (int)Globals::Math_ISPC, module,
+                       symbolTable);
+    lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast,
+                       module, symbolTable);
+    lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module,
+                       symbolTable);
+    lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
+                       symbolTable);
+    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
+                           module, symbolTable);
+
+    lDefineConstantInt("__have_native_half", g->target.hasHalf, module,
+                       symbolTable);
+    lDefineConstantInt("__have_native_rand", g->target.hasRand, module,
+                       symbolTable);
+    lDefineConstantInt("__have_native_transcendentals", g->target.hasTranscendentals,
+                       module, symbolTable);
+
+    if (!includeStdlibISPC)
+        return;
+
+    // The user wants the standard library to be included: parse the
+    // serialized version of the stdlib.ispc file to get its definitions
+    // added.  The generic flavor is used for the generic target, except
+    // that 1-wide uses the x86 stdlib.
+    extern char stdlib_generic_code[];
+    extern char stdlib_x86_code[];
+    const bool useGenericStdlib =
+        (g->target.isa == Target::GENERIC && g->target.vectorWidth != 1);
+    yy_scan_string(useGenericStdlib ? stdlib_generic_code : stdlib_x86_code);
+    yyparse();
+}
--- a/builtins.h
+++ b/builtins.h
@@ -0,0 +1,61 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file builtins.h
+    @brief Declarations of functions related to builtins and the 
+           standard library
+*/
+
+#ifndef ISPC_STDLIB_H
+#define ISPC_STDLIB_H 1
+
+#include "ispc.h"
+
+/** Adds declarations and definitions of ispc standard library functions
+    and types to the given module.
+
+    @param symbolTable     SymbolTable in which to add symbol definitions for
+                           stdlib stuff
+    @param ctx             llvm::LLVMContext to use for getting types and the
+                           like for standard library definitions
+    @param module          Module in which to add the declarations/definitions
+    @param includeStdlib   Indicates whether the definitions from the stdlib.ispc
+                           file should be added to the module.  (The
+                           definition names this parameter includeStdlibISPC.)
+ */
+void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
+                  bool includeStdlib);
+
+/** Takes serialized binary LLVM bitcode and adds its definitions to the
+    given module.  Functions in the bitcode that can be mapped to ispc
+    functions are also added to the symbol table, if one is provided.
+
+    @param bitcode         Binary LLVM bitcode (e.g. the contents of a *.bc file)
+    @param length          Length of the bitcode buffer
+    @param module          Module to link the bitcode into
+    @param symbolTable     Symbol table to add definitions to.  Defaults to
+                           NULL, in which case no symbols are added.
+ */
+void AddBitcodeToModule(const unsigned char *bitcode, int length,
+                        llvm::Module *module, SymbolTable *symbolTable = NULL);
+
+#endif // ISPC_STDLIB_H
--- a/builtins/builtins.c
+++ b/builtins/builtins.c
@@ -0,0 +1,201 @@
+/*
+  Copyright (c) 2010-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file builtins.c
+    @brief Standard library function implementations written in C.
+
+    This file provides C implementations of various functions that can be
+    called from ispc programs; in other words, this file is *not* linked
+    into the ispc compiler executable, but rather provides functions that
+    can be compiled into ispc programs.
+
+    When the ispc compiler is built, this file is compiled with clang to
+    generate LLVM bitcode.  This bitcode is later linked in to the program
+    being compiled by the DefineStdlib() function.  The first way to access
+    definitions from this file is by asking for them by name from the
+    llvm::Module's symbol table (e.g. as the PrintStmt implementation does
+    with __do_print() below).  Alternatively, if a function defined in this
+    file has a signature that can be mapped back to ispc types by the
+    lLLVMTypeToIspcType() function, then its declaration will be made
+    available to ispc programs at compile time automatically.
+  */
+
+
+#ifndef _MSC_VER
+#include <unistd.h>
+#endif // !_MSC_VER
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+
+typedef int Bool;
+
+#define PRINT_BUF_SIZE 4096
+
+/* Pre-2015 Microsoft C runtimes only provide _snprintf(), which does not
+   NUL-terminate its output on truncation.  Every use below terminates
+   tmpBuf explicitly afterwards, so either variant is safe. */
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#define PRINT_SNPRINTF _snprintf
+#else
+#define PRINT_SNPRINTF snprintf
+#endif
+
+/* Appends the NUL-terminated string str to the output at bufp, copying at
+   most as many characters as fit before &printString[PRINT_BUF_SIZE], so
+   bufp never moves past that address.  Once the buffer is full, control
+   jumps to the "done" label.  Note that str is evaluated more than once. */
+#define APPEND(str)                                                        \
+    do {                                                                   \
+        size_t appendRoom = (size_t)(&printString[PRINT_BUF_SIZE] - bufp); \
+        size_t appendLen = strlen(str);                                    \
+        if (appendLen > appendRoom)                                        \
+            appendLen = appendRoom;                                        \
+        memcpy(bufp, (str), appendLen);                                    \
+        bufp += appendLen;                                                 \
+        if (bufp >= &printString[PRINT_BUF_SIZE])                          \
+            goto done;                                                     \
+    } while (0) /* eat semicolon */
+
+
+/* Formats the scalar of the given type that ptr points to and appends it;
+   ends the enclosing switch case. */
+#define PRINT_SCALAR(fmt, type)                                         \
+    PRINT_SNPRINTF(tmpBuf, sizeof(tmpBuf), fmt, *((type *)ptr));        \
+    tmpBuf[sizeof(tmpBuf) - 1] = '\0';                                  \
+    APPEND(tmpBuf);                                                     \
+    break
+
+/* Formats the width elements of the given type that ptr points to as
+   "[a,b,...]" and appends them; ends the enclosing switch case. */
+#define PRINT_VECTOR(fmt, type)                                         \
+    *bufp++ = '[';                                                      \
+    if (bufp == &printString[PRINT_BUF_SIZE]) break;                    \
+    for (int i = 0; i < width; ++i) {                                   \
+        /* values of lanes that aren't currently executing are */       \
+        /* wrapped in double parentheses */                             \
+        if (mask & (1ull<<i))                                           \
+            PRINT_SNPRINTF(tmpBuf, sizeof(tmpBuf), fmt,                 \
+                           ((type *)ptr)[i]);                           \
+        else                                                            \
+            PRINT_SNPRINTF(tmpBuf, sizeof(tmpBuf), "((" fmt "))",       \
+                           ((type *)ptr)[i]);                           \
+        tmpBuf[sizeof(tmpBuf) - 1] = '\0';                              \
+        APPEND(tmpBuf);                                                 \
+        *bufp++ = (i != width-1 ? ',' : ']');                           \
+    }                                                                   \
+    break
+
+/** This function is called by PrintStmt to do the work of printing values
+    from ispc programs.  Note that the function signature here must match
+    the parameters that PrintStmt::EmitCode() generates.
+
+    The output is assembled in a fixed-size buffer of PRINT_BUF_SIZE
+    characters and written to stdout in one go; anything that doesn't fit
+    is silently dropped.
+
+    @param format  Print format string
+    @param types   Encoded types of the values being printed.
+                   (See lEncodeType()). 
+    @param width   Vector width of the compilation target
+    @param mask    Current lane mask when the print statement is called
+    @param args    Array of pointers to the values to be printed
+ */
+void __do_print(const char *format, const char *types, int width, uint64_t mask,
+                void **args) {
+    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
+    char *bufp = &printString[0];
+    char tmpBuf[256];
+
+    int argCount = 0;
+    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
+        // Format strings are just single percent signs.
+        if (*format != '%') {
+            *bufp++ = *format;
+        }
+        else if (*types) {
+            void *ptr = args[argCount++];
+            // Based on the encoding in the types string, cast the
+            // value appropriately and print it with a reasonable
+            // printf() formatting string.
+            switch (*types) {
+            case 'b': {
+                APPEND(*((Bool *)ptr) ? "true" : "false");
+                break;
+            }
+            case 'B': {
+                *bufp++ = '[';
+                if (bufp == &printString[PRINT_BUF_SIZE])
+                    break;
+                for (int i = 0; i < width; ++i) {
+                    if (mask & (1ull << i)) {
+                        APPEND(((Bool *)ptr)[i] ? "true" : "false");
+                    }
+                    else {
+                        APPEND("_________");
+                    }
+                    *bufp++ = (i != width-1) ? ',' : ']';
+                }
+                break;
+            }
+            case 'i': PRINT_SCALAR("%d", int);
+            case 'I': PRINT_VECTOR("%d", int);
+            case 'u': PRINT_SCALAR("%u", unsigned int);
+            case 'U': PRINT_VECTOR("%u", unsigned int);
+            case 'f': PRINT_SCALAR("%f", float);
+            case 'F': PRINT_VECTOR("%f", float);
+            case 'l': PRINT_SCALAR("%lld", long long);
+            case 'L': PRINT_VECTOR("%lld", long long);
+            case 'v': PRINT_SCALAR("%llu", unsigned long long);
+            case 'V': PRINT_VECTOR("%llu", unsigned long long);
+            case 'd': PRINT_SCALAR("%f", double);
+            case 'D': PRINT_VECTOR("%f", double);
+            case 'p': PRINT_SCALAR("%p", void *);
+            case 'P': PRINT_VECTOR("%p", void *);
+            default:
+                APPEND("UNKNOWN TYPE ");
+                *bufp++ = *types;
+            }
+            ++types;
+        }
+        ++format;
+    }
+
+ done:
+    // bufp is at most &printString[PRINT_BUF_SIZE] here, which is the slot
+    // reserved for the trailing NUL.
+    *bufp = '\0';
+    fputs(printString, stdout);
+    fflush(stdout);
+}
+
+
+/** Returns the number of processors reported by the operating system:
+    GetSystemInfo() on Windows builds, and sysconf(_SC_NPROCESSORS_ONLN)
+    elsewhere.  On the sysconf() path the result is never less than 1.
+ */
+int __num_cores() {
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    // This is quite a hack.  Including all of windows.h to get this definition
+    // pulls in a bunch of stuff that leads to undefined symbols at link time.
+    // So we don't #include <windows.h> but instead have the equivalent declarations
+    // here.  Presumably this struct declaration won't be changing in the future
+    // anyway...
+    struct SYSTEM_INFO {
+        int pad0[2];
+        void *pad1[2];
+        int *pad2;
+        int dwNumberOfProcessors;
+        int pad3[3];
+    };
+
+    struct SYSTEM_INFO sysInfo;
+    extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
+    GetSystemInfo(&sysInfo);
+    return sysInfo.dwNumberOfProcessors;
+#else
+    // sysconf() returns -1 when the value can't be determined; report a
+    // single core in that case rather than handing callers a negative
+    // (or zero) count.
+    long coreCount = sysconf(_SC_NPROCESSORS_ONLN);
+    return (coreCount < 1) ? 1 : (int)coreCount;
+#endif // !(_MSC_VER || __MINGW32__)
+}
--- a/builtins/dispatch.ll
+++ b/builtins/dispatch.ll
@@ -0,0 +1,160 @@
+;;  Copyright (c) 2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; This file defines various functions that are used when generating the
+;; "dispatch" object/assembly file that has entrypoints for each
+;; exported function in a module that dispatch to the best available
+;; variant of that function that will run on the system's CPU.
+
+;; Stores the best target ISA that the system on which we're actually
+;; running supports.  -1 represents "uninitialized", otherwise this value
+;; should correspond to one of the enumerant values of Target::ISA from
+;; ispc.h.
+
+@__system_best_isa = internal global i32 -1
+
+declare void @abort() noreturn
+
+;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
+;; following code...  Specifically, __get_system_isa should return a value
+;; corresponding to one of the Target::ISA enumerant values that gives the
+;; most capable ISA that the current system can run.
+;;
+;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum
+;; backwards compatibility for anyone building ispc with LLVM 3.0
+;;
+;; #include <stdint.h>
+;; #include <stdlib.h>
+;; 
+;; static void __cpuid(int info[4], int infoType) {
+;;     __asm__ __volatile__ ("cpuid"
+;;                           : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
+;;                           : "0" (infoType));
+;; }
+;; 
+;; /* Save %ebx in case it's the PIC register */
+;; static void __cpuid_count(int info[4], int level, int count) {
+;;   __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
+;;                         "cpuid\n\t"
+;;                         "xchg{l}\t{%%}ebx, %1\n\t"
+;;                         : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
+;;                         : "0" (level), "2" (count));
+;; }
+;; 
+;; int32_t __get_system_isa() {
+;;     int info[4];
+;;     __cpuid(info, 1);
+;; 
+;;     /* NOTE: the values returned below must be the same as the
+;;        corresponding enumerant values in Target::ISA. */
+;;     if ((info[2] & (1 << 28)) != 0) {
+;;        if ((info[2] & (1 << 29)) != 0 &&  // F16C
+;;            (info[2] & (1 << 30)) != 0) {  // RDRAND
+;;            // So far, so good.  AVX2?
+;;            // Call cpuid with eax=7, ecx=0
+;;            int info2[4];
+;;            __cpuid_count(info2, 7, 0);
+;;            if ((info2[1] & (1 << 5)) != 0)
+;;                return 4;
+;;            else
+;;                return 3;
+;;        }
+;;        // Regular AVX
+;;        return 2;
+;;     }
+;;     else if ((info[2] & (1 << 19)) != 0)
+;;         return 1; // SSE4
+;;     else if ((info[3] & (1 << 26)) != 0)
+;;         return 0; // SSE2
+;;     else
+;;         abort();
+;; }
+
+;; Compiled form of the C __get_system_isa() shown in the comment above.
+;; Returns 0-4 depending on which cpuid feature bits are set (see the
+;; per-block notes below), or calls abort() if none of the tested bits
+;; are set.
+define i32 @__get_system_isa() nounwind uwtable ssp {
+entry:
+  ;; cpuid with eax=1; elements 2 and 3 of the result are the cx and dx
+  ;; outputs (info[2] and info[3] in the C source).
+  %0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+  %asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2
+  %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
+  ;; 268435456 == 1 << 28: the first bit the C source tests in info[2]
+  %and = and i32 %asmresult5.i, 268435456
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %if.else13, label %if.then
+
+if.then:                                          ; preds = %entry
+  ;; 1610612736 == (1 << 29) | (1 << 30): both the F16C and RDRAND bits
+  ;; must be set to continue to the cpuid leaf 7 check; otherwise the
+  ;; result is 2 ("Regular AVX" in the C source).
+  %1 = and i32 %asmresult5.i, 1610612736
+  %2 = icmp eq i32 %1, 1610612736
+  br i1 %2, label %if.then7, label %return
+
+if.then7:                                         ; preds = %if.then
+  ;; cpuid with eax=7, ecx=0, with ebx swapped through a scratch register
+  ;; around the instruction.  Bit 5 of the value read back from ebx is
+  ;; added to 3, giving 4 if it is set and 3 otherwise.
+  %3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
+  %asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1
+  %and10 = lshr i32 %asmresult4.i28, 5
+  %4 = and i32 %and10, 1
+  %5 = add i32 %4, 3
+  br label %return
+
+if.else13:                                        ; preds = %entry
+  ;; 524288 == 1 << 19 in info[2]: returns 1 (SSE4) if set
+  %and15 = and i32 %asmresult5.i, 524288
+  %cmp16 = icmp eq i32 %and15, 0
+  br i1 %cmp16, label %if.else18, label %return
+
+if.else18:                                        ; preds = %if.else13
+  ;; 67108864 == 1 << 26 in info[3]: returns 0 (SSE2) if set
+  %and20 = and i32 %asmresult6.i, 67108864
+  %cmp21 = icmp eq i32 %and20, 0
+  br i1 %cmp21, label %if.else23, label %return
+
+if.else23:                                        ; preds = %if.else18
+  ;; none of the tested feature bits are set
+  tail call void @abort() noreturn nounwind
+  unreachable
+
+return:                                           ; preds = %if.else18, %if.else13, %if.then7, %if.then
+  ;; pick the return value according to the block control arrived from
+  %retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ]
+  ret i32 %retval.0
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; This function is called by each of the dispatch functions we generate;
+;; it sets @__system_best_isa if it is unset.
+
+;; Stores the result of @__get_system_isa() in @__system_best_isa if that
+;; global still holds -1; otherwise does nothing.
+define void @__set_system_isa() {
+entry:
+  ;; -1 is the "uninitialized" marker for @__system_best_isa
+  %bi = load i32* @__system_best_isa
+  %unset = icmp eq i32 %bi, -1
+  br i1 %unset, label %set_system_isa, label %done
+
+set_system_isa:
+  ;; first call: query the CPU and record the answer.  Note that the load
+  ;; above and the store below are plain (non-atomic) memory accesses.
+  %bival = call i32 @__get_system_isa()
+  store i32 %bival, i32* @__system_best_isa
+  ret void
+
+done:
+  ;; already initialized by an earlier call
+  ret void
+}
+
--- a/builtins/target-avx-common.ll
+++ b/builtins/target-avx-common.ll
@@ -0,0 +1,279 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; AVX target implementation.
+
+;; Instantiate helper definitions from the shared macro library.
+;; NOTE(review): these macros are defined outside this file, so what each
+;; one emits cannot be seen here -- confirm against the macro library.
+ctlztz()
+define_prefetches()
+define_shuffles()
+aossoa()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+  ; Reciprocal of a uniform float:
+  ;    est    = element 0 of the SSE scalar rcp intrinsic applied to v
+  ;    result = est * (2. - v * est)
+  %v_vec = insertelement <4 x float> undef, float %0, i32 0
+  %est_vec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %v_vec)
+  %est = extractelement <4 x float> %est_vec, i32 0
+
+  ; one N-R (Newton-Raphson) refinement step
+  %v_est = fmul float %0, %est
+  %two_minus_v_est = fsub float 2., %v_est
+  %refined = fmul float %est, %two_minus_v_est
+  ret float %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; Round a uniform float to nearest with the scalar round intrinsic (roundss).
+  ; Immediate 8 = round mode nearest (0b00) | do not signal precision
+  ; exceptions (0b1000).
+  ;
+  ; The operand order of the intrinsic is easy to get wrong.  The docs for
+  ;
+  ;    __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
+  ;
+  ; describe the return value as
+  ;
+  ;    r0 = RND(b0),  r1 = a1,  r2 = a2,  r3 = a3
+  ;
+  ; i.e. only the lowest 32 bits are rounded (taken from b) and the upper
+  ; 96 bits are copied from a.  Only r0 is needed here, so what is passed
+  ; as a is irrelevant and the same vector is used for both operands.
+  %src = insertelement <4 x float> undef, float %0, i32 0
+  %rounded = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %src, <4 x float> %src, i32 8)
+  %lane0 = extractelement <4 x float> %rounded, i32 0
+  ret float %lane0
+}
+
+define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  ; Same pattern as the round-to-nearest version: the value is placed in
+  ; element 0, passed as both vector operands, and only element 0 of the
+  ; result is used.
+  ; roundss immediate 9 = round down (0b01) | do not signal precision
+  ; exceptions (0b1000)
+  %src = insertelement <4 x float> undef, float %0, i32 0
+  %floored = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %src, <4 x float> %src, i32 9)
+  %lane0 = extractelement <4 x float> %floored, i32 0
+  ret float %lane0
+}
+
+define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  ; Same pattern as the round-to-nearest version: the value is placed in
+  ; element 0, passed as both vector operands, and only element 0 of the
+  ; result is used.
+  ; roundss immediate 10 = round up (0b10) | do not signal precision
+  ; exceptions (0b1000)
+  %src = insertelement <4 x float> undef, float %0, i32 0
+  %ceiled = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %src, <4 x float> %src, i32 10)
+  %lane0 = extractelement <4 x float> %ceiled, i32 0
+  ret float %lane0
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  ; Double-precision counterpart of the float version: the value goes in
+  ; element 0, the scalar round intrinsic is called with immediate 8, and
+  ; only element 0 of the result is used.
+  %src = insertelement <2 x double> undef, double %0, i32 0
+  %rounded = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %src, <2 x double> %src, i32 8)
+  %lane0 = extractelement <2 x double> %rounded, i32 0
+  ret double %lane0
+}
+
+define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; Same element-0 pattern as the float rounding functions.
+  ; roundsd immediate 9 = round down (0b01) | do not signal precision
+  ; exceptions (0b1000)
+  %src = insertelement <2 x double> undef, double %0, i32 0
+  %floored = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %src, <2 x double> %src, i32 9)
+  %lane0 = extractelement <2 x double> %floored, i32 0
+  ret double %lane0
+}
+
+define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; Same element-0 pattern as the float rounding functions.
+  ; roundsd immediate 10 = round up (0b10) | do not signal precision
+  ; exceptions (0b1000)
+  %src = insertelement <2 x double> undef, double %0, i32 0
+  %ceiled = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %src, <2 x double> %src, i32 10)
+  %lane0 = extractelement <2 x double> %ceiled, i32 0
+  ret double %lane0
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ; est = element 0 of the SSE scalar rsqrt intrinsic applied to v
+  %v_vec = insertelement <4 x float> undef, float %0, i32 0
+  %est_vec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v_vec)
+  %est = extractelement <4 x float> %est_vec, i32 0
+
+  ; refinement step:  0.5 * est * (3. - (v * est) * est)
+  %v_est = fmul float %0, %est
+  %v_est_est = fmul float %v_est, %est
+  %three_minus = fsub float 3., %v_est_est
+  %est_scaled = fmul float %est, %three_minus
+  %halved = fmul float 0.5, %est_scaled
+  ret float %halved
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+;; Square root of a uniform float via the SSE scalar sqrt intrinsic.
+;; NOTE(review): the helper macro in the body is defined outside this file;
+;; it is expected to define %ret from the intrinsic applied to %0 -- confirm.
+define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
+  ret float %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fastmath
+
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
+
+define void @__fastmath() nounwind alwaysinline {
+  ; Read the current control/status value into a stack slot (stmxcsr).
+  %csr_slot = alloca i32
+  %csr_slot_i8 = bitcast i32 * %csr_slot to i8 *
+  call void @llvm.x86.sse.stmxcsr(i8 * %csr_slot_i8)
+  %csr_old = load i32 *%csr_slot
+
+  ; Set the DAZ (64) and FTZ (32768) bits: 64 + 32768 = 32832.
+  %csr_new = or i32 %csr_old, 32832
+  store i32 %csr_new, i32 *%csr_slot
+
+  ; Load the updated value back (ldmxcsr).
+  call void @llvm.x86.sse.ldmxcsr(i8 * %csr_slot_i8)
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+;; Maximum of two uniform floats via the SSE scalar max intrinsic.
+;; NOTE(review): the helper macro in the body is defined outside this file;
+;; it is expected to define %ret from the intrinsic applied to %0 and %1 --
+;; confirm.
+define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
+  ret float %ret
+}
+
+;; Minimum of two uniform floats via the SSE scalar min intrinsic.
+;; NOTE(review): the helper macro in the body is defined outside this file;
+;; it is expected to define %ret from the intrinsic applied to %0 and %1 --
+;; confirm.
+define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
+  ret float %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+;; Minimum of two uniform int32 values, computed with the 4-wide SSE4.1
+;; pminsd intrinsic.
+;; NOTE(review): the helper macro in the body is defined outside this file;
+;; it is expected to define %ret from the intrinsic applied to %0 and %1 --
+;; confirm.
+define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret i32 %ret
+}
+
+;; Maximum of two uniform int32 values, computed with the 4-wide SSE4.1
+;; pmaxsd intrinsic.
+;; NOTE(review): the helper macro in the body is defined outside this file;
+;; it is expected to define %ret from the intrinsic applied to %0 and %1 --
+;; confirm.
+define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret i32 %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+;; Minimum of two uniform uint32 values, computed with the 4-wide SSE4.1
+;; pminud intrinsic.
+;; NOTE(review): the helper macro in the body is defined outside this file;
+;; it is expected to define %ret from the intrinsic applied to %0 and %1 --
+;; confirm.
+define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret i32 %ret
+}
+
+;; Maximum of two uniform uint32 values, computed with the 4-wide SSE4.1
+;; pmaxud intrinsic.
+;; NOTE(review): the helper macro in the body is defined outside this file;
+;; it is expected to define %ret from the intrinsic applied to %0 and %1 --
+;; confirm.
+define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret i32 %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+
+define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+  ; population count of the 32-bit argument via the generic ctpop intrinsic
+  %bits_set = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %bits_set
+}
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  ; population count of the 64-bit argument via the generic ctpop intrinsic
+  %bits_set = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %bits_set
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+;; Square root of a uniform double via the SSE2 scalar sqrt intrinsic.
+;; NOTE(review): the helper macro in the body is defined outside this file;
+;; it is expected to define %ret from the intrinsic applied to %0 -- confirm.
+define double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+;; Minimum of two uniform doubles via the SSE2 scalar min intrinsic.
+;; NOTE(review): the helper macro in the body is defined outside this file;
+;; it is expected to define %ret from the intrinsic applied to %0 and %1 --
+;; confirm.
+define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret
+}
+
+;; Maximum of two uniform doubles via the SSE2 scalar max intrinsic.
+;; NOTE(review): the helper macro in the body is defined outside this file;
+;; it is expected to define %ret from the intrinsic applied to %0 and %1 --
+;; confirm.
+define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret
+}
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -0,0 +1,667 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Basic 16-wide definitions
+
+;; Target parameters for the macro library: 16 elements per vector, and
+;; i32 as the mask element type (the functions in this file take their
+;; masks as 16 x i32 vectors).
+define(`WIDTH',`16')
+define(`MASK',`i32')
+include(`util.m4')
+
+;; Instantiate shared definitions from the macro library.
+;; NOTE(review): the bodies of these macros are not in this file -- confirm
+;; what each one emits against the macro library.
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+;; Pull in the definitions shared between the AVX targets.
+include(`target-avx-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+
+define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; Reciprocal of each of the 16 elements:
+  ;    est    = initial estimate from the 8-wide AVX rcp intrinsic
+  ;    result = est * (2. - v * est)
+  ; The macro below is expected to define %call as that estimate.
+  unary8to16(call, float, @llvm.x86.avx.rcp.ps.256, %0)
+
+  ; one N-R (Newton-Raphson) refinement step
+  %v_est = fmul <16 x float> %0, %call
+  %two_minus_v_est = fsub <16 x float>
+      <float 2., float 2., float 2., float 2., float 2., float 2., float 2., float 2.,
+       float 2., float 2., float 2., float 2., float 2., float 2., float 2., float 2.>,
+      %v_est
+  %refined = fmul <16 x float> %call, %two_minus_v_est
+  ret <16 x float> %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+
+;; Round each of the 16 floats to nearest.
+;; NOTE(review): the macro in the body is defined outside this file; it
+;; presumably applies the 8-wide AVX round intrinsic to each half with the
+;; given immediate and emits the return -- confirm.
+define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  round8to16(%0, 8)
+}
+
+;; Round each of the 16 floats down.
+;; NOTE(review): the macro in the body is defined outside this file; it
+;; presumably applies the 8-wide AVX round intrinsic to each half with the
+;; given immediate and emits the return -- confirm.
+define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  round8to16(%0, 9)
+}
+
+;; Round each of the 16 floats up.
+;; NOTE(review): the macro in the body is defined outside this file; it
+;; presumably applies the 8-wide AVX round intrinsic to each half with the
+;; given immediate and emits the return -- confirm.
+define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  round8to16(%0, 10)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+;; Round each of the 16 doubles; immediate 8 is the same round-to-nearest
+;; encoding used by the float version.
+;; NOTE(review): the macro in the body is defined outside this file; it
+;; presumably applies the 4-wide AVX round intrinsic to each quarter and
+;; emits the return -- confirm.
+define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 8)
+}
+
+;; Round each of the 16 doubles down; immediate 9 is the same round-down
+;; encoding used by the float version.
+;; NOTE(review): the macro in the body is defined outside this file; it
+;; presumably applies the 4-wide AVX round intrinsic to each quarter and
+;; emits the return -- confirm.
+define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 9)
+}
+
+;; Round each of the 16 doubles up; immediate 10 is the same round-up
+;; encoding used by the float version.
+;; NOTE(review): the macro in the body is defined outside this file; it
+;; presumably applies the 4-wide AVX round intrinsic to each quarter and
+;; emits the return -- confirm.
+define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 10)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
+define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
+  ; Reciprocal square root of each of the 16 elements.  The macro below is
+  ; expected to define %is as the initial estimate from the 8-wide AVX
+  ; rsqrt intrinsic.
+  unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
+
+  ; refinement step:  0.5 * is * (3. - (v * is) * is)
+  %v_times_is = fmul <16 x float> %v, %is
+  %v_times_is_sq = fmul <16 x float> %v_times_is, %is
+  %three_minus = fsub <16 x float>
+      <float 3., float 3., float 3., float 3., float 3., float 3., float 3., float 3.,
+       float 3., float 3., float 3., float 3., float 3., float 3., float 3., float 3.>,
+      %v_times_is_sq
+  %scaled = fmul <16 x float> %is, %three_minus
+  %halved = fmul <16 x float>
+      <float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5,
+       float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5>,
+      %scaled
+  ret <16 x float> %halved
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+;; Square root of each of the 16 floats via the 8-wide AVX sqrt intrinsic.
+;; NOTE(review): the macro in the body is defined outside this file; it is
+;; expected to define %call as the 16-wide result -- confirm.
+define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
+  ret <16 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+; FIXME: these need either to be wired up to the 8-wide SVML entrypoints
+; or to use the macro that calls the 4-wide ones 4x on our 16-wide
+; vectors...
+
+declare <16 x float> @__svml_sin(<16 x float>)
+declare <16 x float> @__svml_cos(<16 x float>)
+declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
+declare <16 x float> @__svml_tan(<16 x float>)
+declare <16 x float> @__svml_atan(<16 x float>)
+declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
+declare <16 x float> @__svml_exp(<16 x float>)
+declare <16 x float> @__svml_log(<16 x float>)
+declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+;; Per-element maximum of two 16-wide float vectors via the 8-wide AVX max
+;; intrinsic.
+;; NOTE(review): the macro in the body is defined outside this file; it is
+;; expected to define %call as the 16-wide result -- confirm.
+define <16 x float> @__max_varying_float(<16 x float>,
+                                         <16 x float>) nounwind readonly alwaysinline {
+  binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
+  ret <16 x float> %call
+}
+
+;; Per-element minimum of two 16-wide float vectors via the 8-wide AVX min
+;; intrinsic.
+;; NOTE(review): the macro in the body is defined outside this file; it is
+;; expected to define %call as the 16-wide result -- confirm.
+define <16 x float> @__min_varying_float(<16 x float>,
+                                         <16 x float>) nounwind readonly alwaysinline {
+  binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
+  ret <16 x float> %call
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
+
+define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
+  ; The AVX movmsk intrinsic takes 8 floats, so reinterpret the argument
+  ; as floats and split it into elements 0-7 and 8-15.
+  %as_float = bitcast <16 x i32> %0 to <16 x float>
+  %lo_half = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %hi_half = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %lo_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %lo_half) nounwind readnone
+  %hi_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %hi_half) nounwind readnone
+
+  ; the bits from the upper half go above the 8 bits from the lower half
+  %hi_shifted = shl i32 %hi_bits, 8
+  %bits = or i32 %hi_shifted, %lo_bits
+  %bits64 = zext i32 %bits to i64
+  ret i64 %bits64
+}
+
+define i1 @__any(<16 x i32>) nounwind readnone alwaysinline {
+  ; Build the 16-bit movmsk value from the two 8-element halves, then
+  ; report whether any bit is set.
+  %as_float = bitcast <16 x i32> %0 to <16 x float>
+  %lo_half = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %hi_half = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %lo_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %lo_half) nounwind readnone
+  %hi_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %hi_half) nounwind readnone
+
+  %hi_shifted = shl i32 %hi_bits, 8
+  %bits = or i32 %hi_shifted, %lo_bits
+  %is_any = icmp ne i32 %bits, 0
+  ret i1 %is_any
+}
+
+define i1 @__all(<16 x i32>) nounwind readnone alwaysinline {
+  ; Build the 16-bit movmsk value from the two 8-element halves, then
+  ; report whether all 16 bits are set (65535 = 0xffff).
+  %as_float = bitcast <16 x i32> %0 to <16 x float>
+  %lo_half = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %hi_half = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %lo_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %lo_half) nounwind readnone
+  %hi_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %hi_half) nounwind readnone
+
+  %hi_shifted = shl i32 %hi_bits, 8
+  %bits = or i32 %hi_shifted, %lo_bits
+  %is_all = icmp eq i32 %bits, 65535
+  ret i1 %is_all
+}
+
+define i1 @__none(<16 x i32>) nounwind readnone alwaysinline {
+  ; Build the 16-bit movmsk value from the two 8-element halves, then
+  ; report whether no bit is set.
+  %as_float = bitcast <16 x i32> %0 to <16 x float>
+  %lo_half = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %hi_half = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %lo_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %lo_half) nounwind readnone
+  %hi_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %hi_half) nounwind readnone
+
+  %hi_shifted = shl i32 %hi_bits, 8
+  %bits = or i32 %hi_shifted, %lo_bits
+  %is_none = icmp eq i32 %bits, 0
+  ret i1 %is_none
+}
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
+  ; split the argument into elements 0-7 and 8-15
+  %lo_half = shufflevector <16 x float> %0, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %hi_half = shufflevector <16 x float> %0, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  ; three rounds of the 8-wide AVX horizontal add
+  %round1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %lo_half, <8 x float> %hi_half)
+  %round2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %round1, <8 x float> %round1)
+  %round3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %round2, <8 x float> %round2)
+
+  ; the result is the sum of elements 0 and 4 of the final vector
+  %part_a = extractelement <8 x float> %round3, i32 0
+  %part_b = extractelement <8 x float> %round3, i32 4
+  %total = fadd float %part_a, %part_b
+  ret float %total
+}
+
+
+;; Minimum over the 16 floats of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
+  reduce16(float, @__min_varying_float, @__min_uniform_float)
+}
+
+
+;; Maximum over the 16 floats of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
+  reduce16(float, @__max_varying_float, @__max_uniform_float)
+}
+
+;; NOTE(review): macro defined outside this file, instantiated here with
+;; the vector width of this target; what it emits is not visible here --
+;; confirm against the macro library.
+reduce_equal(16)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+define <16 x i32> @__add_varying_int32(<16 x i32>,
+                                       <16 x i32>) nounwind readnone alwaysinline {
+  ; element-wise sum of the two 16-wide vectors
+  %sum = add <16 x i32> %0, %1
+  ret <16 x i32> %sum
+}
+
+define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+  ; sum of the two scalar arguments
+  %sum = add i32 %0, %1
+  ret i32 %sum
+}
+
+;; Sum over the 16 int32 elements of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
+}
+
+
+;; Minimum over the 16 int32 elements of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+
+;; Maximum over the 16 int32 elements of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint32 ops
+
+;; Minimum over the 16 uint32 elements of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+
+;; Maximum over the 16 uint32 elements of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
+  ; split the argument into four groups of four doubles
+  %q0 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %q1 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %q2 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %q3 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ; add the groups pairwise, then apply two rounds of the 4-wide AVX
+  ; horizontal add
+  %q01 = fadd <4 x double> %q0, %q1
+  %q23 = fadd <4 x double> %q2, %q3
+  %round1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %q01, <4 x double> %q23)
+  %round2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %round1, <4 x double> %round1)
+
+  ; the result is the sum of elements 0 and 2 of the final vector
+  %part_a = extractelement <4 x double> %round2, i32 0
+  %part_b = extractelement <4 x double> %round2, i32 2
+  %total = fadd double %part_a, %part_b
+  ret double %total
+}
+
+;; Minimum over the 16 doubles of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
+  reduce16(double, @__min_varying_double, @__min_uniform_double)
+}
+
+
+;; Maximum over the 16 doubles of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
+  reduce16(double, @__max_varying_double, @__max_uniform_double)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+define <16 x i64> @__add_varying_int64(<16 x i64>,
+                                       <16 x i64>) nounwind readnone alwaysinline {
+  ; element-wise sum of the two 16-wide vectors
+  %sum = add <16 x i64> %0, %1
+  ret <16 x i64> %sum
+}
+
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  ; sum of the two scalar arguments
+  %sum = add i64 %0, %1
+  ret i64 %sum
+}
+
+;; Sum over the 16 int64 elements of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+
+;; Minimum over the 16 int64 elements of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+
+;; Maximum over the 16 int64 elements of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint64 ops
+
+;; Minimum over the 16 uint64 elements of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+
+;; Maximum over the 16 uint64 elements of the argument.
+;; NOTE(review): the reduction macro in the body is defined outside this
+;; file; it presumably combines elements with the two helpers it is given
+;; and emits the return -- confirm.
+define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+; no masked load instruction for i8 and i16 types??
+; NOTE(review): the macro used for these two types is defined outside this
+; file; its arguments look like the element type and its size in bytes --
+; confirm.
+masked_load(i8,  1)
+masked_load(i16, 2)
+
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+ 
+define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+  ; The AVX masked load intrinsic used here covers 8 floats, so the mask
+  ; is reinterpreted as floats and the load is done as two halves.
+  %mask_f = bitcast <16 x i32> %mask to <16 x float>
+  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
+     <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
+     <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  ; the second half starts 8 x 4 bytes = 32 bytes after the first
+  %addr_hi = getelementptr i8 * %0, i32 32
+
+  %data_lo = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %mask_lo)
+  %data_hi = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %addr_hi, <8 x float> %mask_hi)
+
+  ; concatenate the halves and hand the bits back as i32s
+  %data = shufflevector <8 x float> %data_lo, <8 x float> %data_hi,
+     <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                 i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %data_i32 = bitcast <16 x float> %data to <16 x i32>
+  ret <16 x i32> %data_i32
+}
+
+
+define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+  ; The load is done four doubles at a time.  For each group, every i32
+  ; mask element is duplicated so that it covers a 64-bit element, and the
+  ; doubled-up mask is reinterpreted as four doubles.
+  %m0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %m0_dbl = bitcast <8 x i32> %m0 to <4 x double>
+  %m1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %m1_dbl = bitcast <8 x i32> %m1 to <4 x double>
+  %m2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %m2_dbl = bitcast <8 x i32> %m2 to <4 x double>
+  %m3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %m3_dbl = bitcast <8 x i32> %m3 to <4 x double>
+
+  ; consecutive groups are 32 bytes apart
+  %addr1 = getelementptr i8 * %0, i32 32
+  %addr2 = getelementptr i8 * %0, i32 64
+  %addr3 = getelementptr i8 * %0, i32 96
+
+  %d0 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %m0_dbl)
+  %d1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr1, <4 x double> %m1_dbl)
+  %d2 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr2, <4 x double> %m2_dbl)
+  %d3 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr3, <4 x double> %m3_dbl)
+
+  ; concatenate 4+4 -> 8, 8+8 -> 16, then hand the bits back as i64s
+  %d01 = shufflevector <4 x double> %d0, <4 x double> %d1,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %d23 = shufflevector <4 x double> %d2, <4 x double> %d3,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %d0123 = shufflevector <8 x double> %d01, <8 x double> %d23,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %as_i64 = bitcast <16 x double> %d0123 to <16 x i64>
+  ret <16 x i64> %as_i64
+}
+
+;; NOTE(review): macro defined outside this file; judging by its name it
+;; supplies the float and double masked loads -- confirm against the
+;; macro library.
+masked_load_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+; FIXME: there is no AVX instruction for these, but we could be clever
+; by packing the bits down and setting the last 3/4 or half, respectively,
+; of the mask to zero...  Not sure if this would be a win in the end
+; NOTE(review): the macro used for these two types is defined outside this
+; file, so the generated code is not visible here -- confirm.
+gen_masked_store(i8)
+gen_masked_store(i16)
+
+; note that mask is the 2nd parameter, not the 3rd one!!
+declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+
+define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>,
+                                <16 x i32>) nounwind alwaysinline {
+  ; %0 is the destination, %1 the values and %2 the mask.  The AVX masked
+  ; store intrinsic used here covers 8 floats, so values and mask are
+  ; reinterpreted as floats and stored as two halves 32 bytes apart.
+  %dst_lo = bitcast <16 x i32> * %0 to i8 *
+  %dst_hi = getelementptr i8 * %dst_lo, i32 32
+  %data_f = bitcast <16 x i32> %1 to <16 x float>
+  %mask_f = bitcast <16 x i32> %2 to <16 x float>
+
+  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %data_lo = shufflevector <16 x float> %data_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %data_hi = shufflevector <16 x float> %data_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  ; the intrinsic takes the mask before the values
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %dst_lo, <8 x float> %mask_lo, <8 x float> %data_lo)
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %dst_hi, <8 x float> %mask_hi, <8 x float> %data_hi)
+
+  ret void
+}
+
+define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
+                                <16 x i32> %mask) nounwind alwaysinline {
+  ; %0 is the destination and %1 the values.  The store is done four
+  ; doubles at a time, with consecutive groups 32 bytes apart.
+  %dst0 = bitcast <16 x i64> * %0 to i8 *
+  %dst1 = getelementptr i8 * %dst0, i32 32
+  %dst2 = getelementptr i8 * %dst0, i32 64
+  %dst3 = getelementptr i8 * %dst0, i32 96
+  %data = bitcast <16 x i64> %1 to <16 x double>
+
+  ; For each group, duplicate every i32 mask element so that it covers a
+  ; 64-bit element, and reinterpret the doubled-up mask as four doubles.
+  %m0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %m0_dbl = bitcast <8 x i32> %m0 to <4 x double>
+  %m1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %m1_dbl = bitcast <8 x i32> %m1 to <4 x double>
+  %m2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %m2_dbl = bitcast <8 x i32> %m2 to <4 x double>
+  %m3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %m3_dbl = bitcast <8 x i32> %m3 to <4 x double>
+
+  ; the matching groups of values
+  %d0 = shufflevector <16 x double> %data, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %d1 = shufflevector <16 x double> %data, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %d2 = shufflevector <16 x double> %data, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %d3 = shufflevector <16 x double> %data, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ; the intrinsic takes the mask before the values
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst0, <4 x double> %m0_dbl, <4 x double> %d0)
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst1, <4 x double> %m1_dbl, <4 x double> %d1)
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst2, <4 x double> %m2_dbl, <4 x double> %d2)
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst3, <4 x double> %m3_dbl, <4 x double> %d3)
+
+  ret void
+}
+
+masked_store_float_double()
+; NOTE(review): both macro calls here are defined outside this chunk; presumably they emit the float/double and the blend-based i8/i16 masked stores -- confirm.
+masked_store_blend_8_16_by_16()
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
+; Masked store of 16 x i32 done as a full load, a blend of old and new values under the mask, and a full store.
+define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
+                                      <16 x i32>) nounwind alwaysinline {
+  %old_i32 = load <16 x i32>* %0, align 4
+  %old_f = bitcast <16 x i32> %old_i32 to <16 x float>
+  %new_f = bitcast <16 x i32> %1 to <16 x float>
+  %mask_f = bitcast <16 x i32> %2 to <16 x float>
+  ; lanes 0-7
+  %old_lo = shufflevector <16 x float> %old_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %new_lo = shufflevector <16 x float> %new_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %blend_lo = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old_lo,
+                                                           <8 x float> %new_lo,
+                                                           <8 x float> %mask_lo)
+  ; lanes 8-15
+  %old_hi = shufflevector <16 x float> %old_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %new_hi = shufflevector <16 x float> %new_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %blend_hi = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old_hi,
+                                                           <8 x float> %new_hi,
+                                                           <8 x float> %mask_hi)
+  %blend_all = shufflevector <8 x float> %blend_lo, <8 x float> %blend_hi,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %blend_i32 = bitcast <16 x float> %blend_all to <16 x i32>
+  store <16 x i32> %blend_i32, <16 x i32>* %0, align 4
+  ret void
+}
+
+
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
+                                                 <4 x double>) nounwind readnone
+; Masked store of 16 x i64 as load / blend / store: four 4-wide double blends pick between the old memory contents and %newi64 under %mask.
+define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64, 
+                                      <16 x i32> %mask) nounwind alwaysinline {
+  %oldValue = load <16 x i64>* %ptr, align 8
+  %old = bitcast <16 x i64> %oldValue to <16 x double>
+  %old0d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old1d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old2d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %old3d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  ; new values, split into the same four quarters
+  %new = bitcast <16 x i64> %newi64 to <16 x double>
+  %new0d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new1d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new2d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %new3d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  ; double up masks (one 32-bit lane per 64-bit element), bitcast to doubles
+  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
+  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
+  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
+  ; blend each quarter: operands are (old, new, mask)
+  %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
+                                 <4 x double> %new0d, <4 x double> %mask0d)
+  %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
+                                 <4 x double> %new1d, <4 x double> %mask1d)
+  %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
+                                 <4 x double> %new2d, <4 x double> %mask2d)
+  %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
+                                 <4 x double> %new3d, <4 x double> %mask3d)
+  ; reassemble the 16-wide result: quarters into halves, halves into the full vector
+  %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %result = shufflevector <8 x double> %result01, <8 x double> %result23,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %result64 = bitcast <16 x double> %result to <16 x i64>
+  store <16 x i64> %result64, <16 x i64> * %ptr, align 8   ; same alignment as the load above; with none given the store would assume the ABI alignment of the whole vector type
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; scatter
+; One instantiation per element type: i8, i16, i32, float, i64 and double. The macro itself is defined outside this chunk.
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+; Square root of 16 doubles. The macro call (defined outside this chunk) is handed the 4-wide AVX sqrt above and the result name ret; presumably it applies the intrinsic to each 4-wide quarter -- confirm.
+define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
+  unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
+  ret <16 x double> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+; Min of two 16 x double vectors. The macro call is handed the 4-wide AVX min and the result name ret; presumably it works quarter by quarter -- confirm.
+define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+  binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
+  ret <16 x double> %ret
+}
+; Max counterpart of the function above, using the 4-wide AVX max.
+define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+  binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
+  ret <16 x double> %ret
+}
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -0,0 +1,537 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Basic 8-wide definitions
+; The vector width is 8 lanes and each mask lane is an i32; both are fixed before the shared macro file is read.
+define(`WIDTH',`8')
+define(`MASK',`i32')
+include(`util.m4')
+; Instantiate shared pieces from the macro file (their bodies are not visible in this chunk).
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+; Going by the file name, definitions common to the AVX targets.
+include(`target-avx-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+
+; Reciprocal of 8 floats: the rcp result refined by one Newton-Raphson step.
+define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ;  float est = __rcp_v(v);
+  %est_rcp = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %0)
+
+  ;  return est * (2. - v * est);
+  %v_times_est = fmul <8 x float> %0, %est_rcp
+  %correction = fsub <8 x float> <float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.>, %v_times_est
+  %refined = fmul <8 x float> %est_rcp, %correction
+  ret <8 x float> %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+
+define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps immediate 8 = 0b1000: rounding mode nearest (0b00) plus the do-not-signal-precision-exceptions bit (0b1000)
+  %rounded = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 8)
+  ret <8 x float> %rounded
+}
+
+define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps immediate 9 = 0b1001: round down (0b01) plus the do-not-signal-precision-exceptions bit (0b1000)
+  %floored = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
+  ret <8 x float> %floored
+}
+
+define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps immediate 10 = 0b1010: round up (0b10) plus the do-not-signal-precision-exceptions bit (0b1000)
+  %ceiled = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
+  ret <8 x float> %ceiled
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+; roundpd, round mode nearest 0b00 | do not signal precision exceptions 0b1000 = 8. In all three functions below the macro call is the entire body, so it must also emit the ret.
+define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  round4to8double(%0, 8)
+}
+
+define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | do not signal precision exceptions 0b1000 = 0b1001 = 9
+  round4to8double(%0, 9)
+}
+
+
+define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | do not signal precision exceptions 0b1000 = 0b1010 = 10
+  round4to8double(%0, 10)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
+; Reciprocal square root of 8 floats: the rsqrt result refined by one Newton-Raphson step.
+define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+  %est_rsqrt = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
+  ; result = 0.5 * est * (3. - (v * est) * est)
+  %v_est = fmul <8 x float> %v, %est_rsqrt
+  %v_est_est = fmul <8 x float> %v_est, %est_rsqrt
+  %correction = fsub <8 x float> <float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.>, %v_est_est
+  %unscaled = fmul <8 x float> %est_rsqrt, %correction
+  %refined = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                               float 0.5, float 0.5, float 0.5, float 0.5>, %unscaled
+  ret <8 x float> %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+; Square root of 8 floats: a direct call to the 8-wide AVX sqrt intrinsic.
+define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  %root = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
+  ret <8 x float> %root
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+; FIXME: the lines below are declarations only. They need either to be wired
+; up to the 8-wide SVML entrypoints, or the macro should be used to call the
+; 4-wide ones twice with the halves of our 8-wide vectors...
+
+declare <8 x float> @__svml_sin(<8 x float>)
+declare <8 x float> @__svml_cos(<8 x float>)
+declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
+declare <8 x float> @__svml_tan(<8 x float>)
+declare <8 x float> @__svml_atan(<8 x float>)
+declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
+declare <8 x float> @__svml_exp(<8 x float>)
+declare <8 x float> @__svml_log(<8 x float>)
+declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
+; Max of two 8 x float vectors: a direct call to the 8-wide AVX max intrinsic.
+define <8 x float> @__max_varying_float(<8 x float>,
+                                        <8 x float>) nounwind readonly alwaysinline {
+  %lane_max = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
+  ret <8 x float> %lane_max
+}
+
+define <8 x float> @__min_varying_float(<8 x float>,
+                                        <8 x float>) nounwind readonly alwaysinline {
+  %lane_min = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)   ; direct call to the 8-wide AVX min intrinsic
+  ret <8 x float> %lane_min
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
+; Packs the 8-lane mask into an integer via movmskps (one bit per lane, taken from the sign bit of each lane) and widens it to i64.
+define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+  %mask_f = bitcast <8 x i32> %0 to <8 x float>
+  %lane_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask_f) nounwind readnone
+  %lane_bits64 = zext i32 %lane_bits to i64
+  ret i64 %lane_bits64
+}
+
+define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
+  %mask_f = bitcast <8 x i32> %0 to <8 x float>
+  %lane_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask_f) nounwind readnone
+  %is_any = icmp ne i32 %lane_bits, 0   ; true when at least one lane bit is set
+  ret i1 %is_any
+}
+
+define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
+  %mask_f = bitcast <8 x i32> %0 to <8 x float>
+  %lane_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask_f) nounwind readnone
+  %is_all = icmp eq i32 %lane_bits, 255   ; 255 = 0b11111111, i.e. all 8 lane bits set
+  ret i1 %is_all
+}
+
+define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
+  %mask_f = bitcast <8 x i32> %0 to <8 x float>
+  %lane_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask_f) nounwind readnone
+  %is_none = icmp eq i32 %lane_bits, 0   ; true when no lane bit is set
+  ret i1 %is_none
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
+; Sum of the 8 float lanes: two rounds of horizontal adds, then one scalar add of elements 0 and 4.
+define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
+  %pair_sums = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
+  %quad_sums = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %pair_sums, <8 x float> %pair_sums)
+  %lower_sum = extractelement <8 x float> %quad_sums, i32 0
+  %upper_sum = extractelement <8 x float> %quad_sums, i32 4
+  %total = fadd float %lower_sum, %upper_sum
+  ret float %total
+}
+
+; Presumably the minimum over the 8 float lanes: the macro call is the entire body (so it must emit the ret) and is handed the element type plus the varying and uniform min helpers.
+define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
+  reduce8(float, @__min_varying_float, @__min_uniform_float)
+}
+
+; Max counterpart of the function above, handed the varying and uniform max helpers.
+define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
+  reduce8(float, @__max_varying_float, @__max_uniform_float)
+}
+; NOTE(review): the macro below is defined outside this chunk and is instantiated with the argument 8 -- confirm what it emits.
+reduce_equal(8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+define <8 x i32> @__add_varying_int32(<8 x i32>,
+                                      <8 x i32>) nounwind readnone alwaysinline {
+  %lane_sum = add <8 x i32> %0, %1   ; plain vector add of the two operands
+  ret <8 x i32> %lane_sum
+}
+
+define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+  %scalar_sum = add i32 %0, %1   ; plain scalar add of the two operands
+  ret i32 %scalar_sum
+}
+; In the three functions below the macro call is the entire body (so it must emit the ret); it is handed the element type plus a varying and a uniform helper for the operation.
+define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
+}
+
+; The int32 min helpers are not defined in this file section; presumably they come from the including file -- confirm.
+define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+; Max counterpart of the function above.
+define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint32 ops
+; Same shape as the signed versions, but handed the uint32 helpers; the macro call is the entire body and must emit the ret.
+define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+; Max counterpart of the function above.
+define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+; Sum of the 8 double lanes: split into two 4-wide halves, two rounds of horizontal adds, then one scalar add of elements 0 and 2.
+define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
+  %lower_half = shufflevector <8 x double> %0, <8 x double> undef,
+                              <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %upper_half = shufflevector <8 x double> %0, <8 x double> undef,
+                              <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %pair_sums = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %lower_half, <4 x double> %upper_half)
+  %quad_sums = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %pair_sums, <4 x double> %pair_sums)
+  %part_a = extractelement <4 x double> %quad_sums, i32 0
+  %part_b = extractelement <4 x double> %quad_sums, i32 2
+  %total = fadd double %part_a, %part_b
+  ret double %total
+}
+; The macro call is the entire body (so it must emit the ret); it is handed the element type plus the varying and uniform double min helpers.
+define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
+  reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+; Max counterpart of the function above.
+define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
+  reduce8(double, @__max_varying_double, @__max_uniform_double)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+define <8 x i64> @__add_varying_int64(<8 x i64>,
+                                      <8 x i64>) nounwind readnone alwaysinline {
+  %lane_sum = add <8 x i64> %0, %1   ; plain vector add of the two operands
+  ret <8 x i64> %lane_sum
+}
+
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %scalar_sum = add i64 %0, %1   ; plain scalar add of the two operands
+  ret i64 %scalar_sum
+}
+; In the three functions below the macro call is the entire body (so it must emit the ret); it is handed the element type plus a varying and a uniform helper for the operation.
+define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+; The int64 min and max helpers are not defined in this file section -- presumably emitted by a macro call earlier in the file; confirm.
+define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+; Max counterpart of the function above.
+define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint64 ops
+; Same shape as the signed versions, but handed the uint64 helpers; the macro call is the entire body and must emit the ret.
+define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+; Max counterpart of the function above.
+define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+
+; no masked load instruction for i8 and i16 types?? These two therefore come from a macro defined outside this chunk (NOTE(review): the 2nd argument is presumably the element size in bytes -- confirm).
+masked_load(i8,  1)
+masked_load(i16, 2)
+; AVX masked loads, taking (pointer, mask); used for the 32-bit and 64-bit cases below.
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+ 
+define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+  %mask_f = bitcast <8 x i32> %mask to <8 x float>   ; the intrinsic takes its mask as float lanes
+  %loaded_f = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %mask_f)
+  %loaded_i32 = bitcast <8 x float> %loaded_f to <8 x i32>
+  ret <8 x i32> %loaded_i32
+}
+
+
+define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+  ; Two 4-wide double masked loads; each 32-bit mask lane is doubled up to span one 64-bit element.
+  %mask_lo = shufflevector <8 x i32> %mask, <8 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask_lo_d = bitcast <8 x i32> %mask_lo to <4 x double>
+  %loaded_lo = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask_lo_d)
+
+  %mask_hi = shufflevector <8 x i32> %mask, <8 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %mask_hi_d = bitcast <8 x i32> %mask_hi to <4 x double>
+  %upper_ptr = getelementptr i8 * %0, i32 32   ; elements 4-7 start 32 bytes in
+  %loaded_hi = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %upper_ptr, <4 x double> %mask_hi_d)
+
+  %loaded_d = shufflevector <4 x double> %loaded_lo, <4 x double> %loaded_hi,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %loaded_i64 = bitcast <8 x double> %loaded_d to <8 x i64>
+  ret <8 x i64> %loaded_i64
+}
+
+masked_load_float_double()
+; NOTE(review): the macro call above is defined outside this chunk; presumably it emits the float and double masked loads -- confirm.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+gen_masked_store(i8)
+gen_masked_store(i16)
+; The two instantiations above come from a macro defined outside this chunk; the 32-bit and 64-bit cases below use the AVX intrinsics directly.
+; AVX masked stores. Note that mask is the 2nd parameter, not the 3rd one!! Operand order is (pointer, mask, value).
+declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+
+define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>,
+                                <8 x i32>) nounwind alwaysinline {
+  %mask_f = bitcast <8 x i32> %2 to <8 x float>
+  %data_f = bitcast <8 x i32> %1 to <8 x float>
+  %byte_ptr = bitcast <8 x i32> * %0 to i8 *
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %byte_ptr, <8 x float> %mask_f, <8 x float> %data_f)   ; operands: pointer, mask, value
+  ret void
+}
+
+define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
+                                <8 x i32> %mask) nounwind alwaysinline {
+  ; Masked store of 8 x i64 as two 4-wide double masked stores; each 32-bit mask lane is doubled up to span one 64-bit element.
+  %base_ptr = bitcast <8 x i64> * %0 to i8 *
+  %data_d = bitcast <8 x i64> %1 to <8 x double>
+  ; elements 0-3
+  %mask_lo = shufflevector <8 x i32> %mask, <8 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask_lo_d = bitcast <8 x i32> %mask_lo to <4 x double>
+  %data_lo = shufflevector <8 x double> %data_d, <8 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %base_ptr, <4 x double> %mask_lo_d, <4 x double> %data_lo)
+
+  ; elements 4-7
+  %mask_hi = shufflevector <8 x i32> %mask, <8 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %mask_hi_d = bitcast <8 x i32> %mask_hi to <4 x double>
+  %data_hi = shufflevector <8 x double> %data_d, <8 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %upper_ptr = getelementptr i8 * %base_ptr, i32 32
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %upper_ptr, <4 x double> %mask_hi_d, <4 x double> %data_hi)
+  ret void
+}
+
+
+masked_store_blend_8_16_by_8()
+; NOTE(review): the macro call above is defined outside this chunk. The variable blend declared next takes three 8 x float operands, passed below as (old, new, mask).
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
+
+define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
+                                      <8 x i32>) nounwind alwaysinline {
+  %old_i32 = load <8 x i32>* %0, align 4   ; current memory contents
+  %old_f = bitcast <8 x i32> %old_i32 to <8 x float>
+  %new_f = bitcast <8 x i32> %1 to <8 x float>
+  %mask_f = bitcast <8 x i32> %2 to <8 x float>
+  %blend_f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old_f,
+                                                          <8 x float> %new_f,
+                                                          <8 x float> %mask_f)
+  %blend_i32 = bitcast <8 x float> %blend_f to <8 x i32>
+  store <8 x i32> %blend_i32, <8 x i32>* %0, align 4   ; write all 8 lanes back
+  ret void
+}
+
+
+define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+                                      <8 x i32> %i32mask) nounwind alwaysinline {
+  ; Masked store of 8 x i64 done as a full load, two blends and a full store.
+  %prev_i64 = load <8 x i64>* %ptr, align 8
+  %mask_f = bitcast <8 x i32> %i32mask to <8 x float>
+
+  ; There is one 32-bit mask lane per 64-bit element, so each group of four
+  ; 64-bit values is blended as 8 x float with every mask lane repeated twice.
+  ;
+  ; lower half: elements 0-3
+  %mask_lo = shufflevector <8 x float> %mask_f, <8 x float> undef,
+                           <8 x i32> <i32 0, i32 0, i32 1, i32 1,
+                                      i32 2, i32 2, i32 3, i32 3>
+  %prev_lo = shufflevector <8 x i64> %prev_i64, <8 x i64> undef,
+                           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %prev_lo_f = bitcast <4 x i64> %prev_lo to <8 x float>
+  %next_lo = shufflevector <8 x i64> %new, <8 x i64> undef,
+                           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %next_lo_f = bitcast <4 x i64> %next_lo to <8 x float>
+  ; blend, operands are (old, new, mask)
+  %mix_lo_f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %prev_lo_f,
+                                                           <8 x float> %next_lo_f,
+                                                           <8 x float> %mask_lo)
+  %mix_lo = bitcast <8 x float> %mix_lo_f to <4 x i64>
+
+  ; upper half: elements 4-7
+  %mask_hi = shufflevector <8 x float> %mask_f, <8 x float> undef,
+                           <8 x i32> <i32 4, i32 4, i32 5, i32 5,
+                                      i32 6, i32 6, i32 7, i32 7>
+  %prev_hi = shufflevector <8 x i64> %prev_i64, <8 x i64> undef,
+                           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %prev_hi_f = bitcast <4 x i64> %prev_hi to <8 x float>
+  %next_hi = shufflevector <8 x i64> %new, <8 x i64> undef,
+                           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %next_hi_f = bitcast <4 x i64> %next_hi to <8 x float>
+  ; blend, operands are (old, new, mask)
+  %mix_hi_f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %prev_hi_f,
+                                                           <8 x float> %next_hi_f,
+                                                           <8 x float> %mask_hi)
+  %mix_hi = bitcast <8 x float> %mix_hi_f to <4 x i64>
+
+  ; put the two halves back together and write all 8 elements
+  %merged = shufflevector <4 x i64> %mix_lo, <4 x i64> %mix_hi,
+                          <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                     i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %merged, <8 x i64> * %ptr, align 8
+  ; nothing is returned; the result lives in memory
+  ret void
+}
+
+masked_store_float_double()
+; NOTE(review): the macro call above is defined outside this chunk; presumably it emits the float and double masked stores -- confirm.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; scatter
+; One instantiation per element type: i8, i16, i32, float, i64 and double. The macro itself is defined outside this chunk.
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+; Square root of 8 doubles. The macro call (defined outside this chunk) is handed the 4-wide AVX sqrt above and the result name ret; presumably it applies the intrinsic to each 4-wide half -- confirm.
+define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
+  unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
+  ret <8 x double> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+; Min of two 8 x double vectors. The macro call is handed the 4-wide AVX min and the result name ret; presumably it works half by half -- confirm.
+define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+  binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
+  ret <8 x double> %ret
+}
+; Max counterpart of the function above, using the 4-wide AVX max.
+define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+  binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
+  ret <8 x double> %ret
+}
+
--- a/builtins/target-avx1-x2.ll
+++ b/builtins/target-avx1-x2.ll
@@ -0,0 +1,81 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+include(`target-avx-x2.ll')
+; Everything above comes from the shared 16-wide AVX file; going by the file names, the rest of this file adds the AVX1-specific pieces. NOTE(review): the macro call below is defined elsewhere -- confirm what it emits.
+rdrand_decls()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+; 16-wide signed int32 min and max. The macro call is handed the 4-wide SSE4.1 intrinsic and the result name ret; presumably it works quarter by quarter -- confirm. The intrinsics are not declared in this file section.
+define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+; 16-wide unsigned int32 min and max; same shape as the signed versions but handed the pminud and pmaxud intrinsics.
+define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+; Declarations only. They are skipped when the flag tested below expands to 1, presumably because the including file supplies its own versions -- confirm.
+ifelse(NO_HALF_DECLARES, `1', `', `
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+; One instantiation per element type: i8, i16, i32, float, i64 and double. The macro itself is defined outside this chunk.
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
--- a/builtins/target-avx1.ll
+++ b/builtins/target-avx1.ll
@@ -0,0 +1,81 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; Pull in the shared 8-wide avx definitions, then invoke the helper macro
+;; for the hardware random number builtins. Judging by its name that macro
+;; only emits declarations; it is defined outside this file -- confirm there.
+include(`target-avx.ll')
+
+rdrand_decls()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+;; Signed 32-bit minimum of two 8-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 4-wide pminsd intrinsic to each half of the inputs and leaves the
+;; reassembled vector in %ret -- confirm against the macro definition.
+define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <8 x i32> %ret
+}
+
+;; Signed 32-bit maximum of two 8-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 4-wide pmaxsd intrinsic to each half of the inputs and leaves the
+;; reassembled vector in %ret -- confirm against the macro definition.
+define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <8 x i32> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+;; Unsigned 32-bit minimum of two 8-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 4-wide pminud intrinsic to each half of the inputs and leaves the
+;; reassembled vector in %ret -- confirm against the macro definition.
+define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <8 x i32> %ret
+}
+
+;; Unsigned 32-bit maximum of two 8-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 4-wide pmaxud intrinsic to each half of the inputs and leaves the
+;; reassembled vector in %ret -- confirm against the macro definition.
+define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <8 x i32> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+;; Unless the build has set the no-half-declares switch to 1, declare the four
+;; half/float conversion entry points here. Only declarations are emitted;
+;; the bodies are not provided in this file. There is one uniform and one
+;; varying form for each direction.
+ifelse(NO_HALF_DECLARES, `1', `', `
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+;; One invocation of the factored generator macro per element type. The macro
+;; is defined outside this file; presumably each call emits the gather
+;; routines for that type -- confirm against the macro definition.
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
--- a/builtins/target-avx11-x2.ll
+++ b/builtins/target-avx11-x2.ll
@@ -0,0 +1,132 @@
+;;  Copyright (c) 2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; Pull in the shared 16-wide avx definitions. Then pick the random number
+;; helper by LLVM version: 3.0 and 3.1 get the variant whose name suggests
+;; declarations only, every other version gets the variant whose name suggests
+;; full definitions. Both macros are defined outside this file.
+include(`target-avx-x2.ll')
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
+       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
+       `rdrand_definition()')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+;; Signed 32-bit minimum of two 16-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 4-wide pminsd intrinsic to each quarter of the inputs and leaves the
+;; reassembled vector in %ret -- confirm against the macro definition.
+define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+;; Signed 32-bit maximum of two 16-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 4-wide pmaxsd intrinsic to each quarter of the inputs and leaves the
+;; reassembled vector in %ret -- confirm against the macro definition.
+define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+;; Unsigned 32-bit minimum of two 16-wide vectors. The body comes from a
+;; helper macro defined outside this file; from its arguments it presumably
+;; applies the 4-wide pminud intrinsic to each quarter of the inputs and
+;; leaves the reassembled vector in %ret -- confirm against the macro.
+define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+;; Unsigned 32-bit maximum of two 16-wide vectors. The body comes from a
+;; helper macro defined outside this file; from its arguments it presumably
+;; applies the 4-wide pmaxud intrinsic to each quarter of the inputs and
+;; leaves the reassembled vector in %ret -- confirm against the macro.
+define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+;; One invocation of the gather generator macro per element type. The macro is
+;; defined outside this file; presumably each call emits the gather routines
+;; for that type -- confirm against the macro definition.
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(float)
+gen_gather(i64)
+gen_gather(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+;; nothing to define...
+', `
+;; Hardware half <-> float conversion intrinsics, eight lanes per call.
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+;; The i32 operand is the rounding control; 0 is round nearest even.
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+;; 16-wide half to float: split the input into two 8-lane pieces, convert
+;; each piece, then concatenate the results in lane order.
+define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
+  %half_lo = shufflevector <16 x i16> %v, <16 x i16> undef,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %half_hi = shufflevector <16 x i16> %v, <16 x i16> undef,
+      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %flt_lo = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %half_lo)
+  %flt_hi = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %half_hi)
+  %flt_all = shufflevector <8 x float> %flt_lo, <8 x float> %flt_hi,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x float> %flt_all
+}
+
+;; 16-wide float to half, same split/convert/concatenate scheme, rounding
+;; control 0.
+define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
+  %flt_lo = shufflevector <16 x float> %v, <16 x float> undef,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %flt_hi = shufflevector <16 x float> %v, <16 x float> undef,
+      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %half_lo = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %flt_lo, i32 0)
+  %half_hi = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %flt_hi, i32 0)
+  %half_all = shufflevector <8 x i16> %half_lo, <8 x i16> %half_hi,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %half_all
+}
+
+;; Scalar half to float: place the value in lane 0 of an 8-wide vector whose
+;; other lanes are undef, convert, and read lane 0 back.
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %as_vec1 = bitcast i16 %v to <1 x i16>
+  %widened = shufflevector <1 x i16> %as_vec1, <1 x i16> undef,
+      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                 i32 undef, i32 undef, i32 undef, i32 undef>
+  %converted = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %widened)
+  %lane0 = extractelement <8 x float> %converted, i32 0
+  ret float %lane0
+}
+
+;; Scalar float to half via lane 0 of an 8-wide conversion, rounding
+;; control 0.
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %as_vec1 = bitcast float %v to <1 x float>
+  %widened = shufflevector <1 x float> %as_vec1, <1 x float> undef,
+      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                 i32 undef, i32 undef, i32 undef, i32 undef>
+  %converted = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %widened, i32 0)
+  %lane0 = extractelement <8 x i16> %converted, i32 0
+  ret i16 %lane0
+}
+'
+)
--- a/builtins/target-avx11.ll
+++ b/builtins/target-avx11.ll
@@ -0,0 +1,115 @@
+;;  Copyright (c) 2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; Pull in the shared 8-wide avx definitions. Then pick the random number
+;; helper by LLVM version: 3.0 and 3.1 get the variant whose name suggests
+;; declarations only, every other version gets the variant whose name suggests
+;; full definitions. Both macros are defined outside this file.
+include(`target-avx.ll')
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
+       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
+       `rdrand_definition()')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+;; Signed 32-bit minimum of two 8-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 4-wide pminsd intrinsic to each half of the inputs and leaves the
+;; reassembled vector in %ret -- confirm against the macro definition.
+define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <8 x i32> %ret
+}
+
+;; Signed 32-bit maximum of two 8-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 4-wide pmaxsd intrinsic to each half of the inputs and leaves the
+;; reassembled vector in %ret -- confirm against the macro definition.
+define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <8 x i32> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+;; Unsigned 32-bit minimum of two 8-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 4-wide pminud intrinsic to each half of the inputs and leaves the
+;; reassembled vector in %ret -- confirm against the macro definition.
+define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <8 x i32> %ret
+}
+
+;; Unsigned 32-bit maximum of two 8-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 4-wide pmaxud intrinsic to each half of the inputs and leaves the
+;; reassembled vector in %ret -- confirm against the macro definition.
+define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <8 x i32> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+;; One invocation of the gather generator macro per element type. The macro is
+;; defined outside this file; presumably each call emits the gather routines
+;; for that type -- confirm against the macro definition.
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(float)
+gen_gather(i64)
+gen_gather(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+;; nothing to define...
+', `
+;; Hardware half <-> float conversion intrinsics, eight lanes per call.
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+;; The i32 operand is the rounding control; 0 is round nearest even.
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+;; 8-wide half to float: one intrinsic call covers every lane.
+define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
+  %as_float = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
+  ret <8 x float> %as_float
+}
+
+;; 8-wide float to half with rounding control 0.
+define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
+  %as_half = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
+  ret <8 x i16> %as_half
+}
+
+;; Scalar half to float: place the value in lane 0 of an 8-wide vector whose
+;; other lanes are undef, convert, and read lane 0 back.
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %as_vec1 = bitcast i16 %v to <1 x i16>
+  %widened = shufflevector <1 x i16> %as_vec1, <1 x i16> undef,
+      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                 i32 undef, i32 undef, i32 undef, i32 undef>
+  %converted = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %widened)
+  %lane0 = extractelement <8 x float> %converted, i32 0
+  ret float %lane0
+}
+
+;; Scalar float to half via lane 0 of an 8-wide conversion, rounding
+;; control 0.
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %as_vec1 = bitcast float %v to <1 x float>
+  %widened = shufflevector <1 x float> %as_vec1, <1 x float> undef,
+      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                 i32 undef, i32 undef, i32 undef, i32 undef>
+  %converted = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %widened, i32 0)
+  %lane0 = extractelement <8 x i16> %converted, i32 0
+  ret i16 %lane0
+}
+')
--- a/builtins/target-avx2-x2.ll
+++ b/builtins/target-avx2-x2.ll
@@ -0,0 +1,561 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; For every LLVM version other than 3.0 and 3.1, set the have-gather flag to
+;; 1. This happens before the shared definitions are pulled in, so that file
+;; can presumably test the flag -- confirm in the shared file.
+ifelse(LLVM_VERSION, `LLVM_3_0', `',
+       LLVM_VERSION, `LLVM_3_1', `',
+       `define(`HAVE_GATHER', `1')')
+
+;; Shared 16-wide avx definitions.
+include(`target-avx-x2.ll')
+
+;; Random number helper selection by LLVM version: 3.0 and 3.1 get the variant
+;; whose name suggests declarations only, every other version gets the variant
+;; whose name suggests full definitions. Both are defined outside this file.
+ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
+       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
+       `rdrand_definition()')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+;; 8-wide avx2 signed 32-bit min and max intrinsics, used by the two
+;; functions that follow.
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
+
+;; Signed 32-bit minimum of two 16-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 8-wide avx2 signed min intrinsic to each half of the inputs and leaves
+;; the reassembled vector in %m -- confirm against the macro definition.
+define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
+  ret <16 x i32> %m
+}
+
+;; Signed 32-bit maximum of two 16-wide vectors. The body comes from a helper
+;; macro defined outside this file; from its arguments it presumably applies
+;; the 8-wide avx2 signed max intrinsic to each half of the inputs and leaves
+;; the reassembled vector in %m -- confirm against the macro definition.
+define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
+  ret <16 x i32> %m
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+;; 8-wide avx2 unsigned 32-bit min and max intrinsics, used by the two
+;; functions that follow.
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
+
+;; Unsigned 32-bit minimum of two 16-wide vectors. The body comes from a
+;; helper macro defined outside this file; from its arguments it presumably
+;; applies the 8-wide avx2 unsigned min intrinsic to each half of the inputs
+;; and leaves the reassembled vector in %m -- confirm against the macro.
+define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
+  ret <16 x i32> %m
+}
+
+;; Unsigned 32-bit maximum of two 16-wide vectors. The body comes from a
+;; helper macro defined outside this file; from its arguments it presumably
+;; applies the 8-wide avx2 unsigned max intrinsic to each half of the inputs
+;; and leaves the reassembled vector in %m -- confirm against the macro.
+define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
+  ret <16 x i32> %m
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+;; nothing to define...
+', `
+;; Hardware half <-> float conversion intrinsics, eight lanes per call.
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+;; The i32 operand is the rounding control; 0 is round nearest even.
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+;; 16-wide half to float: split the input into two 8-lane pieces, convert
+;; each piece, then concatenate the results in lane order.
+define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
+  %half_lo = shufflevector <16 x i16> %v, <16 x i16> undef,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %half_hi = shufflevector <16 x i16> %v, <16 x i16> undef,
+      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %flt_lo = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %half_lo)
+  %flt_hi = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %half_hi)
+  %flt_all = shufflevector <8 x float> %flt_lo, <8 x float> %flt_hi,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x float> %flt_all
+}
+
+;; 16-wide float to half, same split/convert/concatenate scheme, rounding
+;; control 0.
+define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
+  %flt_lo = shufflevector <16 x float> %v, <16 x float> undef,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %flt_hi = shufflevector <16 x float> %v, <16 x float> undef,
+      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %half_lo = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %flt_lo, i32 0)
+  %half_hi = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %flt_hi, i32 0)
+  %half_all = shufflevector <8 x i16> %half_lo, <8 x i16> %half_hi,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %half_all
+}
+
+;; Scalar half to float: place the value in lane 0 of an 8-wide vector whose
+;; other lanes are undef, convert, and read lane 0 back.
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %as_vec1 = bitcast i16 %v to <1 x i16>
+  %widened = shufflevector <1 x i16> %as_vec1, <1 x i16> undef,
+      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                 i32 undef, i32 undef, i32 undef, i32 undef>
+  %converted = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %widened)
+  %lane0 = extractelement <8 x float> %converted, i32 0
+  ret float %lane0
+}
+
+;; Scalar float to half via lane 0 of an 8-wide conversion, rounding
+;; control 0.
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %as_vec1 = bitcast float %v to <1 x float>
+  %widened = shufflevector <1 x float> %as_vec1, <1 x float> undef,
+      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                 i32 undef, i32 undef, i32 undef, i32 undef>
+  %converted = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %widened, i32 0)
+  %lane0 = extractelement <8 x i16> %converted, i32 0
+  ret i16 %lane0
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+;; Declaration of the LLVM trap intrinsic. Nothing in the visible part of
+;; this file calls it directly; presumably macro-generated code does --
+;; confirm before removing.
+declare void @llvm.trap() noreturn nounwind
+
+;; Split a 16-wide vector into four consecutive 4-wide pieces. The pieces are
+;; named after the input with the suffixes _1 through _4, in ascending lane
+;; order.
+;;   $1: element type
+;;   $2: base name of the 16-wide variable, without the leading percent sign
+define(`extract_4s', `
+  %$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef,
+                    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef,
+                    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef,
+                    <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef,
+                    <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+')
+
+;; Split a 16-wide vector into its low and high 8-wide halves, named after
+;; the input with the suffixes _1 for lanes 0-7 and _2 for lanes 8-15.
+;;   $1: element type
+;;   $2: base name of the 16-wide variable, without the leading percent sign
+define(`extract_8s', `
+  %$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <8 x i32>
+            <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <8 x i32>
+            <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+')
+
+;; Concatenate two 8-wide vectors into one 16-wide vector; the first input
+;; supplies lanes 0-7 and the second lanes 8-15.
+;;   $1: element type
+;;   $2: name of the 16-wide result
+;;   $3: name of the vector for the low lanes
+;;   $4: name of the vector for the high lanes
+define(`assemble_8s', `
+  %$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4, <16 x i32>
+          <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+           i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+')
+
+;; Concatenate four 4-wide vectors into one 16-wide vector, in argument
+;; order. Two 8-wide intermediates named after the result with the suffixes
+;; _1 and _2 are built first and then joined.
+;;   $1: element type
+;;   $2: name of the 16-wide result
+;;   $3: name of the vector for lanes 0-3
+;;   $4: name of the vector for lanes 4-7
+;;   $5: name of the vector for lanes 8-11
+;;   $6: name of the vector for lanes 12-15
+define(`assemble_4s', `
+  %$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4, <8 x i32>
+            <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6, <8 x i32>
+            <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  assemble_8s($1, $2, $2_1, $2_2)
+')
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)',
+LLVM_VERSION, `LLVM_3_1', `
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)', `
+
+gen_gather(i8)
+gen_gather(i16)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int32 gathers
+
+;; avx2 i32 gather intrinsics. The d.d form takes eight 32-bit indices and
+;; returns eight values; the q.d form takes four 64-bit indices and returns
+;; four values. In both, the mask operand has the same type as the result and
+;; the scale is an i8.
+declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
+                       <8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
+declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
+
+;; Gather of 16 i32 values addressed by a common base pointer plus 32-bit
+;; offsets. The 8-wide gather is called once for each half of the lanes, with
+;; undef as its first operand, and the two results are joined into one
+;; 16-wide vector.
+define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets,
+                             <16 x i32> %vecmask) nounwind readonly alwaysinline {
+  extract_8s(i32, vecmask)
+  extract_8s(i32, offsets)
+  ;; the intrinsic takes its scale operand as an i8
+  %scale_i8 = trunc i32 %scale to i8
+
+  %gath_lo = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
+                 <8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale_i8)
+  %gath_hi = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
+                 <8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale_i8)
+
+  assemble_8s(i32, gathered, gath_lo, gath_hi)
+
+  ret <16 x i32> %gathered
+}
+
+
+;; Gather of 16 i32 values addressed by a common base pointer plus 64-bit
+;; offsets. The 64-bit-index gather handles four lanes per call, so it is
+;; called once per quarter, with undef as its first operand, and the four
+;; results are joined into one 16-wide vector.
+define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
+                             i32 %scale, <16 x i64> %offsets,
+                             <16 x i32> %vecmask) nounwind readonly alwaysinline {
+  extract_4s(i64, offsets)
+  extract_4s(i32, vecmask)
+  ;; the intrinsic takes its scale operand as an i8
+  %scale_i8 = trunc i32 %scale to i8
+
+  %q1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
+            <4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale_i8)
+  %q2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
+            <4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale_i8)
+  %q3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
+            <4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale_i8)
+  %q4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
+            <4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale_i8)
+
+  assemble_4s(i32, gathered, q1, q2, q3, q4)
+
+  ret <16 x i32> %gathered
+}
+
+
+;; Gather of 16 i32 values from 32-bit addresses held in ptrs. The addresses
+;; are passed as the indices operand with a null base pointer and a scale of
+;; 1, eight lanes per call; undef is the first operand of each call.
+define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
+                                  <16 x i32> %vecmask) nounwind readonly alwaysinline {
+  extract_8s(i32, vecmask)
+  extract_8s(i32, ptrs)
+
+  %gath_lo = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
+                 <8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1)
+  %gath_hi = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
+                 <8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1)
+
+  assemble_8s(i32, gathered, gath_lo, gath_hi)
+
+  ret <16 x i32> %gathered
+}
+
+
+;; Gather of 16 i32 values from 64-bit addresses held in ptrs. The addresses
+;; are passed as the indices operand with a null base pointer and a scale of
+;; 1, four lanes per call; undef is the first operand of each call.
+define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
+                                  <16 x i32> %vecmask) nounwind readonly alwaysinline {
+  extract_4s(i32, vecmask)
+  extract_4s(i64, ptrs)
+
+  %q1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
+            <4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
+  %q2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
+            <4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
+  %q3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
+            <4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1)
+  %q4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
+            <4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1)
+
+  assemble_4s(i32, gathered, q1, q2, q3, q4)
+
+  ret <16 x i32> %gathered
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float gathers
+
+;; avx2 float gather intrinsics. The d.ps form takes eight 32-bit indices and
+;; returns eight floats; the q.ps form takes four 64-bit indices and returns
+;; four floats. In both, the mask operand is a float vector of the same type
+;; as the result and the scale is an i8.
+declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
+                       <8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
+declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
+
+;; Gather of 16 floats addressed by a common base pointer plus 32-bit
+;; offsets. The 8-wide gather is called once for each half of the lanes, with
+;; undef as its first operand, and the two results are joined into one
+;; 16-wide vector.
+define <16 x float> @__gather_base_offsets32_float(i8 * %ptr,
+                                  i32 %scale, <16 x i32> %offsets,
+                                  <16 x i32> %vecmask) nounwind readonly alwaysinline {
+  ;; the float gathers take the mask as a float vector: reinterpret the bits
+  %fmask = bitcast <16 x i32> %vecmask to <16 x float>
+  ;; the intrinsic takes its scale operand as an i8
+  %scale_i8 = trunc i32 %scale to i8
+  extract_8s(float, fmask)
+  extract_8s(i32, offsets)
+
+  %gath_lo = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
+                 <8 x i32> %offsets_1, <8 x float> %fmask_1, i8 %scale_i8)
+  %gath_hi = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
+                 <8 x i32> %offsets_2, <8 x float> %fmask_2, i8 %scale_i8)
+
+  assemble_8s(float, gathered, gath_lo, gath_hi)
+
+  ret <16 x float> %gathered
+}
+
+
+;; Gather of 16 floats addressed by a common base pointer plus 64-bit
+;; offsets. The 64-bit-index gather handles four lanes per call, so it is
+;; called once per quarter, with undef as its first operand, and the four
+;; results are joined into one 16-wide vector.
+define <16 x float> @__gather_base_offsets64_float(i8 * %ptr,
+                                   i32 %scale, <16 x i64> %offsets,
+                                   <16 x i32> %vecmask) nounwind readonly alwaysinline {
+  ;; the float gathers take the mask as a float vector: reinterpret the bits
+  %fmask = bitcast <16 x i32> %vecmask to <16 x float>
+  ;; the intrinsic takes its scale operand as an i8
+  %scale_i8 = trunc i32 %scale to i8
+  extract_4s(float, fmask)
+  extract_4s(i64, offsets)
+
+  %q1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
+            <4 x i64> %offsets_1, <4 x float> %fmask_1, i8 %scale_i8)
+  %q2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
+            <4 x i64> %offsets_2, <4 x float> %fmask_2, i8 %scale_i8)
+  %q3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
+            <4 x i64> %offsets_3, <4 x float> %fmask_3, i8 %scale_i8)
+  %q4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
+            <4 x i64> %offsets_4, <4 x float> %fmask_4, i8 %scale_i8)
+
+  assemble_4s(float, gathered, q1, q2, q3, q4)
+
+  ret <16 x float> %gathered
+}
+
+
+;; Gather of 16 floats from 32-bit addresses held in ptrs. The addresses are
+;; passed as the indices operand with a null base pointer and a scale of 1,
+;; eight lanes per call; undef is the first operand of each call.
+define <16 x float> @__gather32_float(<16 x i32> %ptrs,
+                                      <16 x i32> %vecmask) nounwind readonly alwaysinline {
+  ;; the float gathers take the mask as a float vector: reinterpret the bits
+  %fmask = bitcast <16 x i32> %vecmask to <16 x float>
+  extract_8s(i32, ptrs)
+  extract_8s(float, fmask)
+
+  %gath_lo = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
+                 <8 x i32> %ptrs_1, <8 x float> %fmask_1, i8 1)
+  %gath_hi = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
+                 <8 x i32> %ptrs_2, <8 x float> %fmask_2, i8 1)
+
+  assemble_8s(float, gathered, gath_lo, gath_hi)
+
+  ret <16 x float> %gathered
+}
+
+
+;; Gather of 16 floats from 64-bit addresses held in ptrs. The addresses are
+;; passed as the indices operand with a null base pointer and a scale of 1,
+;; four lanes per call; undef is the first operand of each call.
+define <16 x float> @__gather64_float(<16 x i64> %ptrs,
+                                      <16 x i32> %vecmask) nounwind readonly alwaysinline {
+  ;; the float gathers take the mask as a float vector: reinterpret the bits
+  %fmask = bitcast <16 x i32> %vecmask to <16 x float>
+  extract_4s(float, fmask)
+  extract_4s(i64, ptrs)
+
+  %q1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
+            <4 x i64> %ptrs_1, <4 x float> %fmask_1, i8 1)
+  %q2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
+            <4 x i64> %ptrs_2, <4 x float> %fmask_2, i8 1)
+  %q3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
+            <4 x i64> %ptrs_3, <4 x float> %fmask_3, i8 1)
+  %q4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
+            <4 x i64> %ptrs_4, <4 x float> %fmask_4, i8 1)
+
+  assemble_4s(float, gathered, q1, q2, q3, q4)
+
+  ret <16 x float> %gathered
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int64 gathers
+
+;; avx2 i64 gather intrinsics. Both forms return four values; the d.q form
+;; takes four 32-bit indices and the q.q form four 64-bit indices. The mask
+;; operand has 64-bit elements in both, and the scale is an i8.
+declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
+                       <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
+declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
+
+;; Gather of 16 i64 values addressed by a common base pointer plus 32-bit
+;; offsets. The i64 gather handles four lanes per call, so it is called once
+;; per quarter, with undef as its first operand, and the four results are
+;; joined into one 16-wide vector.
+define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
+                             i32 %scale, <16 x i32> %offsets,
+                             <16 x i32> %mask32) nounwind readonly alwaysinline {
+  ;; the i64 gathers take a mask with 64-bit elements: sign extend each lane
+  %mask64 = sext <16 x i32> %mask32 to <16 x i64>
+  ;; the intrinsic takes its scale operand as an i8
+  %scale_i8 = trunc i32 %scale to i8
+  extract_4s(i64, mask64)
+  extract_4s(i32, offsets)
+
+  %q1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
+            <4 x i32> %offsets_1, <4 x i64> %mask64_1, i8 %scale_i8)
+  %q2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
+            <4 x i32> %offsets_2, <4 x i64> %mask64_2, i8 %scale_i8)
+  %q3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
+            <4 x i32> %offsets_3, <4 x i64> %mask64_3, i8 %scale_i8)
+  %q4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
+            <4 x i32> %offsets_4, <4 x i64> %mask64_4, i8 %scale_i8)
+
+  assemble_4s(i64, gathered, q1, q2, q3, q4)
+
+  ret <16 x i64> %gathered
+}
+
+
+;; Gather of 16 i64 values addressed by a common base pointer plus 64-bit
+;; offsets. The i64 gather handles four lanes per call, so it is called once
+;; per quarter, with undef as its first operand, and the four results are
+;; joined into one 16-wide vector.
+define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
+                             i32 %scale, <16 x i64> %offsets,
+                             <16 x i32> %mask32) nounwind readonly alwaysinline {
+  ;; the i64 gathers take a mask with 64-bit elements: sign extend each lane
+  %mask64 = sext <16 x i32> %mask32 to <16 x i64>
+  ;; the intrinsic takes its scale operand as an i8
+  %scale_i8 = trunc i32 %scale to i8
+  extract_4s(i64, mask64)
+  extract_4s(i64, offsets)
+
+  %q1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
+            <4 x i64> %offsets_1, <4 x i64> %mask64_1, i8 %scale_i8)
+  %q2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
+            <4 x i64> %offsets_2, <4 x i64> %mask64_2, i8 %scale_i8)
+  %q3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
+            <4 x i64> %offsets_3, <4 x i64> %mask64_3, i8 %scale_i8)
+  %q4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
+            <4 x i64> %offsets_4, <4 x i64> %mask64_4, i8 %scale_i8)
+
+  assemble_4s(i64, gathered, q1, q2, q3, q4)
+
+  ret <16 x i64> %gathered
+}
+
+
+;; Masked 16-wide gather of i64 values through 32-bit per-lane pointers.
+;; The pointer lanes are passed as the indices of the AVX2 gather with a
+;; null base and a scale of 1; four 4-wide calls cover the 16 lanes.
+define <16 x i64> @__gather32_i64(<16 x i32> %ptrs,
+                                  <16 x i32> %mask32) nounwind readonly alwaysinline {
+  %lanemask = sext <16 x i32> %mask32 to <16 x i64>
+  extract_4s(i64, lanemask)
+  extract_4s(i32, ptrs)
+
+  %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
+            <4 x i64> undef, i8 * null, <4 x i32> %ptrs_1,
+            <4 x i64> %lanemask_1, i8 1)
+  %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
+            <4 x i64> undef, i8 * null, <4 x i32> %ptrs_2,
+            <4 x i64> %lanemask_2, i8 1)
+  %v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
+            <4 x i64> undef, i8 * null, <4 x i32> %ptrs_3,
+            <4 x i64> %lanemask_3, i8 1)
+  %v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
+            <4 x i64> undef, i8 * null, <4 x i32> %ptrs_4,
+            <4 x i64> %lanemask_4, i8 1)
+
+  assemble_4s(i64, v, v1, v2, v3, v4)
+
+  ret <16 x i64> %v
+}
+
+;; Masked 16-wide gather of i64 values through 64-bit per-lane pointers.
+;; The pointer lanes are passed as the indices of the AVX2 gather with a
+;; null base and a scale of 1; four 4-wide calls cover the 16 lanes.
+define <16 x i64> @__gather64_i64(<16 x i64> %ptrs,
+                                  <16 x i32> %mask32) nounwind readonly alwaysinline {
+  %lanemask = sext <16 x i32> %mask32 to <16 x i64>
+  extract_4s(i64, lanemask)
+  extract_4s(i64, ptrs)
+
+  %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
+            <4 x i64> undef, i8 * null, <4 x i64> %ptrs_1,
+            <4 x i64> %lanemask_1, i8 1)
+  %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
+            <4 x i64> undef, i8 * null, <4 x i64> %ptrs_2,
+            <4 x i64> %lanemask_2, i8 1)
+  %v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
+            <4 x i64> undef, i8 * null, <4 x i64> %ptrs_3,
+            <4 x i64> %lanemask_3, i8 1)
+  %v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
+            <4 x i64> undef, i8 * null, <4 x i64> %ptrs_4,
+            <4 x i64> %lanemask_4, i8 1)
+
+  assemble_4s(i64, v, v1, v2, v3, v4)
+
+  ret <16 x i64> %v
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double gathers
+
+;; AVX2 gather intrinsics producing four doubles per call; the q form takes
+;; 64-bit indices and the d form takes 32-bit indices.  The mask operand has
+;; the same vector type as the result.
+declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
+declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
+                       <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
+
+;; Masked 16-wide gather of doubles with 32-bit offsets from %ptr, issued as
+;; four 4-wide AVX2 gather calls whose results are combined into %v.
+;; The i32 lane mask is sign-extended to 64 bits and then reinterpreted as
+;; double, which is the mask type the intrinsic takes.
+define <16 x double> @__gather_base_offsets32_double(i8 * %ptr, i32 %scale,
+                             <16 x i32> %offsets,
+                             <16 x i32> %mask32) nounwind readonly alwaysinline {
+  %mask64 = sext <16 x i32> %mask32 to <16 x i64>
+  %maskpd = bitcast <16 x i64> %mask64 to <16 x double>
+  %scale_i8 = trunc i32 %scale to i8
+  extract_4s(double, maskpd)
+  extract_4s(i32, offsets)
+
+  %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_1,
+            <4 x double> %maskpd_1, i8 %scale_i8)
+  %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_2,
+            <4 x double> %maskpd_2, i8 %scale_i8)
+  %v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_3,
+            <4 x double> %maskpd_3, i8 %scale_i8)
+  %v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_4,
+            <4 x double> %maskpd_4, i8 %scale_i8)
+
+  assemble_4s(double, v, v1, v2, v3, v4)
+
+  ret <16 x double> %v
+}
+
+
+;; Masked 16-wide gather of doubles with 64-bit offsets from %ptr, issued as
+;; four 4-wide AVX2 gather calls whose results are combined into %v.
+;; The i32 lane mask is sign-extended to 64 bits and then reinterpreted as
+;; double, which is the mask type the intrinsic takes.
+define <16 x double> @__gather_base_offsets64_double(i8 * %ptr, i32 %scale,
+                             <16 x i64> %offsets,
+                             <16 x i32> %mask32) nounwind readonly alwaysinline {
+  %mask64 = sext <16 x i32> %mask32 to <16 x i64>
+  %maskpd = bitcast <16 x i64> %mask64 to <16 x double>
+  %scale_i8 = trunc i32 %scale to i8
+  extract_4s(double, maskpd)
+  extract_4s(i64, offsets)
+
+  %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_1,
+            <4 x double> %maskpd_1, i8 %scale_i8)
+  %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_2,
+            <4 x double> %maskpd_2, i8 %scale_i8)
+  %v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_3,
+            <4 x double> %maskpd_3, i8 %scale_i8)
+  %v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_4,
+            <4 x double> %maskpd_4, i8 %scale_i8)
+
+  assemble_4s(double, v, v1, v2, v3, v4)
+
+  ret <16 x double> %v
+}
+
+
+;; Masked 16-wide gather of doubles through 32-bit per-lane pointers.  The
+;; pointer lanes are passed as the indices of the AVX2 gather with a null
+;; base and a scale of 1; four 4-wide calls cover the 16 lanes.
+define <16 x double> @__gather32_double(<16 x i32> %ptrs,
+                                        <16 x i32> %mask32) nounwind readonly alwaysinline {
+  %mask64 = sext <16 x i32> %mask32 to <16 x i64>
+  %maskpd = bitcast <16 x i64> %mask64 to <16 x double>
+  extract_4s(double, maskpd)
+  extract_4s(i32, ptrs)
+
+  %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * null, <4 x i32> %ptrs_1,
+            <4 x double> %maskpd_1, i8 1)
+  %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * null, <4 x i32> %ptrs_2,
+            <4 x double> %maskpd_2, i8 1)
+  %v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * null, <4 x i32> %ptrs_3,
+            <4 x double> %maskpd_3, i8 1)
+  %v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * null, <4 x i32> %ptrs_4,
+            <4 x double> %maskpd_4, i8 1)
+
+  assemble_4s(double, v, v1, v2, v3, v4)
+
+  ret <16 x double> %v
+}
+
+
+;; Masked 16-wide gather of doubles through 64-bit per-lane pointers.  The
+;; pointer lanes are passed as the indices of the AVX2 gather with a null
+;; base and a scale of 1; four 4-wide calls cover the 16 lanes.
+define <16 x double> @__gather64_double(<16 x i64> %ptrs,
+                                        <16 x i32> %mask32) nounwind readonly alwaysinline {
+  %mask64 = sext <16 x i32> %mask32 to <16 x i64>
+  %maskpd = bitcast <16 x i64> %mask64 to <16 x double>
+  extract_4s(double, maskpd)
+  extract_4s(i64, ptrs)
+
+  %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * null, <4 x i64> %ptrs_1,
+            <4 x double> %maskpd_1, i8 1)
+  %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * null, <4 x i64> %ptrs_2,
+            <4 x double> %maskpd_2, i8 1)
+  %v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * null, <4 x i64> %ptrs_3,
+            <4 x double> %maskpd_3, i8 1)
+  %v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * null, <4 x i64> %ptrs_4,
+            <4 x double> %maskpd_4, i8 1)
+
+  assemble_4s(double, v, v1, v2, v3, v4)
+
+  ret <16 x double> %v
+}
+
+')
--- a/builtins/target-avx2.ll
+++ b/builtins/target-avx2.ll
@@ -0,0 +1,433 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; The gather flag is set for every LLVM release other than 3.0 and 3.1.
+ifelse(LLVM_VERSION, `LLVM_3_0', `',
+       LLVM_VERSION, `LLVM_3_1', `',
+       `define(`HAVE_GATHER', `1')')
+
+;; Pull in the shared AVX target file.
+include(`target-avx.ll')
+
+;; LLVM 3.0 and 3.1 get only the rdrand declarations; any other release gets
+;; the rdrand definition.
+ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
+       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
+       `rdrand_definition()')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+;; AVX2 intrinsics for the signed per-lane minimum and maximum of two
+;; 8 x i32 vectors.
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
+
+;; Signed per-lane minimum of two 8-wide i32 vectors via the AVX2 intrinsic.
+define <8 x i32> @__min_varying_int32(<8 x i32> %a,
+                                      <8 x i32> %b) nounwind readonly alwaysinline {
+  %smallest = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a, <8 x i32> %b)
+  ret <8 x i32> %smallest
+}
+
+;; Signed per-lane maximum of two 8-wide i32 vectors via the AVX2 intrinsic.
+define <8 x i32> @__max_varying_int32(<8 x i32> %a,
+                                      <8 x i32> %b) nounwind readonly alwaysinline {
+  %largest = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a, <8 x i32> %b)
+  ret <8 x i32> %largest
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+;; AVX2 intrinsics for the unsigned per-lane minimum and maximum of two
+;; 8 x i32 vectors.
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
+
+;; Unsigned per-lane minimum of two 8-wide i32 vectors via the AVX2 intrinsic.
+define <8 x i32> @__min_varying_uint32(<8 x i32> %a,
+                                       <8 x i32> %b) nounwind readonly alwaysinline {
+  %smallest = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a, <8 x i32> %b)
+  ret <8 x i32> %smallest
+}
+
+;; Unsigned per-lane maximum of two 8-wide i32 vectors via the AVX2 intrinsic.
+define <8 x i32> @__max_varying_uint32(<8 x i32> %a,
+                                       <8 x i32> %b) nounwind readonly alwaysinline {
+  %largest = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a, <8 x i32> %b)
+  ret <8 x i32> %largest
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+;; nothing to define...
+', `
+;; 8-wide half <-> float conversion intrinsics.  The i32 operand of the
+;; float-to-half form is the rounding control; 0 is round nearest even.
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+;; Convert eight half values (carried as i16 bit patterns) to float.
+define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
+  %asfloat = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
+  ret <8 x float> %asfloat
+}
+
+;; Convert eight floats to half bit patterns, rounding to nearest even.
+define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
+  %ashalf = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
+  ret <8 x i16> %ashalf
+}
+
+;; Convert a single half value to float: the scalar is placed in lane 0 of an
+;; 8-wide vector (the other lanes are undef), converted with the vector
+;; intrinsic, and lane 0 of the result is returned.
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %one = bitcast i16 %v to <1 x i16>
+  %wide = shufflevector <1 x i16> %one, <1 x i16> undef,
+            <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                       i32 undef, i32 undef, i32 undef, i32 undef>
+  %widef = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %wide)
+  %lane0 = extractelement <8 x float> %widef, i32 0
+  ret float %lane0
+}
+
+;; Convert a single float to a half bit pattern, rounding to nearest even:
+;; the scalar is placed in lane 0 of an 8-wide vector (the other lanes are
+;; undef), converted with the vector intrinsic, and lane 0 is returned.
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %one = bitcast float %v to <1 x float>
+  %wide = shufflevector <1 x float> %one, <1 x float> undef,
+            <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                       i32 undef, i32 undef, i32 undef, i32 undef>
+  %wideh = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %wide, i32 0)
+  %lane0 = extractelement <8 x i16> %wideh, i32 0
+  ret i16 %lane0
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+;; LLVM trap intrinsic.  NOTE(review): no direct call to it is visible in the
+;; gather code of this file; confirm whether a macro expansion still needs it.
+declare void @llvm.trap() noreturn nounwind
+
+;; Helper macro taking an element type and the name of an 8-wide value; it
+;; emits two 4-wide values named <name>_1 (lanes 0-3) and <name>_2 (lanes 4-7).
+define(`extract_4s', `
+  %$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+')
+
+;; LLVM 3.0 and 3.1 use the factored generic gather macros for every element
+;; type.  Any other release uses the generic gather macro only for i8 and
+;; i16 and gets the hand-written AVX2 gathers that follow for the 32-bit and
+;; 64-bit element types.
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)',
+LLVM_VERSION, `LLVM_3_1', `
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)', `
+
+gen_gather(i8)
+gen_gather(i16)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int32 gathers
+
+;; AVX2 gather intrinsics for i32 elements.  The d.d form takes eight 32-bit
+;; indices and returns eight values; the q.d form takes four 64-bit indices
+;; and returns four values.
+declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
+                       <8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
+declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
+
+;; Masked 8-wide gather of i32 values in a single AVX2 gather call, with %ptr
+;; as the base, %offsets as the 32-bit indices and %scale (truncated to i8)
+;; as the scale operand.
+define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale,
+                             <8 x i32> %offsets,
+                             <8 x i32> %vecmask) nounwind readonly alwaysinline {
+  %scale_i8 = trunc i32 %scale to i8
+  %gathered = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(
+                  <8 x i32> undef, i8 * %ptr, <8 x i32> %offsets,
+                  <8 x i32> %vecmask, i8 %scale_i8)
+  ret <8 x i32> %gathered
+}
+
+
+;; Masked 8-wide gather of i32 values with 64-bit offsets from %ptr.  The
+;; offsets and the mask are split into two 4-wide halves, each half is
+;; gathered separately, and the two results are concatenated.
+define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr, i32 %scale,
+                             <8 x i64> %offsets,
+                             <8 x i32> %vecmask) nounwind readonly alwaysinline {
+  %scale_i8 = trunc i32 %scale to i8
+  extract_4s(i64, offsets)
+  extract_4s(i32, vecmask)
+
+  %lo = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(
+            <4 x i32> undef, i8 * %ptr, <4 x i64> %offsets_1,
+            <4 x i32> %vecmask_1, i8 %scale_i8)
+  %hi = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(
+            <4 x i32> undef, i8 * %ptr, <4 x i64> %offsets_2,
+            <4 x i32> %vecmask_2, i8 %scale_i8)
+
+  %joined = shufflevector <4 x i32> %lo, <4 x i32> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %joined
+}
+
+
+;; Masked 8-wide gather of i32 values through 32-bit per-lane pointers, which
+;; are passed as the indices of one AVX2 gather with a null base and scale 1.
+define <8 x i32> @__gather32_i32(<8 x i32> %ptrs,
+                                 <8 x i32> %vecmask) nounwind readonly alwaysinline {
+  %gathered = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(
+                  <8 x i32> undef, i8 * null, <8 x i32> %ptrs,
+                  <8 x i32> %vecmask, i8 1)
+  ret <8 x i32> %gathered
+}
+
+
+;; Masked 8-wide gather of i32 values through 64-bit per-lane pointers.  The
+;; pointers and the mask are split into two 4-wide halves; each half is
+;; gathered with a null base and scale 1 and the results are concatenated.
+define <8 x i32> @__gather64_i32(<8 x i64> %ptrs,
+                                 <8 x i32> %vecmask) nounwind readonly alwaysinline {
+  extract_4s(i32, vecmask)
+  extract_4s(i64, ptrs)
+
+  %lo = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(
+            <4 x i32> undef, i8 * null, <4 x i64> %ptrs_1,
+            <4 x i32> %vecmask_1, i8 1)
+  %hi = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(
+            <4 x i32> undef, i8 * null, <4 x i64> %ptrs_2,
+            <4 x i32> %vecmask_2, i8 1)
+
+  %joined = shufflevector <4 x i32> %lo, <4 x i32> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %joined
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float gathers
+
+;; AVX2 gather intrinsics for float elements.  The d.ps form takes eight
+;; 32-bit indices and returns eight values; the q.ps form takes four 64-bit
+;; indices and returns four values.  The mask operand is a float vector.
+declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
+                       <8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
+declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
+
+;; Masked 8-wide gather of floats in a single AVX2 gather call, with %ptr as
+;; the base and %offsets as the 32-bit indices.  The i32 mask is reinterpreted
+;; as a float vector because that is the mask type the intrinsic takes.
+define <8 x float> @__gather_base_offsets32_float(i8 * %ptr, i32 %scale,
+                                  <8 x i32> %offsets,
+                                  <8 x i32> %vecmask) nounwind readonly alwaysinline {
+  %fmask = bitcast <8 x i32> %vecmask to <8 x float>
+  %scale_i8 = trunc i32 %scale to i8
+  %gathered = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(
+                  <8 x float> undef, i8 * %ptr, <8 x i32> %offsets,
+                  <8 x float> %fmask, i8 %scale_i8)
+  ret <8 x float> %gathered
+}
+
+
+;; Masked 8-wide gather of floats with 64-bit offsets from %ptr.  The offsets
+;; and the float-typed mask are split into two 4-wide halves, each half is
+;; gathered separately, and the two results are concatenated.
+define <8 x float> @__gather_base_offsets64_float(i8 * %ptr, i32 %scale,
+                                   <8 x i64> %offsets,
+                                   <8 x i32> %vecmask) nounwind readonly alwaysinline {
+  %fmask = bitcast <8 x i32> %vecmask to <8 x float>
+  %scale_i8 = trunc i32 %scale to i8
+  extract_4s(float, fmask)
+  extract_4s(i64, offsets)
+
+  %lo = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
+            <4 x float> undef, i8 * %ptr, <4 x i64> %offsets_1,
+            <4 x float> %fmask_1, i8 %scale_i8)
+  %hi = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
+            <4 x float> undef, i8 * %ptr, <4 x i64> %offsets_2,
+            <4 x float> %fmask_2, i8 %scale_i8)
+
+  %joined = shufflevector <4 x float> %lo, <4 x float> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %joined
+}
+
+
+;; Masked 8-wide gather of floats through 32-bit per-lane pointers, which are
+;; passed as the indices of one AVX2 gather with a null base and scale 1.
+define <8 x float> @__gather32_float(<8 x i32> %ptrs,
+                                     <8 x i32> %vecmask) nounwind readonly alwaysinline {
+  %fmask = bitcast <8 x i32> %vecmask to <8 x float>
+  %gathered = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(
+                  <8 x float> undef, i8 * null, <8 x i32> %ptrs,
+                  <8 x float> %fmask, i8 1)
+  ret <8 x float> %gathered
+}
+
+
+;; Masked 8-wide gather of floats through 64-bit per-lane pointers.  The
+;; pointers and the float-typed mask are split into two 4-wide halves; each
+;; half is gathered with a null base and scale 1 and the results are
+;; concatenated.
+define <8 x float> @__gather64_float(<8 x i64> %ptrs,
+                                     <8 x i32> %vecmask) nounwind readonly alwaysinline {
+  %fmask = bitcast <8 x i32> %vecmask to <8 x float>
+  extract_4s(float, fmask)
+  extract_4s(i64, ptrs)
+
+  %lo = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
+            <4 x float> undef, i8 * null, <4 x i64> %ptrs_1,
+            <4 x float> %fmask_1, i8 1)
+  %hi = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(
+            <4 x float> undef, i8 * null, <4 x i64> %ptrs_2,
+            <4 x float> %fmask_2, i8 1)
+
+  %joined = shufflevector <4 x float> %lo, <4 x float> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %joined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int64 gathers
+
+;; AVX2 gather intrinsics returning four i64 values per call; the d.q form
+;; takes 32-bit indices and the q.q form takes 64-bit indices.  The mask
+;; operand is a vector of four i64.
+declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
+                       <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
+declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
+
+;; Masked 8-wide gather of i64 values with 32-bit offsets from %ptr.  The i32
+;; mask is sign-extended to i64; offsets and mask are split into two 4-wide
+;; halves, gathered separately, and the results are concatenated.
+define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr, i32 %scale,
+                             <8 x i32> %offsets,
+                             <8 x i32> %mask32) nounwind readonly alwaysinline {
+  %lanemask = sext <8 x i32> %mask32 to <8 x i64>
+  %scale_i8 = trunc i32 %scale to i8
+  extract_4s(i64, lanemask)
+  extract_4s(i32, offsets)
+
+  %lo = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
+            <4 x i64> undef, i8 * %ptr, <4 x i32> %offsets_1,
+            <4 x i64> %lanemask_1, i8 %scale_i8)
+  %hi = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
+            <4 x i64> undef, i8 * %ptr, <4 x i32> %offsets_2,
+            <4 x i64> %lanemask_2, i8 %scale_i8)
+
+  %joined = shufflevector <4 x i64> %lo, <4 x i64> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i64> %joined
+}
+
+
+;; Masked 8-wide gather of i64 values with 64-bit offsets from %ptr.  The i32
+;; mask is sign-extended to i64; offsets and mask are split into two 4-wide
+;; halves, gathered separately, and the results are concatenated.
+define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr, i32 %scale,
+                             <8 x i64> %offsets,
+                             <8 x i32> %mask32) nounwind readonly alwaysinline {
+  %lanemask = sext <8 x i32> %mask32 to <8 x i64>
+  %scale_i8 = trunc i32 %scale to i8
+  extract_4s(i64, lanemask)
+  extract_4s(i64, offsets)
+
+  %lo = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
+            <4 x i64> undef, i8 * %ptr, <4 x i64> %offsets_1,
+            <4 x i64> %lanemask_1, i8 %scale_i8)
+  %hi = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
+            <4 x i64> undef, i8 * %ptr, <4 x i64> %offsets_2,
+            <4 x i64> %lanemask_2, i8 %scale_i8)
+
+  %joined = shufflevector <4 x i64> %lo, <4 x i64> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i64> %joined
+}
+
+
+;; Masked 8-wide gather of i64 values through 32-bit per-lane pointers.  The
+;; pointers and the sign-extended mask are split into two 4-wide halves; each
+;; half is gathered with a null base and scale 1 and the results are
+;; concatenated.
+define <8 x i64> @__gather32_i64(<8 x i32> %ptrs,
+                                 <8 x i32> %mask32) nounwind readonly alwaysinline {
+  %lanemask = sext <8 x i32> %mask32 to <8 x i64>
+  extract_4s(i64, lanemask)
+  extract_4s(i32, ptrs)
+
+  %lo = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
+            <4 x i64> undef, i8 * null, <4 x i32> %ptrs_1,
+            <4 x i64> %lanemask_1, i8 1)
+  %hi = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(
+            <4 x i64> undef, i8 * null, <4 x i32> %ptrs_2,
+            <4 x i64> %lanemask_2, i8 1)
+
+  %joined = shufflevector <4 x i64> %lo, <4 x i64> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i64> %joined
+}
+
+
+;; Masked 8-wide gather of i64 values through 64-bit per-lane pointers.  The
+;; pointers and the sign-extended mask are split into two 4-wide halves; each
+;; half is gathered with a null base and scale 1 and the results are
+;; concatenated.
+define <8 x i64> @__gather64_i64(<8 x i64> %ptrs,
+                                 <8 x i32> %mask32) nounwind readonly alwaysinline {
+  %lanemask = sext <8 x i32> %mask32 to <8 x i64>
+  extract_4s(i64, lanemask)
+  extract_4s(i64, ptrs)
+
+  %lo = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
+            <4 x i64> undef, i8 * null, <4 x i64> %ptrs_1,
+            <4 x i64> %lanemask_1, i8 1)
+  %hi = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(
+            <4 x i64> undef, i8 * null, <4 x i64> %ptrs_2,
+            <4 x i64> %lanemask_2, i8 1)
+
+  %joined = shufflevector <4 x i64> %lo, <4 x i64> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i64> %joined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double gathers
+
+;; AVX2 gather intrinsics returning four doubles per call; the q.pd form
+;; takes 64-bit indices and the d.pd form takes 32-bit indices.  The mask
+;; operand is a vector of four doubles.
+declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
+                       <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
+declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
+                       <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
+
+;; Masked 8-wide gather of doubles with 32-bit offsets from %ptr.  The i32
+;; mask is sign-extended to 64 bits and reinterpreted as double; offsets and
+;; mask are split into two 4-wide halves, gathered separately, and the
+;; results are concatenated.
+define <8 x double> @__gather_base_offsets32_double(i8 * %ptr, i32 %scale,
+                             <8 x i32> %offsets,
+                             <8 x i32> %mask32) nounwind readonly alwaysinline {
+  %mask64 = sext <8 x i32> %mask32 to <8 x i64>
+  %maskpd = bitcast <8 x i64> %mask64 to <8 x double>
+  %scale_i8 = trunc i32 %scale to i8
+  extract_4s(double, maskpd)
+  extract_4s(i32, offsets)
+
+  %lo = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_1,
+            <4 x double> %maskpd_1, i8 %scale_i8)
+  %hi = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i32> %offsets_2,
+            <4 x double> %maskpd_2, i8 %scale_i8)
+
+  %joined = shufflevector <4 x double> %lo, <4 x double> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %joined
+}
+
+;; Masked 8-wide gather of doubles with 64-bit offsets from %ptr.  The i32
+;; mask is sign-extended to 64 bits and reinterpreted as double; offsets and
+;; mask are split into two 4-wide halves, gathered separately, and the
+;; results are concatenated.
+define <8 x double> @__gather_base_offsets64_double(i8 * %ptr, i32 %scale,
+                             <8 x i64> %offsets,
+                             <8 x i32> %mask32) nounwind readonly alwaysinline {
+  %mask64 = sext <8 x i32> %mask32 to <8 x i64>
+  %maskpd = bitcast <8 x i64> %mask64 to <8 x double>
+  %scale_i8 = trunc i32 %scale to i8
+  extract_4s(double, maskpd)
+  extract_4s(i64, offsets)
+
+  %lo = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_1,
+            <4 x double> %maskpd_1, i8 %scale_i8)
+  %hi = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * %ptr, <4 x i64> %offsets_2,
+            <4 x double> %maskpd_2, i8 %scale_i8)
+
+  %joined = shufflevector <4 x double> %lo, <4 x double> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %joined
+}
+
+;; Masked 8-wide gather of doubles through 32-bit per-lane pointers.  The
+;; pointers and the double-typed mask are split into two 4-wide halves; each
+;; half is gathered with a null base and scale 1 and the results are
+;; concatenated.
+define <8 x double> @__gather32_double(<8 x i32> %ptrs,
+                                       <8 x i32> %mask32) nounwind readonly alwaysinline {
+  %mask64 = sext <8 x i32> %mask32 to <8 x i64>
+  %maskpd = bitcast <8 x i64> %mask64 to <8 x double>
+  extract_4s(double, maskpd)
+  extract_4s(i32, ptrs)
+
+  %lo = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * null, <4 x i32> %ptrs_1,
+            <4 x double> %maskpd_1, i8 1)
+  %hi = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(
+            <4 x double> undef, i8 * null, <4 x i32> %ptrs_2,
+            <4 x double> %maskpd_2, i8 1)
+
+  %joined = shufflevector <4 x double> %lo, <4 x double> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %joined
+}
+
+;; Masked 8-wide gather of doubles through 64-bit per-lane pointers.  The
+;; pointers and the double-typed mask are split into two 4-wide halves; each
+;; half is gathered with a null base and scale 1 and the results are
+;; concatenated.
+define <8 x double> @__gather64_double(<8 x i64> %ptrs,
+                                       <8 x i32> %mask32) nounwind readonly alwaysinline {
+  %mask64 = sext <8 x i32> %mask32 to <8 x i64>
+  %maskpd = bitcast <8 x i64> %mask64 to <8 x double>
+  extract_4s(double, maskpd)
+  extract_4s(i64, ptrs)
+
+  %lo = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * null, <4 x i64> %ptrs_1,
+            <4 x double> %maskpd_1, i8 1)
+  %hi = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(
+            <4 x double> undef, i8 * null, <4 x i64> %ptrs_2,
+            <4 x double> %maskpd_2, i8 1)
+
+  %joined = shufflevector <4 x double> %lo, <4 x double> %hi,
+            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %joined
+}
+
+')
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -0,0 +1,955 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Define the standard library builtins for the NOVEC target
+;; Target parameters read by the shared macros: the mask element type is i32
+;; and the vector width is 1.  They are set before the macro file is pulled in.
+define(`MASK',`i32')
+define(`WIDTH',`1')
+include(`util.m4')
+; Define some basics for a 1-wide target
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+aossoa()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+; one instantiation of the shared macro per integer element type
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+
+; one instantiation per element type; NOTE(review): the second argument
+; matches the element size in bytes and is presumably used as such - confirm
+; against the macro definition
+masked_load(i8,  1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(float, 4)
+masked_load(i64, 8)
+masked_load(double, 8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; define these with the macros from util.m4
+
+; one factored gather instantiation per element type
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
+
+; one scatter instantiation per element type
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
+
+
+;; Select for a 1-wide i8 vector: yields the first operand when the mask lane
+;; is zero and the second operand otherwise.  The lanes are extracted and
+;; selected as scalars rather than with a vector select because of problems
+;; with the LLVM scalarizer.
+define  <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
+                               <1 x i32> %mask) nounwind readnone alwaysinline {
+    %old = extractelement <1 x i8> %0, i32 0
+    %new = extractelement <1 x i8> %1, i32 0
+    %lane = extractelement <1 x i32> %mask, i32 0
+    %on = icmp ne i32 %lane, 0
+    %pick = select i1 %on, i8 %new, i8 %old
+    %vec = insertelement <1 x i8> undef, i8 %pick, i32 0
+    ret <1 x i8> %vec
+}
+
+;; Select for a 1-wide i16 vector: yields the first operand when the mask
+;; lane is zero and the second operand otherwise.  The select is done on the
+;; extracted scalars and the result is put back into a 1-wide vector.
+define  <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
+                                 <1 x i32> %mask) nounwind readnone alwaysinline {
+    %old = extractelement <1 x i16> %0, i32 0
+    %new = extractelement <1 x i16> %1, i32 0
+    %lane = extractelement <1 x i32> %mask, i32 0
+    %on = icmp ne i32 %lane, 0
+    %pick = select i1 %on, i16 %new, i16 %old
+    %vec = insertelement <1 x i16> undef, i16 %pick, i32 0
+    ret <1 x i16> %vec
+}
+
+
+;; Select for a 1-wide i32 vector: yields the first operand when the mask
+;; lane is zero and the second operand otherwise.  The select is done on the
+;; extracted scalars and the result is put back into a 1-wide vector.
+define  <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
+                                 <1 x i32> %mask) nounwind readnone alwaysinline {
+    %old = extractelement <1 x i32> %0, i32 0
+    %new = extractelement <1 x i32> %1, i32 0
+    %lane = extractelement <1 x i32> %mask, i32 0
+    %on = icmp ne i32 %lane, 0
+    %pick = select i1 %on, i32 %new, i32 %old
+    %vec = insertelement <1 x i32> undef, i32 %pick, i32 0
+    ret <1 x i32> %vec
+}
+
+;; Select for a 1-wide i64 vector: yields the first operand when the i32 mask
+;; lane is zero and the second operand otherwise.  The select is done on the
+;; extracted scalars and the result is put back into a 1-wide vector.
+define  <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
+                                 <1 x i32> %mask) nounwind readnone alwaysinline {
+    %old = extractelement <1 x i64> %0, i32 0
+    %new = extractelement <1 x i64> %1, i32 0
+    %lane = extractelement <1 x i32> %mask, i32 0
+    %on = icmp ne i32 %lane, 0
+    %pick = select i1 %on, i64 %new, i64 %old
+    %vec = insertelement <1 x i64> undef, i64 %pick, i32 0
+    ret <1 x i64> %vec
+}
+
+;; Select for a 1-wide float vector: yields the first operand when the i32
+;; mask lane is zero and the second operand otherwise.  The select is done on
+;; the extracted scalars and the result is put back into a 1-wide vector.
+define  <1 x float> @__vselect_float(<1 x float>, <1 x float>,
+                                     <1 x i32> %mask) nounwind readnone alwaysinline {
+    %old = extractelement <1 x float> %0, i32 0
+    %new = extractelement <1 x float> %1, i32 0
+    %lane = extractelement <1 x i32> %mask, i32 0
+    %on = icmp ne i32 %lane, 0
+    %pick = select i1 %on, float %new, float %old
+    %vec = insertelement <1 x float> undef, float %pick, i32 0
+    ret <1 x float> %vec
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+;; Blend store for a 1-wide i8 vector: loads the value currently at the
+;; destination, selects between it and the new value with __vselect_i8
+;; according to the mask, and stores the result back.
+;; The accesses use align 1, the most that can be assumed for a single i8
+;; slot; stating a larger alignment than the pointer really has is undefined
+;; behavior in LLVM, while a smaller one is always safe.
+define void @__masked_store_blend_i8(<1 x i8>* nocapture, <1 x i8>,
+                                     <1 x i32> %mask) nounwind alwaysinline {
+  %val = load <1 x i8> * %0, align 1
+  %newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask)
+  store <1 x i8> %newval, <1 x i8> * %0, align 1
+  ret void
+}
+
+;; Blend store for a 1-wide i16 vector: loads the value currently at the
+;; destination, selects between it and the new value with __vselect_i16
+;; according to the mask, and stores the result back.
+;; The accesses use align 2, the most that can be assumed for a single i16
+;; slot; stating a larger alignment than the pointer really has is undefined
+;; behavior in LLVM, while a smaller one is always safe.
+define void @__masked_store_blend_i16(<1 x i16>* nocapture, <1 x i16>,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  %val = load <1 x i16> * %0, align 2
+  %newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
+  store <1 x i16> %newval, <1 x i16> * %0, align 2
+  ret void
+}
+
+;; Blend store for a 1-wide i32 vector: loads the value currently at the
+;; destination, selects between it and the new value with __vselect_i32
+;; according to the mask, and stores the result back.
+define void @__masked_store_blend_i32(<1 x i32>* nocapture, <1 x i32>,
+                                     <1 x i32> %mask) nounwind alwaysinline {
+  %current = load <1 x i32> * %0, align 4
+  %blended = call <1 x i32> @__vselect_i32(<1 x i32> %current, <1 x i32> %1,
+                                           <1 x i32> %mask)
+  store <1 x i32> %blended, <1 x i32> * %0, align 4
+  ret void
+}
+
+define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  ; read-modify-write: merge the incoming value with what is already in
+  ; memory under control of the mask, then write the merged value back
+  %old = load <1 x i64> * %0, align 4
+  %merged = call <1 x i64> @__vselect_i64(<1 x i64> %old, <1 x i64> %1, <1 x i32> %mask)
+  store <1 x i64> %merged, <1 x i64> * %0, align 4
+  ret void
+}
+
+masked_store_float_double()
+
+define  i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
+  ; the sign bit of the only lane, widened to 64 bits, is the whole result
+  %lane = extractelement <1 x i32> %0, i32 0
+  %signbit = lshr i32 %lane, 31
+  %bits = zext i32 %signbit to i64
+  ret i64 %bits
+}
+
+define  i1 @__any(<1 x i32>) nounwind readnone alwaysinline {
+  ; true when the sign bit of the single lane is set
+  %lane = extractelement <1 x i32> %0, i32 0
+  %signbit = lshr i32 %lane, 31
+  %is_set = icmp ne i32 %signbit, 0
+  ret i1 %is_set
+}
+
+define  i1 @__all(<1 x i32>) nounwind readnone alwaysinline {
+  ; with one lane this is the same test: the sign bit must be set
+  %lane = extractelement <1 x i32> %0, i32 0
+  %signbit = lshr i32 %lane, 31
+  %is_set = icmp eq i32 %signbit, 1
+  ret i1 %is_set
+}
+
+define  i1 @__none(<1 x i32>) nounwind readnone alwaysinline {
+  ; true when the sign bit of the single lane is clear
+  %lane = extractelement <1 x i32> %0, i32 0
+  %signbit = lshr i32 %lane, 31
+  %is_clear = icmp eq i32 %signbit, 0
+  ret i1 %is_clear
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding
+;;
+;; There are not any rounding instructions in SSE2, so we have to emulate
+;; the functionality with multiple instructions...
+
+; The code for __round_* is the result of compiling the following source
+; code.
+;
+; export float Round(float x) {
+;    unsigned int sign = signbits(x);
+;    unsigned int ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    x += 0x1.0p23f;
+;    x -= 0x1.0p23f;
+;    ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    return x;
+;}
+
+define  <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  ; strip the sign bit, add and then subtract 2 to the 23rd so the
+  ; fraction bits are rounded away, and finally put the sign bit back
+  %xbits = bitcast <1 x float> %0 to <1 x i32>
+  %sign = and <1 x i32> %xbits, <i32 -2147483648>
+  %absbits = xor <1 x i32> %xbits, %sign
+  %absval = bitcast <1 x i32> %absbits to <1 x float>
+  %bumped = fadd <1 x float> %absval, <float 8.388608e+06>
+  %rounded = fadd <1 x float> %bumped, <float -8.388608e+06>
+  %roundedbits = bitcast <1 x float> %rounded to <1 x i32>
+  %signedbits = xor <1 x i32> %roundedbits, %sign
+  %result = bitcast <1 x i32> %signedbits to <1 x float>
+  ret <1 x float> %result
+}
+
+;; Similarly, for implementations of the __floor* functions below, we have the
+;; bitcode from compiling the following source code...
+
+;export float Floor(float x) {
+;    float y = Round(x);
+;    unsigned int cmp = y > x ? 0xffffffff : 0;
+;    float delta = -1.f;
+;    unsigned int idelta = intbits(delta);
+;    idelta &= cmp;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+define  <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  ; round to nearest, then add -1.0 when the rounded value ended up
+  ; above the input.  the constant is the bit pattern of -1.0, which
+  ; the all-ones or all-zeros compare result either keeps or clears.
+  %rounded = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
+  %too_big = fcmp ogt <1 x float> %rounded, %0
+  %lane_mask = sext <1 x i1> %too_big to <1 x i32>
+  %delta_bits = and <1 x i32> %lane_mask, <i32 -1082130432>
+  %delta = bitcast <1 x i32> %delta_bits to <1 x float>
+  %result = fadd <1 x float> %rounded, %delta
+  ret <1 x float> %result
+}
+
+;; And here is the code we compiled to get the __ceil* functions below
+;
+;export uniform float Ceil(uniform float x) {
+;    uniform float y = Round(x);
+;    uniform int yltx = y < x ? 0xffffffff : 0;
+;    uniform float delta = 1.f;
+;    uniform int idelta = intbits(delta);
+;    idelta &= yltx;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+define  <1 x float> @__ceil_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  ; round to nearest, then add 1.0 when the rounded value ended up
+  ; below the input.  the constant is the bit pattern of 1.0, which
+  ; the all-ones or all-zeros compare result either keeps or clears.
+  %rounded = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
+  %too_small = fcmp olt <1 x float> %rounded, %0
+  %lane_mask = sext <1 x i1> %too_small to <1 x i32>
+  %delta_bits = and <1 x i32> %lane_mask, <i32 1065353216>
+  %delta = bitcast <1 x i32> %delta_bits to <1 x float>
+  %result = fadd <1 x float> %rounded, %delta
+  ret <1 x float> %result
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+; expecting math lib to provide this
+declare double @ceil (double) nounwind readnone
+declare double @floor (double) nounwind readnone
+declare double @round (double) nounwind readnone
+;declare float     @llvm.sqrt.f32(float %Val)
+declare double    @llvm.sqrt.f64(double %Val)
+declare float     @llvm.sin.f32(float %Val)
+declare float     @llvm.cos.f32(float %Val)
+declare float     @llvm.sqrt.f32(float %Val)
+declare float     @llvm.exp.f32(float %Val)
+declare float     @llvm.log.f32(float %Val)
+declare float     @llvm.pow.f32(float %f, float %e)
+
+
+
+
+;; stuff that could be in builtins ...
+
+;; m4 helper that expands to a complete function body: pull the scalar
+;; out of the unnamed 1-wide vector argument, call the scalar function
+;; given as the second macro argument on it, and return the result
+;; rewrapped as a 1-wide vector.  The first macro argument is the
+;; element type.
+define(`unary1to1', `
+  %v_0 = extractelement <1 x $1> %0, i32 0
+  %r_0 = call $1 $2($1 %v_0)
+  %ret_0 = insertelement <1 x $1> undef, $1 %r_0, i32 0
+  ret <1 x $1> %ret_0
+')
+
+
+
+;; dummy 1 wide vector ops
+;; With a single lane there is nothing to transpose: each of the four
+;; input vectors is written unchanged to the matching output pointer.
+define  void
+@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
+        <1 x float> %v3, <1 x float> * noalias %out0, 
+        <1 x float> * noalias %out1, <1 x float> * noalias %out2, 
+        <1 x float> * noalias %out3) nounwind alwaysinline { 
+
+  store <1 x float> %v0, <1 x float > * %out0
+  store <1 x float> %v1, <1 x float > * %out1
+  store <1 x float> %v2, <1 x float > * %out2
+  store <1 x float> %v3, <1 x float > * %out3
+
+  ret void
+}
+
+;; The reverse conversion is the same copy for one lane, so this simply
+;; forwards all eight arguments to the four component routine above.
+define  void
+@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
+        <1 x float> %v3, <1 x float> * noalias %out0, 
+        <1 x float> * noalias %out1, <1 x float> * noalias %out2, 
+        <1 x float> * noalias %out3) nounwind alwaysinline { 
+  call void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, 
+    <1 x float> %v2, <1 x float> %v3, <1 x float> * %out0, 
+    <1 x float> * %out1, <1 x float> * %out2, <1 x float> * %out3)
+  ret void
+}
+
+;; With a single lane there is nothing to transpose: each of the three
+;; input vectors is written unchanged to the matching output pointer.
+;; Carries the same nounwind and alwaysinline attributes as the four
+;; component variant in this file.
+define  void
+@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
+         <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
+         <1 x float> * %out2) nounwind alwaysinline {
+  store <1 x float> %v0, <1 x float > * %out0
+  store <1 x float> %v1, <1 x float > * %out1
+  store <1 x float> %v2, <1 x float > * %out2
+
+  ret void
+}
+
+;; The reverse conversion is the same copy for one lane, so this simply
+;; forwards all six arguments to the three component routine.
+;; Carries the same nounwind and alwaysinline attributes as the four
+;; component variant in this file.
+define  void
+@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1,
+         <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
+         <1 x float> * %out2) nounwind alwaysinline {
+  call void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
+         <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
+         <1 x float> * %out2)
+  ret void
+}
+
+
+;; end builtins
+
+
+define  <1 x double> @__round_varying_double(<1 x double>) nounwind readonly alwaysinline {
+  ; hands the single lane to the scalar round routine declared earlier
+  unary1to1(double, @round)
+}
+
+define  <1 x double> @__floor_varying_double(<1 x double>) nounwind readonly alwaysinline {
+  ; hands the single lane to the scalar floor routine declared earlier
+  unary1to1(double, @floor)
+}
+
+
+define  <1 x double> @__ceil_varying_double(<1 x double>) nounwind readonly alwaysinline {
+  ; hands the single lane to the scalar ceil routine declared earlier
+  unary1to1(double, @ceil)
+}
+
+; To do vector integer min and max, we do the vector compare and then sign
+; extend the i1 vector result to an i32 mask.  The __vselect does the
+; rest...
+
+define  <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  ; signed compare, widened to a 32-bit lane mask that drives the select
+  %a_lt_b = icmp slt <1 x i32> %0, %1
+  %pick_a = sext <1 x i1> %a_lt_b to <1 x i32>
+  %lo = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %pick_a)
+  ret <1 x i32> %lo
+}
+
+define  i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  ; signed compare: keep the first operand only when strictly smaller
+  %a_lt_b = icmp slt i32 %0, %1
+  %lo = select i1 %a_lt_b, i32 %0, i32 %1
+  ret i32 %lo
+}
+
+define  <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  ; signed compare, widened to a 32-bit lane mask that drives the select
+  %a_gt_b = icmp sgt <1 x i32> %0, %1
+  %pick_a = sext <1 x i1> %a_gt_b to <1 x i32>
+  %hi = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %pick_a)
+  ret <1 x i32> %hi
+}
+
+define  i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  ; signed compare: keep the first operand only when strictly larger
+  %a_gt_b = icmp sgt i32 %0, %1
+  %hi = select i1 %a_gt_b, i32 %0, i32 %1
+  ret i32 %hi
+}
+
+; The functions for unsigned ints are similar, just with unsigned
+; comparison functions...
+
+define  <1 x i32> @__min_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  ; unsigned compare, widened to a 32-bit lane mask that drives the select
+  %a_lt_b = icmp ult <1 x i32> %0, %1
+  %pick_a = sext <1 x i1> %a_lt_b to <1 x i32>
+  %lo = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %pick_a)
+  ret <1 x i32> %lo
+}
+
+define  i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  ; unsigned compare: keep the first operand only when strictly smaller
+  %a_lt_b = icmp ult i32 %0, %1
+  %lo = select i1 %a_lt_b, i32 %0, i32 %1
+  ret i32 %lo
+}
+
+define  <1 x i32> @__max_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  ; unsigned compare, widened to a 32-bit lane mask that drives the select
+  %a_gt_b = icmp ugt <1 x i32> %0, %1
+  %pick_a = sext <1 x i1> %a_gt_b to <1 x i32>
+  %hi = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %pick_a)
+  ret <1 x i32> %hi
+}
+
+define  i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  ; unsigned compare: keep the first operand only when strictly larger
+  %a_gt_b = icmp ugt i32 %0, %1
+  %hi = select i1 %a_gt_b, i32 %0, i32 %1
+  ret i32 %hi
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+
+define  i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+  ; number of set bits, straight from the 32-bit ctpop intrinsic
+  %nbits = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %nbits
+}
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define  i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  ; number of set bits, straight from the 64-bit ctpop intrinsic
+  %nbits = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %nbits
+}
+
+
+define  float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
+  ; the sum over one lane is that lane
+  %only = extractelement <1 x float> %v, i32 0
+  ret float %only
+}
+
+define  float @__reduce_min_float(<1 x float>) nounwind readnone {
+  ; the minimum over one lane is that lane
+  %only = extractelement <1 x float> %0, i32 0
+  ret float %only
+}
+
+define  float @__reduce_max_float(<1 x float>) nounwind readnone {
+  ; the maximum over one lane is that lane
+  %only = extractelement <1 x float> %0, i32 0
+  ret float %only
+}
+
+define  i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone {
+  ; the sum over one lane is that lane
+  %only = extractelement <1 x i32> %v, i32 0
+  ret i32 %only
+}
+
+define  i32 @__reduce_min_int32(<1 x i32>) nounwind readnone {
+  ; the minimum over one lane is that lane
+  %only = extractelement <1 x i32> %0, i32 0
+  ret i32 %only
+}
+
+define  i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
+  ; the maximum over one lane is that lane
+  %only = extractelement <1 x i32> %0, i32 0
+  ret i32 %only
+}
+
+define  i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
+  ; the unsigned minimum over one lane is that lane
+  %only = extractelement <1 x i32> %0, i32 0
+  ret i32 %only
+}
+
+define  i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone {
+  ; the unsigned maximum over one lane is that lane
+  %only = extractelement <1 x i32> %0, i32 0
+  ret i32 %only
+}
+
+
+define  double @__reduce_add_double(<1 x double>) nounwind readnone {
+  ; the sum over one lane is that lane
+  %only = extractelement <1 x double> %0, i32 0
+  ret double %only
+}
+
+define  double @__reduce_min_double(<1 x double>) nounwind readnone {
+  ; the minimum over one lane is that lane
+  %only = extractelement <1 x double> %0, i32 0
+  ret double %only
+}
+
+define  double @__reduce_max_double(<1 x double>) nounwind readnone {
+  ; the maximum over one lane is that lane
+  %only = extractelement <1 x double> %0, i32 0
+  ret double %only
+}
+
+define  i64 @__reduce_add_int64(<1 x i64>) nounwind readnone {
+  ; the sum over one lane is that lane
+  %only = extractelement <1 x i64> %0, i32 0
+  ret i64 %only
+}
+
+define  i64 @__reduce_min_int64(<1 x i64>) nounwind readnone {
+  ; the minimum over one lane is that lane
+  %only = extractelement <1 x i64> %0, i32 0
+  ret i64 %only
+}
+
+define  i64 @__reduce_max_int64(<1 x i64>) nounwind readnone {
+  ; the maximum over one lane is that lane
+  %only = extractelement <1 x i64> %0, i32 0
+  ret i64 %only
+}
+
+define  i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone {
+  ; the unsigned minimum over one lane is that lane
+  %only = extractelement <1 x i64> %0, i32 0
+  ret i64 %only
+}
+
+define  i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone {
+  ; the unsigned maximum over one lane is that lane
+  %only = extractelement <1 x i64> %0, i32 0
+  ret i64 %only
+}
+
+define  i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  ; one lane always agrees with itself: publish its value through the
+  ; pointer and report true.  the mask argument is not consulted.
+  %lane = extractelement <1 x i32> %vv, i32 0
+  store i32 %lane, i32 * %samevalue
+  ret i1 true
+}
+
+define  i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  ; one lane always agrees with itself: publish its value through the
+  ; pointer and report true.  the mask argument is not consulted.
+  %lane = extractelement <1 x float> %vv, i32 0
+  store float %lane, float * %samevalue
+  ret i1 true
+}
+
+define  i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  ; one lane always agrees with itself: publish its value through the
+  ; pointer and report true.  the mask argument is not consulted.
+  %lane = extractelement <1 x i64> %vv, i32 0
+  store i64 %lane, i64 * %samevalue
+  ret i1 true
+}
+
+define  i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  ; one lane always agrees with itself: publish its value through the
+  ; pointer and report true.  the mask argument is not consulted.
+  %lane = extractelement <1 x double> %vv, i32 0
+  store double %lane, double * %samevalue
+  ret i1 true
+}
+
+; extracting/reinserting elements because I want to be able to remove vectors later on
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+define  <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  ; plain scalar divide on the single lane; no approximate reciprocal
+  ; followed by a refinement step is used here
+  %x = extractelement <1 x float> %0, i32 0
+  %inv = fdiv float 1.,%x
+  %result = insertelement <1 x float> undef, float %inv, i32 0
+  ret <1 x float> %result
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; sqrt
+
+define  <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  ; unwrap the lane, take the scalar square root intrinsic, rewrap
+  %x = extractelement <1 x float> %0, i32 0
+  %root = call float @llvm.sqrt.f32(float %x)
+  %result = insertelement <1 x float> undef, float %root, i32 0
+  ret <1 x float> %result
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; rsqrt
+
+define  <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {
+  ; composed from the two routines in this file: square root first,
+  ; then the reciprocal of that root
+  %root = call <1 x float> @__sqrt_varying_float(<1 x float> %v)
+  %inv = call <1 x float> @__rcp_varying_float(<1 x float> %root)
+  ret <1 x float> %inv
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; svml stuff
+
+define  <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
+  ; sine of the single lane, computed with the scalar sine intrinsic.
+  ; the commented lines below are earlier variants kept for reference.
+  ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm.sin.f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float,@llvm.sin.f32)
+   
+}
+
+define  <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
+  ; cosine of the single lane, computed with the scalar cosine intrinsic.
+  ; the commented lines below are earlier variants kept for reference.
+  ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm.cos.f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float, @llvm.cos.f32)
+
+}
+
+define  void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind alwaysinline {
+   ; Computes sine and cosine of the first argument and writes them
+   ; through the second and third arguments respectively.
+   ; This function must not be marked readnone: its only effect is the
+   ; two stores below, and a readnone void function may legally be
+   ; treated as dead by the optimizer.
+   %sin = call <1 x float> @__svml_sin (<1 x float> %0)
+   %cos = call <1 x float> @__svml_cos (<1 x float> %0)
+   store <1 x float> %sin, <1 x float> * %1
+   store <1 x float> %cos, <1 x float> * %2
+   ret void
+}
+
+define  <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
+  ; There is no tangent intrinsic to call, so the tangent is formed as
+  ; sine divided by cosine, using the two scalar intrinsics that are
+  ; declared earlier in this file.  Previously this routine returned
+  ; its input unchanged.
+  %x = extractelement <1 x float> %0, i32 0
+  %sx = call float @llvm.sin.f32(float %x)
+  %cx = call float @llvm.cos.f32(float %x)
+  %t = fdiv float %sx, %cx
+  %rv = insertelement <1 x float> undef, float %t, i32 0
+  ret <1 x float> %rv
+}
+
+define  <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
+  ; NOTE: not implemented on this target.  The argument is returned
+  ; unchanged, so callers do NOT get an arctangent.  The commented
+  ; lines are sketches of possible implementations.
+;  %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
+;  ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm_atan_f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  ;unsary1to1(float,@llvm.atan.f32)
+  ;UNSUPPORTED!
+  ret <1 x float > %0
+
+}
+
+define  <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
+  ; NOTE: not implemented on this target.  The first argument is
+  ; returned unchanged and the second is ignored, so callers do NOT get
+  ; an arctangent.  The commented lines are sketches of possible
+  ; implementations.
+  ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
+  ;ret <1 x float> %ret
+  ;%y = extractelement <1 x float> %0, i32 0
+  ;%x = extractelement <1 x float> %1, i32 0
+  ;%q = fdiv float %y, %x
+  ;%a = call float @llvm.atan.f32 (float %q)
+  ;%rv = insertelement <1 x float> undef, float %a, i32 0
+  ;ret <1 x float> %rv
+  ; UNSUPPORTED!
+  ret <1 x float > %0
+}
+
+define  <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
+  ; exponential of the single lane via the scalar exp intrinsic
+  ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  unary1to1(float, @llvm.exp.f32)
+}
+
+define  <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
+  ; logarithm of the single lane via the scalar log intrinsic
+  ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  unary1to1(float, @llvm.log.f32)
+}
+
+define  <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
+  ; unwrap base and exponent, call the scalar pow intrinsic, rewrap
+  %base = extractelement <1 x float> %0, i32 0
+  %expo = extractelement <1 x float> %1, i32 0
+  %power = call float @llvm.pow.f32(float %base,float %expo)
+  %result = insertelement <1 x float> undef, float %power, i32 0
+  ret <1 x float> %result
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+define  <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
+  ; scalar compare and select on the single lane.  the compare is an
+  ; ordered one, so the second operand is returned whenever either
+  ; input is a NaN.
+  %lhs = extractelement <1 x float> %0, i32 0
+  %rhs = extractelement <1 x float> %1, i32 0
+  %lhs_wins = fcmp ogt float %lhs, %rhs
+  %hi = select i1 %lhs_wins, float %lhs, float %rhs
+  %result = insertelement <1 x float> undef, float %hi, i32 0
+  ret <1 x float> %result
+}
+
+define  <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
+  ; scalar compare and select on the single lane.  the compare is an
+  ; ordered one, so the second operand is returned whenever either
+  ; input is a NaN.
+  %lhs = extractelement <1 x float> %0, i32 0
+  %rhs = extractelement <1 x float> %1, i32 0
+  %lhs_wins = fcmp olt float %lhs, %rhs
+  %lo = select i1 %lhs_wins, float %lhs, float %rhs
+  %result = insertelement <1 x float> undef, float %lo, i32 0
+  ret <1 x float> %result
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+;declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+define  <1 x double> @__sqrt_varying_double(<1 x double>) nounwind alwaysinline {
+  ; square root of the single lane via the scalar f64 sqrt intrinsic
+  ;unarya2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ;ret <1 x double> %ret
+  unary1to1(double, @llvm.sqrt.f64)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+;declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+;declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define  <1 x double> @__min_varying_double(<1 x double>, <1 x double>) nounwind readnone {
+  ; scalar compare and select on the single lane.  the compare is an
+  ; ordered one, so the second operand is returned whenever either
+  ; input is a NaN.
+  %lhs = extractelement <1 x double> %0, i32 0
+  %rhs = extractelement <1 x double> %1, i32 0
+  %lhs_wins = fcmp olt double %lhs, %rhs
+  %lo = select i1 %lhs_wins, double %lhs, double %rhs
+  %result = insertelement <1 x double> undef, double %lo, i32 0
+  ret <1 x double> %result
+}
+
+define  <1 x double> @__max_varying_double(<1 x double>, <1 x double>) nounwind readnone {
+  ; scalar compare and select on the single lane.  the compare is an
+  ; ordered one, so the second operand is returned whenever either
+  ; input is a NaN.
+  %lhs = extractelement <1 x double> %0, i32 0
+  %rhs = extractelement <1 x double> %1, i32 0
+  %lhs_wins = fcmp ogt double %lhs, %rhs
+  %hi = select i1 %lhs_wins, double %lhs, double %rhs
+  %result = insertelement <1 x double> undef, double %hi, i32 0
+  ret <1 x double> %result
+}
+
+
+define  float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+  ; plain scalar divide; no approximate reciprocal plus refinement
+  %inv = fdiv float 1.,%0
+  ret float %inv
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+define  float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; No hardware rounding intrinsic is used here.  The scalar is wrapped
+  ; in a 1-wide vector, rounded by the varying float routine, and the
+  ; single lane is extracted again.
+  %v = insertelement<1 x float> undef, float %0, i32 0
+  %rv = call <1 x float> @__round_varying_float(<1 x float> %v)
+  %r=extractelement <1 x float> %rv, i32 0
+  ret float %r
+
+}
+
+define  float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  ; wrap the scalar in a 1-wide vector, reuse the varying routine, unwrap
+  %wrapped = insertelement <1 x float> undef, float %0, i32 0
+  %floored = call <1 x float> @__floor_varying_float(<1 x float> %wrapped)
+  %result = extractelement <1 x float> %floored, i32 0
+  ret float %result
+}
+
+define  float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  ; wrap the scalar in a 1-wide vector, reuse the varying routine, unwrap
+  %wrapped = insertelement <1 x float> undef, float %0, i32 0
+  %ceiled = call <1 x float> @__ceil_varying_float(<1 x float> %wrapped)
+  %result = extractelement <1 x float> %ceiled, i32 0
+  ret float %result
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+
+define  double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  ; thin wrapper over the externally provided scalar round routine
+  %rounded = call double @round(double %0)
+  ret double %rounded
+}
+
+define  double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; thin wrapper over the externally provided scalar floor routine
+  %floored = call double @floor(double %0)
+  ret double %floored
+}
+
+define  double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; thin wrapper over the externally provided scalar ceil routine
+  %ceiled = call double @ceil(double %0)
+  ret double %ceiled
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+
+define  float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ; thin wrapper over the f32 square root intrinsic
+  %root = call float @llvm.sqrt.f32(float %0)
+  ret float %root
+}
+
+define  double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
+  ; thin wrapper over the f64 square root intrinsic
+  %root = call double @llvm.sqrt.f64(double %0)
+  ret double %root
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+
+define  float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ; square root first, then the reciprocal of that root
+  %root = call float @__sqrt_uniform_float(float %0)
+  %inv = call float @__rcp_uniform_float(float %root)
+  ret float %inv
+}
+
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fastmath
+
+
+define  void @__fastmath() nounwind alwaysinline {
+ ; intentionally a no-op on this target: takes nothing, changes nothing,
+ ; and returns immediately
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+
+define  float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  ; ordered compare: the second operand wins on ties and on NaN inputs
+  %first_wins = fcmp ogt float %0, %1
+  %hi = select i1 %first_wins, float %0, float %1
+  ret float %hi
+}
+
+define  float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  ; ordered compare: the second operand wins on ties and on NaN inputs
+  %first_wins = fcmp olt float %0, %1
+  %lo = select i1 %first_wins, float %0, float %1
+  ret float %lo
+}
+define  double @__max_uniform_double(double, double) nounwind readonly alwaysinline {
+  ; ordered compare: the second operand wins on ties and on NaN inputs
+  %first_wins = fcmp ogt double %0, %1
+  %hi = select i1 %first_wins, double %0, double %1
+  ret double %hi
+}
+
+define  double @__min_uniform_double(double, double) nounwind readonly alwaysinline {
+  ; ordered compare: the second operand wins on ties and on NaN inputs
+  %first_wins = fcmp olt double %0, %1
+  %lo = select i1 %first_wins, double %0, double %1
+  ret double %lo
+}
+
+define_shuffles()
+
+ctlztz()
+
+define_prefetches()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
--- a/builtins/target-generic-16.ll
+++ b/builtins/target-generic-16.ll
@@ -0,0 +1,34 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+define(`WIDTH',`16')
+include(`target-generic-common.ll')
+
--- a/builtins/target-generic-32.ll
+++ b/builtins/target-generic-32.ll
@@ -0,0 +1,33 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+define(`WIDTH',`32')
+include(`target-generic-common.ll')
--- a/builtins/target-generic-4.ll
+++ b/builtins/target-generic-4.ll
@@ -0,0 +1,34 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+define(`WIDTH',`4')
+include(`target-generic-common.ll')
+
--- a/builtins/target-generic-64.ll
+++ b/builtins/target-generic-64.ll
@@ -0,0 +1,33 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; Generic target, 64 lanes: fix the vector width, then pull in the
+;; shared generic-target declarations, which are written in terms of it.
+define(`WIDTH',`64')
+include(`target-generic-common.ll')
--- a/builtins/target-generic-8.ll
+++ b/builtins/target-generic-8.ll
@@ -0,0 +1,34 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; Generic target, 8 lanes: fix the vector width, then pull in the
+;; shared generic-target declarations, which are written in terms of it.
+define(`WIDTH',`8')
+include(`target-generic-common.ll')
+
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -0,0 +1,381 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; Data layout shared by all generic targets: little-endian ("e") with
+;; 64-bit pointers ("p:64:64:64").
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32";
+
+;; The per-lane mask element type on the generic targets is a single bit.
+define(`MASK',`i1')
+;; Both flags below are set to 1 before the shared m4 helpers are read.
+;; Presumably they tell those helpers that gather and scatter exist as
+;; external entry points rather than needing emulation -- TODO confirm
+;; in util.m4.
+define(`HAVE_GATHER',`1')
+define(`HAVE_SCATTER',`1')
+
+include(`util.m4')
+
+;; Expand shared helper macros whose definitions live outside this file.
+;; The third one is parameterised on the vector width.
+stdlib_core()
+scans()
+reduce_equal(WIDTH)
+rdrand_decls()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; broadcast/rotate/shuffle
+
+declare <WIDTH x float> @__smear_float(float) nounwind readnone
+declare <WIDTH x double> @__smear_double(double) nounwind readnone
+declare <WIDTH x i8> @__smear_i8(i8) nounwind readnone
+declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
+declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
+declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
+
+declare <WIDTH x float> @__setzero_float() nounwind readnone
+declare <WIDTH x double> @__setzero_double() nounwind readnone
+declare <WIDTH x i8> @__setzero_i8() nounwind readnone
+declare <WIDTH x i16> @__setzero_i16() nounwind readnone
+declare <WIDTH x i32> @__setzero_i32() nounwind readnone
+declare <WIDTH x i64> @__setzero_i64() nounwind readnone
+
+declare <WIDTH x float> @__undef_float() nounwind readnone
+declare <WIDTH x double> @__undef_double() nounwind readnone
+declare <WIDTH x i8> @__undef_i8() nounwind readnone
+declare <WIDTH x i16> @__undef_i16() nounwind readnone
+declare <WIDTH x i32> @__undef_i32() nounwind readnone
+declare <WIDTH x i64> @__undef_i64() nounwind readnone
+
+declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
+declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
+declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
+declare <WIDTH x i16> @__broadcast_i16(<WIDTH x i16>, i32) nounwind readnone
+declare <WIDTH x i32> @__broadcast_i32(<WIDTH x i32>, i32) nounwind readnone
+declare <WIDTH x i64> @__broadcast_i64(<WIDTH x i64>, i32) nounwind readnone
+
+declare <WIDTH x i8> @__rotate_i8(<WIDTH x i8>, i32) nounwind readnone
+declare <WIDTH x i16> @__rotate_i16(<WIDTH x i16>, i32) nounwind readnone
+declare <WIDTH x float> @__rotate_float(<WIDTH x float>, i32) nounwind readnone
+declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
+declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
+declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone
+
+declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
+                                    <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i16> @__shuffle_i16(<WIDTH x i16>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i16> @__shuffle2_i16(<WIDTH x i16>, <WIDTH x i16>,
+                                      <WIDTH x i32>) nounwind readnone
+declare <WIDTH x float> @__shuffle_float(<WIDTH x float>,
+                                         <WIDTH x i32>) nounwind readnone
+declare <WIDTH x float> @__shuffle2_float(<WIDTH x float>, <WIDTH x float>,
+                                          <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i32> @__shuffle_i32(<WIDTH x i32>,
+                                     <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i32> @__shuffle2_i32(<WIDTH x i32>, <WIDTH x i32>,
+                                      <WIDTH x i32>) nounwind readnone
+declare <WIDTH x double> @__shuffle_double(<WIDTH x double>,
+                                           <WIDTH x i32>) nounwind readnone
+declare <WIDTH x double> @__shuffle2_double(<WIDTH x double>,
+                                            <WIDTH x double>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i64> @__shuffle_i64(<WIDTH x i64>,
+                                     <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i64> @__shuffle2_i64(<WIDTH x i64>, <WIDTH x i64>,
+                                      <WIDTH x i32>) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aos/soa
+
+declare void @__soa_to_aos3_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
+                                  <WIDTH x float> %v2, float * noalias %p) nounwind
+declare void @__aos_to_soa3_float(float * noalias %p, <WIDTH x float> * %out0,
+                                  <WIDTH x float> * %out1, <WIDTH x float> * %out2) nounwind
+declare void @__soa_to_aos4_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
+                                  <WIDTH x float> %v2, <WIDTH x float> %v3,
+                                  float * noalias %p) nounwind
+declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias %out0,
+                                  <WIDTH x float> * noalias %out1,
+                                  <WIDTH x float> * noalias %out2,
+                                  <WIDTH x float> * noalias %out3) nounwind
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; math
+
+declare void @__fastmath() nounwind 
+
+;; round/floor/ceil
+
+declare float @__round_uniform_float(float) nounwind readnone 
+declare float @__floor_uniform_float(float) nounwind readnone 
+declare float @__ceil_uniform_float(float) nounwind readnone 
+
+declare double @__round_uniform_double(double) nounwind readnone 
+declare double @__floor_uniform_double(double) nounwind readnone 
+declare double @__ceil_uniform_double(double) nounwind readnone 
+
+declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone 
+declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone 
+declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone 
+declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone 
+declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone 
+declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone 
+
+;; min/max
+
+declare float @__max_uniform_float(float, float) nounwind readnone 
+declare float @__min_uniform_float(float, float) nounwind readnone 
+declare i32 @__min_uniform_int32(i32, i32) nounwind readnone 
+declare i32 @__max_uniform_int32(i32, i32) nounwind readnone 
+declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone 
+declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone 
+declare i64 @__min_uniform_int64(i64, i64) nounwind readnone 
+declare i64 @__max_uniform_int64(i64, i64) nounwind readnone 
+declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone 
+declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone 
+declare double @__min_uniform_double(double, double) nounwind readnone 
+declare double @__max_uniform_double(double, double) nounwind readnone 
+
+declare <WIDTH x float> @__max_varying_float(<WIDTH x float>,
+                                             <WIDTH x float>) nounwind readnone 
+declare <WIDTH x float> @__min_varying_float(<WIDTH x float>,
+                                             <WIDTH x float>) nounwind readnone 
+declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone 
+declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone 
+declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone 
+declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone 
+declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone 
+declare <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone 
+declare <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone 
+declare <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone 
+declare <WIDTH x double> @__min_varying_double(<WIDTH x double>,
+                                               <WIDTH x double>) nounwind readnone
+declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
+                                               <WIDTH x double>) nounwind readnone 
+
+;; sqrt/rsqrt/rcp
+
+declare float @__rsqrt_uniform_float(float) nounwind readnone 
+declare float @__rcp_uniform_float(float) nounwind readnone 
+declare float @__sqrt_uniform_float(float) nounwind readnone 
+declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone 
+declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone 
+declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone 
+
+declare double @__sqrt_uniform_double(double) nounwind readnone
+declare <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone
+
+;; bit ops
+
+declare i32 @__popcnt_int32(i32) nounwind readnone
+declare i64 @__popcnt_int64(i64) nounwind readnone 
+
+declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone
+declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
+declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
+declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
+
+;; svml
+
+; FIXME: this note appears to have been written for an 8-wide target
+; (wire these up to the 8-wide SVML entrypoints, or call the 4-wide ones
+; twice).  Here the vectors are generic-width and are only declared.
+
+declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
+declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
+declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
+declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
+declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
+declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
+declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
+declare <WIDTH x float> @__svml_log(<WIDTH x float>)
+declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reductions
+
+declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone 
+declare i1 @__any(<WIDTH x i1>) nounwind readnone 
+declare i1 @__all(<WIDTH x i1>) nounwind readnone 
+declare i1 @__none(<WIDTH x i1>) nounwind readnone 
+
+declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
+declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone 
+declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone 
+
+declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone 
+declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone 
+declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone 
+
+declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone 
+declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone 
+
+declare double @__reduce_add_double(<WIDTH x double>) nounwind readnone 
+declare double @__reduce_min_double(<WIDTH x double>) nounwind readnone 
+declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone 
+
+declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone 
+declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone 
+declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone 
+
+declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone 
+declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone 
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked loads and stores
+
+
+declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x float> @__masked_load_float(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x double> @__masked_load_double(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+
+declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
+                                <WIDTH x i1>) nounwind 
+declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
+                                 <WIDTH x i1>) nounwind 
+declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
+                                 <WIDTH x i1>) nounwind 
+declare void @__masked_store_float(<WIDTH x float>* nocapture, <WIDTH x float>, 
+                                   <WIDTH x i1>) nounwind 
+declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
+                                 <WIDTH x i1> %mask) nounwind 
+declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
+                                    <WIDTH x i1> %mask) nounwind 
+
+;; Blending masked stores, one per element type.
+;; When the build targets LLVM 3.0 these are only declared, so a
+;; definition has to come from outside this file.  For any other LLVM
+;; version they are defined here as: load the old vector, pick per lane
+;; between the new value (mask bit set) and the old one, store it back.
+;; The reason 3.0 is special-cased is not visible here.
+;; NOTE(review): the loads and stores carry no explicit alignment, so
+;; LLVM assumes the ABI alignment of the vector type -- confirm that
+;; callers guarantee it.
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
+                                      <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
+                                       <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
+                                       <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>, 
+                                       <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
+                                       <WIDTH x i1> %mask) nounwind 
+declare void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>,
+                                       <WIDTH x i1> %mask) nounwind 
+', `
+define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
+                                     <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x i8> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
+  store <WIDTH x i8> %v1, <WIDTH x i8> * %0
+  ret void
+}
+
+define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
+                                      <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x i16> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
+  store <WIDTH x i16> %v1, <WIDTH x i16> * %0
+  ret void
+}
+
+define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
+                                      <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x i32> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
+  store <WIDTH x i32> %v1, <WIDTH x i32> * %0
+  ret void
+}
+
+define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>, 
+                                        <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x float> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x float> %1, <WIDTH x float> %v
+  store <WIDTH x float> %v1, <WIDTH x float> * %0
+  ret void
+}
+
+define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
+                            <WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x i64> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
+  store <WIDTH x i64> %v1, <WIDTH x i64> * %0
+  ret void
+}
+
+define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
+                            <WIDTH x double>, <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x double> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x double> %1, <WIDTH x double> %v
+  store <WIDTH x double> %v1, <WIDTH x double> * %0
+  ret void
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+;; Helper macro taking one argument, the element type.  It emits eight
+;; declarations for that type: gathers and scatters in a base-plus-offsets
+;; form and in a form taking only a vector of 32-bit or 64-bit integers
+;; (presumably per-lane pointers -- TODO confirm).  Every entry point takes
+;; a per-lane i1 vector as its last parameter.  Only declarations are
+;; produced; no implementation appears in this file.
+define(`gather_scatter', `
+declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
+                                                 <WIDTH x i1>) nounwind readonly 
+declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
+                                                  <WIDTH x i1>) nounwind readonly 
+declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>, 
+                                    <WIDTH x i1>) nounwind readonly 
+declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>, 
+                                    <WIDTH x i1>) nounwind readonly 
+
+declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
+                                          <WIDTH x $1>, <WIDTH x i1>) nounwind 
+declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
+                                          <WIDTH x $1>, <WIDTH x i1>) nounwind 
+declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
+                             <WIDTH x i1>) nounwind 
+declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
+                              <WIDTH x i1>) nounwind 
+')
+
+;; Emit the declarations once for each of the six element types.
+gather_scatter(i8)
+gather_scatter(i16)
+gather_scatter(i32)
+gather_scatter(float)
+gather_scatter(i64)
+gather_scatter(double)
+
+declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
+                                  <WIDTH x i1>) nounwind
+declare i32 @__packed_store_active(i32 * nocapture, <WIDTH x i32> %vals,
+                                   <WIDTH x i1>) nounwind
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; prefetch
+
+declare void @__prefetch_read_uniform_1(i8 * nocapture) nounwind 
+declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind 
+declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind 
+declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind 
+
--- a/builtins/target-sse2-common.ll
+++ b/builtins/target-sse2-common.ll
@@ -0,0 +1,272 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; Expand shared helper macros.  Their definitions are not in this file
+;; (presumably they come from util.m4, read by whichever target file
+;; pulls this one in -- TODO confirm).
+ctlztz()
+define_prefetches()
+define_shuffles()
+aossoa()
+rdrand_decls()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+;; Uniform (scalar) single-precision reciprocal.
+;; Takes the SSE rcp.ss estimate for the argument in lane 0 and refines
+;; it with one Newton-Raphson step:  r1 = r0 * (2 - x * r0).
+define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+  ; put the scalar in lane 0 and take the hardware estimate r0
+  %x_vec = insertelement <4 x float> undef, float %0, i32 0
+  %est_vec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %x_vec)
+  %est = extractelement <4 x float> %est_vec, i32 0
+
+  ; one Newton-Raphson iteration to improve precision
+  %x_est = fmul float %0, %est
+  %corr = fsub float 2., %x_est  
+  %refined = fmul float %est, %corr
+  ret float %refined
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+;; Uniform (scalar) single-precision reciprocal square root.
+;; Takes the SSE rsqrt.ss estimate e for the argument in lane 0 and
+;; refines it with one Newton-Raphson step:  0.5 * e * (3 - (x * e) * e).
+define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ; hardware estimate e for the scalar argument (lane 0 only)
+  %x_vec = insertelement <4 x float> undef, float %0, i32 0
+  %est_vec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %x_vec)
+  %est = extractelement <4 x float> %est_vec, i32 0
+
+  ; refinement step:  0.5 * e * (3. - (x * e) * e)
+  %x_e = fmul float %0, %est
+  %x_e_e = fmul float %x_e, %est
+  %three_minus = fsub float 3., %x_e_e
+  %e_scaled = fmul float %est, %three_minus
+  %result = fmul float 0.5, %e_scaled
+  ret float %result
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+
+;; Uniform single-precision square root using the SSE sqrt.ss intrinsic.
+;; The m4 helper on the first body line is defined outside this file; it
+;; is expected to bind the scalar result to %ret -- confirm in util.m4.
+define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
+  ret float %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fast math mode
+
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
+
+;; Switch on the DAZ and FTZ bits in the SSE control/status register,
+;; leaving every other bit as it was.
+define void @__fastmath() nounwind alwaysinline {
+  ; the stmxcsr and ldmxcsr intrinsics go through memory: use a stack slot
+  %csr = alloca i32
+  %csr_i8 = bitcast i32 * %csr to i8 *
+  call void @llvm.x86.sse.stmxcsr(i8 * %csr_i8)
+  %current = load i32 *%csr
+
+  ; DAZ is 64 and FTZ is 32768; OR both in at once (64 + 32768 = 32832)
+  %with_flags = or i32 %current, 32832
+  store i32 %with_flags, i32 *%csr
+  call void @llvm.x86.sse.ldmxcsr(i8 * %csr_i8)
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+;; Uniform single-precision maximum using the SSE max.ss intrinsic.
+;; The m4 helper on the first body line is defined outside this file; it
+;; is expected to bind the scalar result to %ret -- confirm in util.m4.
+define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
+  ret float %ret
+}
+
+
+;; Uniform single-precision minimum using the SSE min.ss intrinsic.
+;; The m4 helper on the first body line is defined outside this file; it
+;; is expected to bind the scalar result to %ret -- confirm in util.m4.
+define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
+  ret float %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+;; Uniform double-precision square root using the SSE2 sqrt.sd intrinsic.
+;; The m4 helper on the first body line is defined outside this file; it
+;; is expected to bind the scalar result to %ret -- confirm in util.m4.
+;; NOTE(review): unlike the float version in this file, this definition
+;; carries no readonly/readnone attribute -- confirm whether that is
+;; intentional.
+define double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+;; Uniform double-precision minimum using the SSE2 min.sd intrinsic.
+;; The m4 helper on the first body line is defined outside this file; it
+;; is expected to bind the scalar result to %ret -- confirm in util.m4.
+;; Marked alwaysinline like the float min/max helpers in this file, so
+;; the call does not survive into generated code.
+define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret
+}
+
+;; Uniform double-precision maximum using the SSE2 max.sd intrinsic.
+;; The m4 helper on the first body line is defined outside this file; it
+;; is expected to bind the scalar result to %ret -- confirm in util.m4.
+;; Marked alwaysinline like the float min/max helpers in this file, so
+;; the call does not survive into generated code.
+define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding
+;;
+;; There are not any rounding instructions in SSE2, so we have to emulate
+;; the functionality with multiple instructions...
+
+; The code for __round_* is the result of compiling the following source
+; code.
+;
+; export float Round(float x) {
+;    unsigned int sign = signbits(x);
+;    unsigned int ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    x += 0x1.0p23f;
+;    x -= 0x1.0p23f;
+;    ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    return x;
+;}
+
+;; Uniform single-precision round to an integral value.
+;; Sequence: strip the sign bit, add 2^23, subtract 2^23, then put the
+;; sign bit back (this is the compiled form of the listing above).
+;; NOTE(review): the add/subtract trick looks like it assumes a magnitude
+;; below 2^23 and the default round-to-nearest mode -- confirm behavior
+;; for larger inputs.
+define float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
+  ; -2147483648 is 0x80000000: keep only the sign bit
+  %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
+  ; xor with the sign bit clears it, leaving the magnitude
+  %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
+  %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
+  ; 8.388608e+06 is 2^23
+  %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
+  %binop21.i = fadd float %binop.i, -8.388608e+06
+  %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
+  ; restore the original sign bit
+  %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
+  %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
+  ret float %int_to_float_bitcast.i.i.i
+}
+
+;; Similarly, for implementations of the __floor* functions below, we have the
+;; bitcode from compiling the following source code...
+
+;export float Floor(float x) {
+;    float y = Round(x);
+;    unsigned int cmp = y > x ? 0xffffffff : 0;
+;    float delta = -1.f;
+;    unsigned int idelta = intbits(delta);
+;    idelta &= cmp;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+;; Uniform single-precision floor: round first, then subtract one when
+;; the rounded value ended up above the input.
+define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  %rounded = tail call float @__round_uniform_float(float %0) nounwind
+  ; all-ones when rounded > x, zero otherwise
+  %went_up = fcmp ogt float %rounded, %0
+  %sel = sext i1 %went_up to i32
+  ; -1082130432 is the bit pattern of -1.0f, so delta is -1.0f or 0.0f
+  %delta_bits = and i32 %sel, -1082130432
+  %delta = bitcast i32 %delta_bits to float
+  %result = fadd float %rounded, %delta
+  ret float %result
+}
+
+;; And here is the code we compiled to get the __ceil* functions below
+;
+;export uniform float Ceil(uniform float x) {
+;    uniform float y = Round(x);
+;    uniform int yltx = y < x ? 0xffffffff : 0;
+;    uniform float delta = 1.f;
+;    uniform int idelta = intbits(delta);
+;    idelta &= yltx;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+;; Uniform single-precision ceiling: round first, then add one when the
+;; rounded value ended up below the input.
+define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  %rounded = tail call float @__round_uniform_float(float %0) nounwind
+  ; all-ones when rounded < x, zero otherwise
+  %went_down = fcmp olt float %rounded, %0
+  %sel = sext i1 %went_down to i32
+  ; 1065353216 is the bit pattern of 1.0f, so delta is 1.0f or 0.0f
+  %delta_bits = and i32 %sel, 1065353216
+  %delta = bitcast i32 %delta_bits to float
+  %result = fadd float %rounded, %delta
+  ret float %result
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare double @round(double)
+declare double @floor(double)
+declare double @ceil(double)
+
+;; Uniform double-precision round: forwards to the external @round.
+;; NOTE(review): if that resolves to the C library function, halfway
+;; cases go away from zero, which may differ from the float path in this
+;; file -- confirm that is intended.
+define double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %rounded = call double @round(double %0)
+  ret double %rounded
+}
+
+;; Uniform double-precision floor: forwards to the external @floor.
+define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  %floored = call double @floor(double %0)
+  ret double %floored
+}
+
+;; Uniform double-precision ceiling: forwards to the external @ceil.
+define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  %ceiled = call double @ceil(double %0)
+  ret double %ceiled
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)
+
+;; Number of set bits in a 32-bit integer, via the llvm.ctpop intrinsic.
+;; The result depends only on the argument and no memory is touched, so
+;; the function is readnone, matching the 64-bit version below.
+define i32 @__popcnt_int32(i32) nounwind readnone alwaysinline {
+  %val = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %val
+}
+
+;; Number of set bits in a 64-bit integer, via the llvm.ctpop intrinsic.
+define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
+  %bits = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %bits
+}
+
+
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -0,0 +1,697 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+
+;; This file defines the target for "double-pumped" SSE2, i.e. running
+;; with 8-wide vectors
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; standard 8-wide definitions from m4 macros
+
+; Target parameters read by the shared m4 code: 8 elements per vector, with
+; i32 as the element type of the execution mask.
+define(`WIDTH',`8')
+define(`MASK',`i32')
+include(`util.m4')
+
+; Instantiate the shared builtins for this target (the macros invoked here
+; are presumably provided by util.m4).
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+; SSE2 definitions that the plain 4-wide SSE2 target also pulls in.
+include(`target-sse2-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+; Half-precision conversion entry points.  This section only declares them;
+; no definitions are given here.
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+; 4-wide SSE reciprocal intrinsic; __rcp_varying_float below refines its result.
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+; Reciprocal of an 8-wide float vector.  The 4-wide rcp intrinsic is applied
+; through the 4-to-8 widening macro (presumably once per 4-wide half), and the
+; estimate %call is then refined as iv * (2 - v * iv).
+define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ;  float iv = __rcp_v(v);
+  ;  return iv * (2. - v * iv);
+
+  unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
+  ; do one N-R iteration
+  %v_iv = fmul <8 x float> %0, %call
+  %two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
+                                 float 2., float 2., float 2., float 2.>, %v_iv  
+  %iv_mul = fmul <8 x float> %call, %two_minus
+  ret <8 x float> %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+; 4-wide SSE reciprocal square root intrinsic; __rsqrt_varying_float below
+; refines its result.
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+; Reciprocal square root of an 8-wide float vector.  The 4-wide rsqrt
+; intrinsic is applied through the 4-to-8 widening macro (presumably once per
+; 4-wide half), and the estimate %is is then refined with the formula quoted
+; in the body.
+define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+  ;  float is = __rsqrt_v(v);
+  unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
+  ;  return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul <8 x float> %v, %is
+  %v_is_is = fmul <8 x float> %v_is, %is
+  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
+                                 float 3., float 3., float 3., float 3.>, %v_is_is
+  %is_mul = fmul <8 x float> %is, %three_sub
+  %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                                  float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ret <8 x float> %half_scale
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+; 4-wide SSE square root intrinsic used by __sqrt_varying_float below.
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+; Square root of an 8-wide float vector, obtained by passing the 4-wide sqrt
+; intrinsic to the 4-to-8 widening macro; the macro result %call is returned
+; unchanged.
+define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
+  ret <8 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; svml stuff
+
+; 4-wide single-precision SVML routines; only declared here.
+declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
+; sincos hands one result back through its pointer argument (the wrapper
+; below loads the cosine from it), so it writes memory and must not be
+; marked readnone.
+declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind
+declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+
+
+; 8-wide wrapper around the 4-wide SVML sine, built with the 4-to-8 widening
+; macro; the macro result %ret is returned unchanged.
+define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_sinf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide wrapper around the 4-wide SVML cosine, built with the 4-to-8
+; widening macro; the macro result %ret is returned unchanged.
+define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_cosf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide sincos.  The 4-wide SVML sincos is called on each half of %0; the
+; joined return values are stored to %1 (named sin below) and the joined
+; values read back from the pointer arguments are stored to %2 (named cos).
+; The function is not readnone because it writes through %1 and %2.
+define void @__svml_sincos(<8 x float>, <8 x float> *,
+                                    <8 x float> *) nounwind alwaysinline {
+  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
+  %a = shufflevector <8 x float> %0, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %b = shufflevector <8 x float> %0, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ; each call returns one vector directly and writes the other one into the
+  ; stack slot it is given
+  %cospa = alloca <4 x float>
+  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
+
+  %cospb = alloca <4 x float>
+  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
+
+  ; join the two returned halves and store them through %1
+  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
+         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 4, i32 5, i32 6, i32 7>
+  store <8 x float> %sin, <8 x float> * %1
+
+  ; read the two stack slots back, join them and store them through %2
+  %cosa = load <4 x float> * %cospa
+  %cosb = load <4 x float> * %cospb
+  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
+         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 4, i32 5, i32 6, i32 7>
+  store <8 x float> %cos, <8 x float> * %2
+
+  ret void
+}
+
+; 8-wide wrapper around the 4-wide SVML tangent, built with the 4-to-8
+; widening macro; the macro result %ret is returned unchanged.
+define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_tanf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide wrapper around the 4-wide SVML arctangent, built with the 4-to-8
+; widening macro; the macro result %ret is returned unchanged.
+define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_atanf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide wrapper around the 4-wide two-argument SVML arctangent, built with
+; the two-operand 4-to-8 widening macro; %0 and %1 are passed in that order.
+define <8 x float> @__svml_atan2(<8 x float>,
+                                          <8 x float>) nounwind readnone alwaysinline {
+  binary4to8(ret, float, @__svml_atan2f4, %0, %1)
+  ret <8 x float> %ret
+}
+
+; 8-wide wrapper around the 4-wide SVML exp, built with the 4-to-8 widening
+; macro; the macro result %ret is returned unchanged.
+define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_expf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide wrapper around the 4-wide SVML log, built with the 4-to-8 widening
+; macro; the macro result %ret is returned unchanged.
+define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_logf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide wrapper around the 4-wide SVML pow, built with the two-operand
+; 4-to-8 widening macro; %0 and %1 are passed in that order.
+define <8 x float> @__svml_pow(<8 x float>,
+                                        <8 x float>) nounwind readnone alwaysinline {
+  binary4to8(ret, float, @__svml_powf4, %0, %1)
+  ret <8 x float> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+; 4-wide SSE float max/min intrinsics used by the 8-wide wrappers below.
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+; 8-wide float max, built by passing the 4-wide SSE max intrinsic to the
+; two-operand 4-to-8 widening macro with operands %0 and %1.
+define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
+  binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
+  ret <8 x float> %call
+}
+
+; 8-wide float min, built by passing the 4-wide SSE min intrinsic to the
+; two-operand 4-to-8 widening macro with operands %0 and %1.
+define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
+  binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
+  ret <8 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; blend (vselect) helpers and int32/uint32 min/max
+
+; There is no blend instruction with SSE2, so we simulate it with bit
+; operations on i32s.  For these two vselect functions, for each
+; vector element, if the mask is on, we return the corresponding value
+; from %1, and otherwise return the value from %0.
+
+; Bitwise blend of two <8 x i32> values: the result is
+; (%0 AND NOT mask) OR (%1 AND mask), so bits where the mask is set come
+; from %1 and all other bits come from %0.
+define <8 x i32> @__vselect_i32(<8 x i32>, <8 x i32> ,
+                                         <8 x i32> %mask) nounwind readnone alwaysinline {
+  %inv_mask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %keep_old = and <8 x i32> %0, %inv_mask
+  %take_new = and <8 x i32> %1, %mask
+  %blended = or <8 x i32> %keep_old, %take_new
+  ret <8 x i32> %blended
+}
+
+; Float version of the blend: both inputs are reinterpreted as <8 x i32>,
+; blended with __vselect_i32, and the result is reinterpreted as floats.
+define <8 x float> @__vselect_float(<8 x float>, <8 x float>,
+                                             <8 x i32> %mask) nounwind readnone alwaysinline {
+  %old_bits = bitcast <8 x float> %0 to <8 x i32>
+  %new_bits = bitcast <8 x float> %1 to <8 x i32>
+  %picked_bits = call <8 x i32> @__vselect_i32(<8 x i32> %old_bits, <8 x i32> %new_bits, <8 x i32> %mask)
+  %picked = bitcast <8 x i32> %picked_bits to <8 x float>
+  ret <8 x float> %picked
+}
+
+
+; To do vector integer min and max, we do the vector compare and then sign
+; extend the i1 vector result to an i32 mask.  The __vselect does the
+; rest...
+
+; Signed 32-bit minimum of each pair of elements in %0 and %1.
+define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %first_less = icmp slt <8 x i32> %0, %1
+  ; widen each i1 compare result to a full 32-bit mask
+  %pick_first = sext <8 x i1> %first_less to <8 x i32>
+  ; where the mask is set take %0, elsewhere take %1
+  %smaller = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %pick_first)
+  ret <8 x i32> %smaller
+}
+
+; Signed 32-bit minimum of two uniform values.
+define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  %first_less = icmp slt i32 %0, %1
+  %smaller = select i1 %first_less, i32 %0, i32 %1
+  ret i32 %smaller
+}
+
+; Signed 32-bit maximum of each pair of elements in %0 and %1.
+define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %first_greater = icmp sgt <8 x i32> %0, %1
+  ; widen each i1 compare result to a full 32-bit mask
+  %pick_first = sext <8 x i1> %first_greater to <8 x i32>
+  ; where the mask is set take %0, elsewhere take %1
+  %larger = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %pick_first)
+  ret <8 x i32> %larger
+}
+
+; Signed 32-bit maximum of two uniform values.
+define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  %first_greater = icmp sgt i32 %0, %1
+  %larger = select i1 %first_greater, i32 %0, i32 %1
+  ret i32 %larger
+}
+
+; The functions for unsigned ints are similar, just with unsigned
+; comparison functions...
+
+; Unsigned 32-bit minimum of each pair of elements in %0 and %1.
+define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %first_below = icmp ult <8 x i32> %0, %1
+  ; widen each i1 compare result to a full 32-bit mask
+  %pick_first = sext <8 x i1> %first_below to <8 x i32>
+  ; where the mask is set take %0, elsewhere take %1
+  %smaller = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %pick_first)
+  ret <8 x i32> %smaller
+}
+
+; Unsigned 32-bit minimum of two uniform values.
+define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  %first_below = icmp ult i32 %0, %1
+  %smaller = select i1 %first_below, i32 %0, i32 %1
+  ret i32 %smaller
+}
+
+; Unsigned 32-bit maximum of each pair of elements in %0 and %1.
+define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %first_above = icmp ugt <8 x i32> %0, %1
+  ; widen each i1 compare result to a full 32-bit mask
+  %pick_first = sext <8 x i1> %first_above to <8 x i32>
+  ; where the mask is set take %0, elsewhere take %1
+  %larger = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %pick_first)
+  ret <8 x i32> %larger
+}
+
+; Unsigned 32-bit maximum of two uniform values.
+define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  %first_above = icmp ugt i32 %0, %1
+  %larger = select i1 %first_above, i32 %0, i32 %1
+  ret i32 %larger
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+; 4-wide SSE movmsk intrinsic; the mask helpers below call it once per
+; 4-wide half of the 8-wide mask.
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+; Combines the movmsk results of the two 4-wide halves of the mask into one
+; integer and returns it zero-extended to i64.
+define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+  ; movmsk.ps takes <4 x float>, so view the mask as floats and split it
+  %as_float = bitcast <8 x i32> %0 to <8 x float>
+  %lo_half = shufflevector <8 x float> %as_float, <8 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %lo_bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lo_half) nounwind readnone
+  %hi_half = shufflevector <8 x float> %as_float, <8 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %hi_bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %hi_half) nounwind readnone
+
+  ; move the result for elements 4-7 up by 4 bits, then OR it with the
+  ; result for elements 0-3
+  %hi_moved = shl i32 %hi_bits, 4
+  %lane_bits = or i32 %lo_bits, %hi_moved
+  %widened = zext i32 %lane_bits to i64
+  ret i64 %widened
+}
+
+; True when the combined movmsk value of the 8-wide mask is nonzero.
+define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
+  ; movmsk.ps takes <4 x float>, so view the mask as floats and split it
+  %as_float = bitcast <8 x i32> %0 to <8 x float>
+  %lo_half = shufflevector <8 x float> %as_float, <8 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %lo_bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lo_half) nounwind readnone
+  %hi_half = shufflevector <8 x float> %as_float, <8 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %hi_bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %hi_half) nounwind readnone
+
+  ; move the result for elements 4-7 up by 4 bits, then OR it with the
+  ; result for elements 0-3
+  %hi_moved = shl i32 %hi_bits, 4
+  %lane_bits = or i32 %lo_bits, %hi_moved
+  %some_set = icmp ne i32 %lane_bits, 0
+  ret i1 %some_set
+}
+
+; True when the combined movmsk value of the 8-wide mask equals 255, i.e.
+; all eight low bits are set.
+define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
+  ; movmsk.ps takes <4 x float>, so view the mask as floats and split it
+  %as_float = bitcast <8 x i32> %0 to <8 x float>
+  %lo_half = shufflevector <8 x float> %as_float, <8 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %lo_bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lo_half) nounwind readnone
+  %hi_half = shufflevector <8 x float> %as_float, <8 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %hi_bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %hi_half) nounwind readnone
+
+  ; move the result for elements 4-7 up by 4 bits, then OR it with the
+  ; result for elements 0-3
+  %hi_moved = shl i32 %hi_bits, 4
+  %lane_bits = or i32 %lo_bits, %hi_moved
+  %every_set = icmp eq i32 %lane_bits, 255
+  ret i1 %every_set
+}
+
+; True when the combined movmsk value of the 8-wide mask is zero.
+define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
+  ; movmsk.ps takes <4 x float>, so view the mask as floats and split it
+  %as_float = bitcast <8 x i32> %0 to <8 x float>
+  %lo_half = shufflevector <8 x float> %as_float, <8 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %lo_bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lo_half) nounwind readnone
+  %hi_half = shufflevector <8 x float> %as_float, <8 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %hi_bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %hi_half) nounwind readnone
+
+  ; move the result for elements 4-7 up by 4 bits, then OR it with the
+  ; result for elements 0-3
+  %hi_moved = shl i32 %hi_bits, 4
+  %lane_bits = or i32 %lo_bits, %hi_moved
+  %nothing_set = icmp eq i32 %lane_bits, 0
+  ret i1 %nothing_set
+}
+
+; 4-wide float add; passed as the vector step to __reduce_add_float.
+define <4 x float> @__vec4_add_float(<4 x float> %v0,
+                                     <4 x float> %v1) nounwind readnone alwaysinline {
+  %sum = fadd <4 x float> %v0, %v1
+  ret <4 x float> %sum
+}
+
+; Scalar float add; passed as the scalar step to __reduce_add_float.
+define float @__add_float(float, float) nounwind readnone alwaysinline {
+  %sum = fadd float %0, %1
+  ret float %sum
+}
+
+; Add-reduction of an 8-wide float vector to one float.  The body comes from
+; the 8-by-4 reduction macro, given the 4-wide vector add and the scalar add.
+define float @__reduce_add_float(<8 x float>) nounwind readnone alwaysinline {
+  reduce8by4(float, @__vec4_add_float, @__add_float)
+}
+
+; Min-reduction of an 8-wide float vector to one float.  The body comes from
+; the 8-wide reduction macro, given the varying and uniform float min.
+define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
+  reduce8(float, @__min_varying_float, @__min_uniform_float)
+}
+
+; Max-reduction of an 8-wide float vector to one float.  The body comes from
+; the 8-wide reduction macro, given the varying and uniform float max.
+define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
+  reduce8(float, @__max_varying_float, @__max_uniform_float)
+}
+
+; 4-wide i32 add; passed as the vector step to __reduce_add_int32.
+define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
+                                   <4 x i32> %v1) nounwind readnone alwaysinline {
+  %sum = add <4 x i32> %v0, %v1
+  ret <4 x i32> %sum
+}
+
+; Scalar i32 add; passed as the scalar step to __reduce_add_int32.
+define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
+  %sum = add i32 %0, %1
+  ret i32 %sum
+}
+
+; Add-reduction of an 8-wide i32 vector to one i32.  The body comes from the
+; 8-by-4 reduction macro, given the 4-wide vector add and the scalar add.
+define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8by4(i32, @__vec4_add_int32, @__add_int32)
+}
+
+; Signed min-reduction of an 8-wide i32 vector.  The body comes from the
+; 8-wide reduction macro, given the varying and uniform signed min.
+define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+; Signed max-reduction of an 8-wide i32 vector.  The body comes from the
+; 8-wide reduction macro, given the varying and uniform signed max.
+define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+; Unsigned min-reduction of an 8-wide i32 vector.  The body comes from the
+; 8-wide reduction macro, given the varying and uniform unsigned min.
+define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+; Unsigned max-reduction of an 8-wide i32 vector.  The body comes from the
+; 8-wide reduction macro, given the varying and uniform unsigned max.
+define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+; 4-wide double add (note: 4 elements, not 8); passed as the vector step to
+; __reduce_add_double.
+define <4 x double> @__add_varying_double(<4 x double>,
+                                     <4 x double>) nounwind readnone alwaysinline {
+  %sum = fadd <4 x double> %0, %1
+  ret <4 x double> %sum
+}
+
+; Scalar double add; passed as the scalar step to __reduce_add_double.
+define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
+  %sum = fadd double %0, %1
+  ret double %sum
+}
+
+; Add-reduction of an 8-wide double vector to one double.  The body comes
+; from the 8-by-4 reduction macro, given the 4-wide and scalar double adds.
+define double @__reduce_add_double(<8 x double>) nounwind readnone {
+  reduce8by4(double, @__add_varying_double, @__add_uniform_double)
+}
+
+; Min-reduction of an 8-wide double vector.  The body comes from the 8-wide
+; reduction macro, given the varying and uniform double min.
+define double @__reduce_min_double(<8 x double>) nounwind readnone {
+  reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+; Max-reduction of an 8-wide double vector.  The body comes from the 8-wide
+; reduction macro, given the varying and uniform double max.
+define double @__reduce_max_double(<8 x double>) nounwind readnone {
+  reduce8(double, @__max_varying_double, @__max_uniform_double)
+}
+
+; 4-wide i64 add (note: 4 elements, not 8); passed as the vector step to
+; __reduce_add_int64.
+define <4 x i64> @__add_varying_int64(<4 x i64>,
+                                      <4 x i64>) nounwind readnone alwaysinline {
+  %sum = add <4 x i64> %0, %1
+  ret <4 x i64> %sum
+}
+
+; Scalar i64 add; passed as the scalar step to __reduce_add_int64.
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %sum = add i64 %0, %1
+  ret i64 %sum
+}
+
+; Add-reduction of an 8-wide i64 vector to one i64.  The body comes from the
+; 8-by-4 reduction macro, given the 4-wide and scalar i64 adds.
+define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
+  reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+; Signed min-reduction of an 8-wide i64 vector.  The body comes from the
+; 8-wide reduction macro, given the varying and uniform signed i64 min.
+define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+; Signed max-reduction of an 8-wide i64 vector.  The body comes from the
+; 8-wide reduction macro, given the varying and uniform signed i64 max.
+define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+; Unsigned min-reduction of an 8-wide i64 vector.  The body comes from the
+; 8-wide reduction macro, given the varying and uniform unsigned i64 min.
+define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+; Unsigned max-reduction of an 8-wide i64 vector.  The body comes from the
+; 8-wide reduction macro, given the varying and uniform unsigned i64 max.
+define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+; Instantiate the equality reductions for 8-wide vectors (the argument is the
+; vector width; the generated functions are not visible here).
+reduce_equal(8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+
+; One instantiation per element type.  The second argument matches the size
+; of the element type in bytes (presumably used as the alignment or stride --
+; confirm against util.m4).
+masked_load(i8,  1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(float, 4)
+masked_load(i64, 8)
+masked_load(double, 8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; Gather instantiations, one per element type.
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
+
+; Scatter instantiations for the same element types.
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float rounding
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding
+;;
+;; There are not any rounding instructions in SSE2, so we have to emulate
+;; the functionality with multiple instructions...
+
+; The code for __round_* is the result of compiling the following source
+; code.
+;
+; export float Round(float x) {
+;    unsigned int sign = signbits(x);
+;    unsigned int ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    x += 0x1.0p23f;
+;    x -= 0x1.0p23f;
+;    ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    return x;
+;}
+
+; 8-wide float round, following the Round source quoted in the comment
+; before this function: strip the sign, add and subtract 2^23, restore the
+; sign.
+define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  %in_bits = bitcast <8 x float> %0 to <8 x i32>
+  ; isolate the sign bit of every element
+  %sign_bits = and <8 x i32> %in_bits, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+  ; clear the sign so the arithmetic below sees the magnitude only
+  %abs_bits = xor <8 x i32> %in_bits, %sign_bits
+  %abs_val = bitcast <8 x i32> %abs_bits to <8 x float>
+  ; 8.388608e+06 is 2^23; it is added and then subtracted again
+  %plus_big = fadd <8 x float> %abs_val, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
+  %rounded_abs = fadd <8 x float> %plus_big, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
+  %rounded_bits = bitcast <8 x float> %rounded_abs to <8 x i32>
+  ; put the original sign bit back
+  %signed_bits = xor <8 x i32> %rounded_bits, %sign_bits
+  %result = bitcast <8 x i32> %signed_bits to <8 x float>
+  ret <8 x float> %result
+}
+
+;; Similarly, for implementations of the __floor* functions below, we have the
+;; bitcode from compiling the following source code...
+
+;export float Floor(float x) {
+;    float y = Round(x);
+;    unsigned int cmp = y > x ? 0xffffffff : 0;
+;    float delta = -1.f;
+;    unsigned int idelta = intbits(delta);
+;    idelta &= cmp;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+; 8-wide float floor built on __round_varying_float: where the rounded value
+; is above the input, -1.0 is added to it; elsewhere 0.0 is added.
+define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  %rounded = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
+  %went_up = fcmp ogt <8 x float> %rounded, %0
+  ; all ones where the comparison holds, zero elsewhere
+  %keep_bits = sext <8 x i1> %went_up to <8 x i32>
+  ; -1082130432 is the bit pattern of -1.0 as a 32-bit float
+  %delta_bits = and <8 x i32> %keep_bits, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
+  %delta = bitcast <8 x i32> %delta_bits to <8 x float>
+  %result = fadd <8 x float> %rounded, %delta
+  ret <8 x float> %result
+}
+
+;; And here is the code we compiled to get the __ceil* functions below
+;
+;export uniform float Ceil(uniform float x) {
+;    uniform float y = Round(x);
+;    uniform int yltx = y < x ? 0xffffffff : 0;
+;    uniform float delta = 1.f;
+;    uniform int idelta = intbits(delta);
+;    idelta &= yltx;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+; 8-wide float ceil built on __round_varying_float: where the rounded value
+; is below the input, 1.0 is added to it; elsewhere 0.0 is added.
+define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  %rounded = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
+  %went_down = fcmp olt <8 x float> %rounded, %0
+  ; all ones where the comparison holds, zero elsewhere
+  %keep_bits = sext <8 x i1> %went_down to <8 x i32>
+  ; 1065353216 is the bit pattern of 1.0 as a 32-bit float
+  %delta_bits = and <8 x i32> %keep_bits, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
+  %delta = bitcast <8 x i32> %delta_bits to <8 x float>
+  %result = fadd <8 x float> %rounded, %delta
+  ret <8 x float> %result
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+; 8-wide double round.  The body comes from the 1-to-8 macro given the scalar
+; @round routine (presumably one scalar call per element).
+define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  unary1to8(double, @round)
+}
+
+; 8-wide double floor.  The body comes from the 1-to-8 macro given the scalar
+; @floor routine (presumably one scalar call per element).
+define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  unary1to8(double, @floor)
+}
+
+; 8-wide double ceil.  The body comes from the 1-to-8 macro given the scalar
+; @ceil routine (presumably one scalar call per element).
+define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  unary1to8(double, @ceil)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+; Masked-store instantiations for the four integer element types.
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)
+
+; Blend-style masked stores for 8-bit and 16-bit elements (judging by the
+; macro name; the generated code is not visible here).
+masked_store_blend_8_16_by_8()
+
+; Blend-style masked store of <8 x i32>: loads the current contents of %0,
+; replaces the bits selected by %mask with those of %1 via __vselect_i32, and
+; stores the result back to %0.
+define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
+                                      <8 x i32> %mask) nounwind alwaysinline {
+  %current = load <8 x i32> * %0, align 4
+  %merged = call <8 x i32> @__vselect_i32(<8 x i32> %current, <8 x i32> %1, <8 x i32> %mask)
+  store <8 x i32> %merged, <8 x i32> * %0, align 4
+  ret void
+}
+
+; Blend-style masked store of <8 x i64>: loads the current contents of %ptr,
+; replaces the elements selected by %mask with those of %new, and stores the
+; result back to %ptr.
+define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+                                      <8 x i32> %mask) nounwind alwaysinline {
+  %prev = load <8 x i64>* %ptr, align 8
+
+  ; The 8 x 64-bit blend is done as two 8 x 32-bit blends; each one handles
+  ; four i64 elements that have been bitcast to <8 x float>.
+  ;
+  ; elements 0-3 of the old and new values
+  %prev_lo  = shufflevector <8 x i64> %prev, <8 x i64> undef,
+                            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %prev_lo_f = bitcast <4 x i64> %prev_lo to <8 x float>
+  %next_lo  = shufflevector <8 x i64> %new, <8 x i64> undef,
+                            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %next_lo_f = bitcast <4 x i64> %next_lo to <8 x float>
+  ; every mask element is repeated twice so that it covers both 32-bit
+  ; pieces of its i64
+  %mask_lo = shufflevector <8 x i32> %mask, <8 x i32> undef,
+              <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ; blend elements 0-3
+  %blend_lo_f = call <8 x float> @__vselect_float(<8 x float> %prev_lo_f, <8 x float> %next_lo_f,
+                                                   <8 x i32> %mask_lo)
+  %blend_lo = bitcast <8 x float> %blend_lo_f to <4 x i64>
+
+  ; the same steps for elements 4-7
+  %prev_hi  = shufflevector <8 x i64> %prev, <8 x i64> undef,
+                            <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %prev_hi_f = bitcast <4 x i64> %prev_hi to <8 x float>
+  %next_hi  = shufflevector <8 x i64> %new, <8 x i64> undef,
+                            <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %next_hi_f = bitcast <4 x i64> %next_hi to <8 x float>
+  ; mask elements 4-7, again repeated twice each
+  %mask_hi = shufflevector <8 x i32> %mask, <8 x i32> undef,
+              <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  ; blend elements 4-7
+  %blend_hi_f = call <8 x float> @__vselect_float(<8 x float> %prev_hi_f, <8 x float> %next_hi_f,
+                                                   <8 x i32> %mask_hi)
+  %blend_hi = bitcast <8 x float> %blend_hi_f to <4 x i64>
+
+  ; join the two blended pieces into the final <8 x i64> and write it back
+  %merged = shufflevector <4 x i64> %blend_lo, <4 x i64> %blend_hi,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %merged, <8 x i64> * %ptr, align 8
+  ret void
+}
+
+; Masked stores for float and double elements (judging by the macro name;
+; the generated code is not visible here).
+masked_store_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+; 2-wide SSE2 double square root intrinsic used by __sqrt_varying_double.
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+; Square root of an 8-wide double vector, obtained by passing the 2-wide
+; SSE2 sqrt intrinsic to the 2-to-8 widening macro; the macro result %ret is
+; returned unchanged.
+define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
+  unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ret <8 x double> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision float min/max
+
+; 2-wide SSE2 double max/min intrinsics used by the 8-wide wrappers below.
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+; 8-wide double min, built by passing the 2-wide SSE2 min intrinsic to the
+; two-operand 2-to-8 widening macro with operands %0 and %1.
+define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+  binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+  ret <8 x double> %ret
+}
+
+; 8-wide double max, built by passing the 2-wide SSE2 max intrinsic to the
+; two-operand 2-to-8 widening macro with operands %0 and %1.
+define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+  binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+  ret <8 x double> %ret
+}
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -0,0 +1,606 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Define the standard library builtins for the SSE2 target
+
+; Define some basics for a 4-wide target
+define(`WIDTH',`4')
+define(`MASK',`i32')
+include(`util.m4')
+
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+include(`target-sse2-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding
+;;
+;; There are not any rounding instructions in SSE2, so we have to emulate
+;; the functionality with multiple instructions...
+
+; The code for __round_* is the result of compiling the following source
+; code.
+;
+; export float Round(float x) {
+;    unsigned int sign = signbits(x);
+;    unsigned int ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    x += 0x1.0p23f;
+;    x -= 0x1.0p23f;
+;    ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    return x;
+;}
+
+define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  %xbits = bitcast <4 x float> %0 to <4 x i32>  ; raw bit pattern of the input lanes
+  %signbit = and <4 x i32> %xbits, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>  ; keep only the top (sign) bit of every lane
+  %absbits = xor <4 x i32> %xbits, %signbit  ; clear the sign bit
+  %absval = bitcast <4 x i32> %absbits to <4 x float>
+  %biased = fadd <4 x float> %absval, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>  ; add 2^23 ...
+  %unbiased = fadd <4 x float> %biased, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>  ; ... and take it off again
+  %roundbits = bitcast <4 x float> %unbiased to <4 x i32>
+  %signedbits = xor <4 x i32> %roundbits, %signbit  ; put the original sign bit back
+  %rounded = bitcast <4 x i32> %signedbits to <4 x float>
+  ret <4 x float> %rounded
+}
+
+;; Similarly, for implementations of the __floor* functions below, we have the
+;; bitcode from compiling the following source code...
+
+;export float Floor(float x) {
+;    float y = Round(x);
+;    unsigned int cmp = y > x ? 0xffffffff : 0;
+;    float delta = -1.f;
+;    unsigned int idelta = intbits(delta);
+;    idelta &= cmp;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  %nearest = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind  ; y = Round of x
+  %overshoot = fcmp ogt <4 x float> %nearest, %0  ; lanes where y ended up above x
+  %overshoot_mask = sext <4 x i1> %overshoot to <4 x i32>  ; all-ones in those lanes, zero elsewhere
+  %delta_bits = and <4 x i32> %overshoot_mask, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>  ; bit pattern of -1.0f in the flagged lanes, of 0.0f in the rest
+  %delta = bitcast <4 x i32> %delta_bits to <4 x float>
+  %floored = fadd <4 x float> %nearest, %delta  ; y + delta
+  ret <4 x float> %floored
+}
+
+;; And here is the code we compiled to get the __ceil* functions below
+;
+;export uniform float Ceil(uniform float x) {
+;    uniform float y = Round(x);
+;    uniform int yltx = y < x ? 0xffffffff : 0;
+;    uniform float delta = 1.f;
+;    uniform int idelta = intbits(delta);
+;    idelta &= yltx;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  %nearest = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind  ; y = Round of x
+  %undershoot = fcmp olt <4 x float> %nearest, %0  ; lanes where y ended up below x
+  %undershoot_mask = sext <4 x i1> %undershoot to <4 x i32>  ; all-ones in those lanes, zero elsewhere
+  %delta_bits = and <4 x i32> %undershoot_mask, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>  ; bit pattern of 1.0f in the flagged lanes, of 0.0f in the rest
+  %delta = bitcast <4 x i32> %delta_bits to <4 x float>
+  %ceiled = fadd <4 x float> %nearest, %delta  ; y + delta
+  ret <4 x float> %ceiled
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {  ; whole body, return included, comes from an m4 helper defined outside this chunk; it is handed the scalar symbol @round (presumably called once per lane -- TODO confirm)
+  unary1to4(double, @round)
+}
+
+define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {  ; whole body, return included, comes from an m4 helper defined outside this chunk; it is handed the scalar symbol @floor (presumably called once per lane -- TODO confirm)
+  unary1to4(double, @floor)
+}
+
+define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {  ; whole body, return included, comes from an m4 helper defined outside this chunk; it is handed the scalar symbol @ceil (presumably called once per lane -- TODO confirm)
+  unary1to4(double, @ceil)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; min/max
+
+; There is no blend instruction with SSE2, so we simulate it with bit
+; operations on i32s.  For these two vselect functions, for each
+; vector element, if the mask is on, we return the corresponding value
+; from %1, and otherwise return the value from %0.
+
+define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
+                                <4 x i32> %mask) nounwind readnone alwaysinline {
+  %from_new = and <4 x i32> %1, %mask  ; bits of %1 wherever the mask is set
+  %inv_mask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>  ; bitwise complement of the mask
+  %from_old = and <4 x i32> %0, %inv_mask  ; bits of %0 wherever the mask is clear
+  %blended = or <4 x i32> %from_old, %from_new  ; the two pieces never overlap, so or merges them
+  ret <4 x i32> %blended
+}
+
+define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
+                                    <4 x i32> %mask) nounwind readnone alwaysinline {
+  %old_bits = bitcast <4 x float> %0 to <4 x i32>  ; view both inputs as integer lanes
+  %new_bits = bitcast <4 x float> %1 to <4 x i32>
+  %picked_bits = call <4 x i32> @__vselect_i32(<4 x i32> %old_bits, <4 x i32> %new_bits, <4 x i32> %mask)  ; reuse the integer blend
+  %picked = bitcast <4 x i32> %picked_bits to <4 x float>  ; and view the result as floats again
+  ret <4 x float> %picked
+}
+
+
+; To do vector integer min and max, we do the vector compare and then sign
+; extend the i1 vector result to an i32 mask.  The __vselect does the
+; rest...
+
+define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %a_lt_b = icmp slt <4 x i32> %0, %1  ; signed per-lane compare
+  %pick_a = sext <4 x i1> %a_lt_b to <4 x i32>  ; widen each i1 to an all-ones or all-zeros lane
+  %lo = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %pick_a)  ; %0 where it is smaller, %1 otherwise
+  ret <4 x i32> %lo
+}
+
+define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  %a_lt_b = icmp slt i32 %0, %1  ; signed compare
+  %lo = select i1 %a_lt_b, i32 %0, i32 %1  ; %0 when it is smaller, %1 otherwise
+  ret i32 %lo
+}
+
+define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %a_gt_b = icmp sgt <4 x i32> %0, %1  ; signed per-lane compare
+  %pick_a = sext <4 x i1> %a_gt_b to <4 x i32>  ; widen each i1 to an all-ones or all-zeros lane
+  %hi = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %pick_a)  ; %0 where it is larger, %1 otherwise
+  ret <4 x i32> %hi
+}
+
+define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  %a_gt_b = icmp sgt i32 %0, %1  ; signed compare
+  %hi = select i1 %a_gt_b, i32 %0, i32 %1  ; %0 when it is larger, %1 otherwise
+  ret i32 %hi
+}
+
+; The functions for unsigned ints are similar, just with unsigned
+; comparison functions...
+
+define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %a_lt_b = icmp ult <4 x i32> %0, %1  ; unsigned per-lane compare
+  %pick_a = sext <4 x i1> %a_lt_b to <4 x i32>  ; widen each i1 to an all-ones or all-zeros lane
+  %lo = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %pick_a)  ; %0 where it is smaller, %1 otherwise
+  ret <4 x i32> %lo
+}
+
+define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  %a_lt_b = icmp ult i32 %0, %1  ; unsigned compare
+  %lo = select i1 %a_lt_b, i32 %0, i32 %1  ; %0 when it is smaller, %1 otherwise
+  ret i32 %lo
+}
+
+define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %a_gt_b = icmp ugt <4 x i32> %0, %1  ; unsigned per-lane compare
+  %pick_a = sext <4 x i1> %a_gt_b to <4 x i32>  ; widen each i1 to an all-ones or all-zeros lane
+  %hi = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %pick_a)  ; %0 where it is larger, %1 otherwise
+  ret <4 x i32> %hi
+}
+
+define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  %a_gt_b = icmp ugt i32 %0, %1  ; unsigned compare
+  %hi = select i1 %a_gt_b, i32 %0, i32 %1  ; %0 when it is larger, %1 otherwise
+  ret i32 %hi
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
+  %as_float = bitcast <4 x i32> %0 to <4 x float>  ; the movmsk.ps intrinsic is declared on a float vector
+  %lanebits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %as_float) nounwind readnone  ; packs the top bit of each lane into the low bits of an i32
+  %lanebits64 = zext i32 %lanebits to i64  ; widen to the i64 return type
+  ret i64 %lanebits64
+}
+
+define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
+  %as_float = bitcast <4 x i32> %0 to <4 x float>  ; the movmsk.ps intrinsic is declared on a float vector
+  %lanebits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %as_float) nounwind readnone  ; one bit per lane
+  %some_set = icmp ne i32 %lanebits, 0  ; true when at least one lane bit is on
+  ret i1 %some_set
+}
+
+define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
+  %as_float = bitcast <4 x i32> %0 to <4 x float>  ; the movmsk.ps intrinsic is declared on a float vector
+  %lanebits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %as_float) nounwind readnone  ; one bit per lane
+  %every_set = icmp eq i32 %lanebits, 15  ; 15 = 0b1111, i.e. all four lane bits on
+  ret i1 %every_set
+}
+
+define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
+  %as_float = bitcast <4 x i32> %0 to <4 x float>  ; the movmsk.ps intrinsic is declared on a float vector
+  %lanebits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %as_float) nounwind readnone  ; one bit per lane
+  %nothing_set = icmp eq i32 %lanebits, 0  ; true when no lane bit is on
+  ret i1 %nothing_set
+}
+
+define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
+  %upper = shufflevector <4 x float> %v, <4 x float> undef,
+                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>  ; bring lanes 2 and 3 down to positions 0 and 1
+  %pairsum = fadd <4 x float> %upper, %v  ; lane 0 = v2 + v0, lane 1 = v3 + v1; lanes 2 and 3 are not used
+  %even = extractelement <4 x float> %pairsum, i32 0
+  %odd = extractelement <4 x float> %pairsum, i32 1
+  %total = fadd float %even, %odd  ; combine the two partial sums
+  ret float %total
+}
+
+define float @__reduce_min_float(<4 x float>) nounwind readnone {  ; presumably the minimum over the 4 float lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform float min routines
+  reduce4(float, @__min_varying_float, @__min_uniform_float)
+}
+
+define float @__reduce_max_float(<4 x float>) nounwind readnone {  ; presumably the maximum over the 4 float lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform float max routines
+  reduce4(float, @__max_varying_float, @__max_uniform_float)
+}
+
+define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
+  %upper = shufflevector <4 x i32> %v, <4 x i32> undef,
+                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>  ; bring lanes 2 and 3 down to positions 0 and 1
+  %pairsum = add <4 x i32> %upper, %v  ; lane 0 = v2 + v0, lane 1 = v3 + v1; lanes 2 and 3 are not used
+  %even = extractelement <4 x i32> %pairsum, i32 0
+  %odd = extractelement <4 x i32> %pairsum, i32 1
+  %total = add i32 %even, %odd  ; combine the two partial sums
+  ret i32 %total
+}
+
+define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {  ; presumably the signed minimum over the 4 lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform int32 min routines
+  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {  ; presumably the signed maximum over the 4 lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform int32 max routines
+  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {  ; presumably the unsigned minimum over the 4 lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform uint32 min routines
+  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {  ; presumably the unsigned maximum over the 4 lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform uint32 max routines
+  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+
+define double @__reduce_add_double(<4 x double>) nounwind readnone {
+  %lower = shufflevector <4 x double> %0, <4 x double> undef,
+                         <2 x i32> <i32 0, i32 1>  ; lanes 0 and 1
+  %upper = shufflevector <4 x double> %0, <4 x double> undef,
+                         <2 x i32> <i32 2, i32 3>  ; lanes 2 and 3
+  %pairsum = fadd <2 x double> %lower, %upper  ; lane 0 = v0 + v2, lane 1 = v1 + v3
+  %even = extractelement <2 x double> %pairsum, i32 0
+  %odd = extractelement <2 x double> %pairsum, i32 1
+  %total = fadd double %even, %odd  ; combine the two partial sums
+  ret double %total
+}
+
+define double @__reduce_min_double(<4 x double>) nounwind readnone {  ; presumably the minimum over the 4 double lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform double min routines
+  reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define double @__reduce_max_double(<4 x double>) nounwind readnone {  ; presumably the maximum over the 4 double lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform double max routines
+  reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
+  %lower = shufflevector <4 x i64> %0, <4 x i64> undef,
+                         <2 x i32> <i32 0, i32 1>  ; lanes 0 and 1
+  %upper = shufflevector <4 x i64> %0, <4 x i64> undef,
+                         <2 x i32> <i32 2, i32 3>  ; lanes 2 and 3
+  %pairsum = add <2 x i64> %lower, %upper  ; lane 0 = v0 + v2, lane 1 = v1 + v3
+  %even = extractelement <2 x i64> %pairsum, i32 0
+  %odd = extractelement <2 x i64> %pairsum, i32 1
+  %total = add i64 %even, %odd  ; combine the two partial sums
+  ret i64 %total
+}
+
+define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {  ; presumably the signed minimum over the 4 lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform int64 min routines
+  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {  ; presumably the signed maximum over the 4 lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform int64 max routines
+  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {  ; presumably the unsigned minimum over the 4 lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform uint64 min routines
+  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {  ; presumably the unsigned maximum over the 4 lanes; body and return come from an m4 helper (defined outside this chunk) that is handed the varying and uniform uint64 max routines
+  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+reduce_equal(4)
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>, 
+                                      <4 x i32> %mask) nounwind alwaysinline {
+  %current = load <4 x i32> * %0, align 4  ; read what is in memory right now
+  %merged = call <4 x i32> @__vselect_i32(<4 x i32> %current, <4 x i32> %1, <4 x i32> %mask)  ; %1 in lanes with the mask on, the old contents elsewhere
+  store <4 x i32> %merged, <4 x i32> * %0, align 4  ; write all four lanes back
+  ret void
+}
+
+define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
+                                      <4 x i32> %mask) nounwind alwaysinline {
+  %stored = load <4 x i64>* %ptr, align 8
+
+  ; The 4-wide 64-bit blend is carried out as two 4-wide 32-bit blends: each
+  ; pair of i64 lanes is reinterpreted as <4 x float>, and every mask lane is
+  ; used twice so that it covers both 32-bit halves of its i64 element.
+  ; --- lanes 0 and 1 ---
+  %stored_lo  = shufflevector <4 x i64> %stored, <4 x i64> undef,
+                              <2 x i32> <i32 0, i32 1>
+  %stored_lo_f = bitcast <2 x i64> %stored_lo to <4 x float>
+  %incoming_lo  = shufflevector <4 x i64> %new, <4 x i64> undef,
+                                <2 x i32> <i32 0, i32 1>
+  %incoming_lo_f = bitcast <2 x i64> %incoming_lo to <4 x float>
+  ; mask lanes 0 and 1, each one repeated
+  %mask_lo = shufflevector <4 x i32> %mask, <4 x i32> undef,
+                           <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  ; choose between the stored and the incoming data
+  %blend_lo_f = call <4 x float> @__vselect_float(<4 x float> %stored_lo_f, <4 x float> %incoming_lo_f, <4 x i32> %mask_lo)
+  %blend_lo = bitcast <4 x float> %blend_lo_f to <2 x i64>
+
+  ; --- lanes 2 and 3 ---
+  %stored_hi  = shufflevector <4 x i64> %stored, <4 x i64> undef,
+                              <2 x i32> <i32 2, i32 3>
+  %stored_hi_f = bitcast <2 x i64> %stored_hi to <4 x float>
+  %incoming_hi  = shufflevector <4 x i64> %new, <4 x i64> undef,
+                                <2 x i32> <i32 2, i32 3>
+  %incoming_hi_f = bitcast <2 x i64> %incoming_hi to <4 x float>
+  ; mask lanes 2 and 3, each one repeated
+  %mask_hi = shufflevector <4 x i32> %mask, <4 x i32> undef,
+                           <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  ; choose between the stored and the incoming data
+  %blend_hi_f = call <4 x float> @__vselect_float(<4 x float> %stored_hi_f, <4 x float> %incoming_hi_f, <4 x i32> %mask_hi)
+  %blend_hi = bitcast <4 x float> %blend_hi_f to <2 x i64>
+
+  ; glue the two 2-wide results back into one <4 x i64> and write it out
+  %merged = shufflevector <2 x i64> %blend_lo, <2 x i64> %blend_hi,
+                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i64> %merged, <4 x i64> * %ptr, align 8
+  ret void
+}
+
+
+masked_store_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  %est = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
+  ; refine the hardware estimate with one Newton-Raphson step:
+  ;  est = rcp of v
+  ;  result = est * (2 - v * est)
+  %v_est = fmul <4 x float> %0, %est
+  %residual = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_est
+  %refined = fmul <4 x float> %est, %residual
+  ret <4 x float> %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
+  ; start from the hardware reciprocal-square-root estimate
+  %est = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
+  ; then apply one Newton-Raphson step:
+  ;  result = 0.5 * est * (3 - (v * est) * est)
+  %v_est = fmul <4 x float> %v, %est
+  %v_est2 = fmul <4 x float> %v_est, %est
+  %residual = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_est2
+  %scaled = fmul <4 x float> %est, %residual
+  %refined = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %scaled
+  ret <4 x float> %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  %root = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)  ; thin wrapper: one call to the 4-wide SSE sqrt intrinsic
+  ret <4 x float> %root
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; svml stuff
+
+declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind  ; not readnone: the first argument is an output pointer that the wrapper in this file relies on being written
+declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
+  %sine = call <4 x float> @__svml_sinf4(<4 x float> %0)  ; forward straight to the 4-wide external routine
+  ret <4 x float> %sine
+}
+
+define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
+  %cosine = call <4 x float> @__svml_cosf4(<4 x float> %0)  ; forward straight to the 4-wide external routine
+  ret <4 x float> %cosine
+}
+
+define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind alwaysinline {  ; must not be readnone: it stores through %1 and hands %2 to the callee as an output pointer
+  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)  ; callee receives %2 as its pointer argument (presumably where the cosine lands -- TODO confirm)
+  store <4 x float> %s, <4 x float> * %1  ; the returned vector (presumably the sine) is written to *%1
+  ret void
+}
+
+define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
+  %tangent = call <4 x float> @__svml_tanf4(<4 x float> %0)  ; forward straight to the 4-wide external routine
+  ret <4 x float> %tangent
+}
+
+define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
+  %angle = call <4 x float> @__svml_atanf4(<4 x float> %0)  ; forward straight to the 4-wide external routine
+  ret <4 x float> %angle
+}
+
+define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
+  %angle = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)  ; both arguments are passed through in the same order
+  ret <4 x float> %angle
+}
+
+define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
+  %expval = call <4 x float> @__svml_expf4(<4 x float> %0)  ; forward straight to the 4-wide external routine
+  ret <4 x float> %expval
+}
+
+define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
+  %logval = call <4 x float> @__svml_logf4(<4 x float> %0)  ; forward straight to the 4-wide external routine
+  ret <4 x float> %logval
+}
+
+define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
+  %power = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)  ; both arguments are passed through in the same order
+  ret <4 x float> %power
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %hi = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)  ; thin wrapper: one call to the 4-wide SSE max intrinsic
+  ret <4 x float> %hi
+}
+
+define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %lo = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)  ; thin wrapper: one call to the 4-wide SSE min intrinsic
+  ret <4 x float> %lo
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {  ; 4-wide double square root built on the 2-wide SSE2 sqrt.pd intrinsic
+  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ret <4 x double> %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it applies the intrinsic to each 2-lane half of %0)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {  ; 4-wide double minimum built on the 2-wide SSE2 min.pd intrinsic
+  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+  ret <4 x double> %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it applies the intrinsic to matching 2-lane halves of %0 and %1)
+}
+
+define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {  ; 4-wide double maximum built on the 2-wide SSE2 max.pd intrinsic
+  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+  ret <4 x double> %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it applies the intrinsic to matching 2-lane halves of %0 and %1)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+masked_store_blend_8_16_by_4()
+
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+
+masked_load(i8,  1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(float, 4)
+masked_load(i64, 8)
+masked_load(double, 8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; define these with the macros from util.m4
+
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
+
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
--- a/builtins/target-sse4-common.ll
+++ b/builtins/target-sse4-common.ll
@@ -0,0 +1,277 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+ctlztz()
+define_prefetches()
+define_shuffles()
+aossoa()
+rdrand_decls()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; roundss immediate 8 = round to nearest (0b00) | suppress precision exceptions (0b1000)
+  ; The roundss intrinsic has an awkward two-operand form; the docs say:
+  ;
+  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
+  ;
+  ;  b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
+  ;  on b0. The higher order 96 bits are copied directly from input parameter a. The
+  ;  return value is described by the following equations:
+  ;
+  ;  r0 = RND(b0)
+  ;  r1 = a1
+  ;  r2 = a2
+  ;  r3 = a3
+  ;
+  ;  Only r0 is used here, so the value passed for a is irrelevant and the
+  ;  same register is supplied for both operands.  Likewise only element 0
+  ;  of b has any effect on the result.
+  %vec = insertelement <4 x float> undef, float %0, i32 0
+  %nearest = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %vec, <4 x float> %vec, i32 8)
+  %lane0 = extractelement <4 x float> %nearest, i32 0
+  ret float %lane0
+}
+
+define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  ; only element 0 of the operand affects element 0 of the result, so the other lanes stay undef
+  %vec = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss immediate 9 = round down (0b01) | suppress precision exceptions (0b1000), i.e. 0b1001
+  %floored = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %vec, <4 x float> %vec, i32 9)
+  %lane0 = extractelement <4 x float> %floored, i32 0
+  ret float %lane0
+}
+
+define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  ; only element 0 of the operand affects element 0 of the result, so the other lanes stay undef
+  %vec = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss immediate 10 = round up (0b10) | suppress precision exceptions (0b1000), i.e. 0b1010
+  %ceiled = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %vec, <4 x float> %vec, i32 10)
+  %lane0 = extractelement <4 x float> %ceiled, i32 0
+  ret float %lane0
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %vec = insertelement <2 x double> undef, double %0, i32 0  ; only lane 0 is meaningful
+  %nearest = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %vec, <2 x double> %vec, i32 8)  ; roundsd immediate 8 = round to nearest (0b00) | suppress precision exceptions (0b1000)
+  %lane0 = extractelement <2 x double> %nearest, i32 0
+  ret double %lane0
+}
+
+define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; only element 0 of the operand affects element 0 of the result, so the other lane stays undef
+  %vec = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd immediate 9 = round down (0b01) | suppress precision exceptions (0b1000), i.e. 0b1001
+  %floored = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %vec, <2 x double> %vec, i32 9)
+  %lane0 = extractelement <2 x double> %floored, i32 0
+  ret double %lane0
+}
+
+define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; only element 0 of the operand affects element 0 of the result, so the other lane stays undef
+  %vec = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd immediate 10 = round up (0b10) | suppress precision exceptions (0b1000), i.e. 0b1010
+  %ceiled = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %vec, <2 x double> %vec, i32 10)
+  %lane0 = extractelement <2 x double> %ceiled, i32 0
+  ret double %lane0
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+  ; get the rcpss estimate: the scalar goes in lane 0 and the answer is read back from lane 0
+  %vec = insertelement <4 x float> undef, float %0, i32 0
+  %est_vec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vec)
+  %est = extractelement <4 x float> %est_vec, i32 0
+
+  ; one Newton-Raphson step to sharpen the estimate: est * (2 - v * est)
+  %v_est = fmul float %0, %est
+  %residual = fsub float 2., %v_est
+  %refined = fmul float %est, %residual
+  ret float %refined
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ; get the rsqrtss estimate: the scalar goes in lane 0 and the answer is read back from lane 0
+  %vec = insertelement <4 x float> undef, float %0, i32 0
+  %est_vec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %vec)
+  %est = extractelement <4 x float> %est_vec, i32 0
+
+  ; one Newton-Raphson step to sharpen the estimate:
+  ;  result = 0.5 * est * (3 - (v * est) * est)
+  %v_est = fmul float %0, %est
+  %v_est2 = fmul float %v_est, %est
+  %residual = fsub float 3., %v_est2
+  %scaled = fmul float %est, %residual
+  %refined = fmul float 0.5, %scaled
+  ret float %refined
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {  ; scalar float square root through the 4-wide-typed SSE sqrt.ss intrinsic
+  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
+  ret float %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it puts %0 in lane 0, calls the intrinsic and extracts lane 0 -- TODO confirm)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fast math mode
+
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
+
+define void @__fastmath() nounwind alwaysinline {
+  %slot = alloca i32  ; scratch word that holds the control register image
+  %slot_i8 = bitcast i32 * %slot to i8 *  ; both intrinsics are declared with an i8 pointer
+  call void @llvm.x86.sse.stmxcsr(i8 * %slot_i8)  ; write the current MXCSR value into the slot
+  %csr = load i32 *%slot
+
+  ; set DAZ (value 64) and FTZ (value 32768): 64 + 32768 = 32832
+  %csr_fast = or i32 %csr, 32832
+  store i32 %csr_fast, i32 *%slot
+  call void @llvm.x86.sse.ldmxcsr(i8 * %slot_i8)  ; load the modified image back into MXCSR
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {  ; scalar float maximum through the 4-wide-typed SSE max.ss intrinsic
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
+  ret float %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it puts the scalars in lane 0, calls the intrinsic and extracts lane 0 -- TODO confirm)
+}
+
+define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {  ; scalar float minimum through the 4-wide-typed SSE min.ss intrinsic
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
+  ret float %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it puts the scalars in lane 0, calls the intrinsic and extracts lane 0 -- TODO confirm)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define double @__sqrt_uniform_double(double) nounwind alwaysinline {  ; scalar double square root through the 2-wide-typed SSE2 sqrt.sd intrinsic
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it puts %0 in lane 0, calls the intrinsic and extracts lane 0 -- TODO confirm)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define double @__min_uniform_double(double, double) nounwind readnone {  ; scalar double minimum through the 2-wide-typed SSE2 min.sd intrinsic
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it puts the scalars in lane 0, calls the intrinsic and extracts lane 0 -- TODO confirm)
+}
+
+
+define double @__max_uniform_double(double, double) nounwind readnone {  ; scalar double maximum through the 2-wide-typed SSE2 max.sd intrinsic
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it puts the scalars in lane 0, calls the intrinsic and extracts lane 0 -- TODO confirm)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int32 min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {  ; scalar signed minimum through the 4-wide SSE4.1 pminsd intrinsic
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret i32 %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it puts the scalars in lane 0, calls the intrinsic and extracts lane 0 -- TODO confirm)
+}
+
+define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {  ; scalar signed maximum through the 4-wide SSE4.1 pmaxsd intrinsic
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret i32 %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it puts the scalars in lane 0, calls the intrinsic and extracts lane 0 -- TODO confirm)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unsigned int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {  ; scalar unsigned minimum through the 4-wide SSE4.1 pminud intrinsic
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret i32 %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it puts the scalars in lane 0, calls the intrinsic and extracts lane 0 -- TODO confirm)
+}
+
+define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {  ; scalar unsigned maximum through the 4-wide SSE4.1 pmaxud intrinsic
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret i32 %ret  ; %ret is named in the m4 helper call above (helper defined outside this chunk; presumably it puts the scalars in lane 0, calls the intrinsic and extracts lane 0 -- TODO confirm)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+
+define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+  %ones = call i32 @llvm.ctpop.i32(i32 %0)  ; thin wrapper around the generic 32-bit ctpop intrinsic
+  ret i32 %ones
+}
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  %ones = call i64 @llvm.ctpop.i64(i64 %0)  ; thin wrapper around the generic 64-bit ctpop intrinsic
+  ret i64 %ones
+}
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -0,0 +1,631 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+
+;; This file defines the target for "double-pumped" SSE4, i.e. running
+;; with 8-wide vectors
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; standard 8-wide definitions from m4 macros
+
+; m4 setup: fix the vector width at 8 lanes and the mask element type at i32
+; before the shared macro library is pulled in.
+define(`WIDTH',`8')
+define(`MASK',`i32')
+include(`util.m4')
+
+; Instantiate pieces of the shared macro library for this configuration; their
+; bodies are defined outside this chunk.
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+; 4-wide SSE4 helpers, also pulled in by the plain 4-wide SSE4 target.
+include(`target-sse4-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+; Reciprocal of an 8-wide float vector.  The helper macro below (defined outside
+; this chunk) is given the 4-wide rcp.ps intrinsic and is expected to produce the
+; 8-wide estimate %call; one Newton-Raphson step then refines that estimate.
+define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ;  float iv = __rcp_v(v);
+  ;  return iv * (2. - v * iv);
+
+  unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
+  ; do one N-R iteration
+  ; v * iv
+  %v_iv = fmul <8 x float> %0, %call
+  ; 2 - v * iv
+  %two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
+                                 float 2., float 2., float 2., float 2.>, %v_iv  
+  ; iv * (2 - v * iv)
+  %iv_mul = fmul <8 x float> %call, %two_minus
+  ret <8 x float> %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+; Reciprocal square root of an 8-wide float vector.  The helper macro below
+; (defined outside this chunk) is given the 4-wide rsqrt.ps intrinsic and is
+; expected to produce the 8-wide estimate %is, which is then refined with the
+; single correction step spelled out in the comments.
+define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+  ;  float is = __rsqrt_v(v);
+  unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
+  ;  return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul <8 x float> %v, %is
+  %v_is_is = fmul <8 x float> %v_is, %is
+  ; 3 - (v * is) * is
+  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
+                                 float 3., float 3., float 3., float 3.>, %v_is_is
+  %is_mul = fmul <8 x float> %is, %three_sub
+  ; final scale by 0.5
+  %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                                  float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ret <8 x float> %half_scale
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+; Square root of an 8-wide float vector.  The helper macro below (defined
+; outside this chunk) is given the 4-wide sqrt.ps intrinsic and is expected to
+; produce the 8-wide result %call.
+define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
+  ret <8 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; svml stuff
+
+; External 4-wide svml entry points.  All of them are pure functions of their
+; arguments except the sincos variant: it returns the sine and writes the cosine
+; through its pointer argument, so it must not carry the readnone attribute.
+declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind
+declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+
+
+; 8-wide sine; the helper macro below (defined outside this chunk) is given the
+; 4-wide svml routine and is expected to produce the 8-wide result %ret.
+define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_sinf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide cosine; same scheme as the sine wrapper.
+define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_cosf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide combined sine and cosine.
+;   %0 : input angles
+;   %1 : destination for the 8 sine values
+;   %2 : destination for the 8 cosine values
+; The function stores through %1 and %2, so it is deliberately NOT marked
+; readnone: with that attribute the optimizer would be free to discard the
+; stores or the whole call.
+define void @__svml_sincos(<8 x float>, <8 x float> *,
+                                    <8 x float> *) nounwind alwaysinline {
+  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
+  %a = shufflevector <8 x float> %0, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %b = shufflevector <8 x float> %0, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ; each call returns the sine and writes the cosine into a stack temporary
+  %cospa = alloca <4 x float>
+  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
+
+  %cospb = alloca <4 x float>
+  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
+
+  ; stitch the two sine halves back together and hand them to the caller
+  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
+         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 4, i32 5, i32 6, i32 7>
+  store <8 x float> %sin, <8 x float> * %1
+
+  ; read the cosine halves back from the temporaries and do the same
+  %cosa = load <4 x float> * %cospa
+  %cosb = load <4 x float> * %cospb
+  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
+         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 4, i32 5, i32 6, i32 7>
+  store <8 x float> %cos, <8 x float> * %2
+
+  ret void
+}
+
+; 8-wide tangent; the helper macro below (defined outside this chunk) is given
+; the 4-wide svml routine and is expected to produce the 8-wide result %ret.
+define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_tanf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide arctangent; the helper macro below (defined outside this chunk) is
+; given the 4-wide svml routine and is expected to produce the 8-wide %ret.
+define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_atanf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide two-argument arctangent; the helper macro below (defined outside this
+; chunk) is given the 4-wide svml routine plus both operands and is expected to
+; produce the 8-wide result %ret.
+define <8 x float> @__svml_atan2(<8 x float>,
+                                          <8 x float>) nounwind readnone alwaysinline {
+  binary4to8(ret, float, @__svml_atan2f4, %0, %1)
+  ret <8 x float> %ret
+}
+
+; 8-wide exponential; the helper macro below (defined outside this chunk) is
+; given the 4-wide svml routine and is expected to produce the 8-wide %ret.
+define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_expf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide logarithm; the helper macro below (defined outside this chunk) is
+; given the 4-wide svml routine and is expected to produce the 8-wide %ret.
+define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_logf4, %0)
+  ret <8 x float> %ret
+}
+
+; 8-wide power function; the helper macro below (defined outside this chunk) is
+; given the 4-wide svml routine plus both operands and is expected to produce
+; the 8-wide result %ret.
+define <8 x float> @__svml_pow(<8 x float>,
+                                        <8 x float>) nounwind readnone alwaysinline {
+  binary4to8(ret, float, @__svml_powf4, %0, %1)
+  ret <8 x float> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+; Per-lane maximum of two 8-wide float vectors.  The helper macro below (defined
+; outside this chunk) is given the 4-wide max.ps intrinsic and both operands and
+; is expected to produce the 8-wide result %call.
+define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
+  binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
+  ret <8 x float> %call
+}
+
+; Per-lane minimum of two 8-wide float vectors.  The helper macro below (defined
+; outside this chunk) is given the 4-wide min.ps intrinsic and both operands and
+; is expected to produce the 8-wide result %call.
+define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
+  binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
+  ret <8 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int32 min/max
+
+; Per-lane signed minimum of two 8-wide i32 vectors, built from the 4-wide
+; pminsd intrinsic by the helper macro below (defined outside this chunk), which
+; is expected to produce the 8-wide result %call.
+define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <8 x i32> %call
+}
+
+; Per-lane signed maximum of two 8-wide i32 vectors, built from the 4-wide
+; pmaxsd intrinsic by the helper macro below (defined outside this chunk), which
+; is expected to produce the 8-wide result %call.
+define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <8 x i32> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unsigned int min/max
+
+; Per-lane unsigned minimum of two 8-wide i32 vectors, built from the 4-wide
+; pminud intrinsic by the helper macro below (defined outside this chunk), which
+; is expected to produce the 8-wide result %call.
+define <8 x i32> @__min_varying_uint32(<8 x i32>,
+                                       <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <8 x i32> %call
+}
+
+; Per-lane unsigned maximum of two 8-wide i32 vectors, built from the 4-wide
+; pmaxud intrinsic by the helper macro below (defined outside this chunk), which
+; is expected to produce the 8-wide result %call.
+define <8 x i32> @__max_varying_uint32(<8 x i32>,
+                                       <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <8 x i32> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+; Collects one bit per mask lane into the low 8 bits of an i64.  The SSE movmskps
+; instruction only covers 4 lanes, so the mask is handled as two halves and the
+; bits of the upper half are moved up by 4 positions before merging.
+define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+  %mask_ps = bitcast <8 x i32> %0 to <8 x float>
+
+  ; lanes 0-3 and lanes 4-7
+  %lanes_lo = shufflevector <8 x float> %mask_ps, <8 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %lanes_hi = shufflevector <8 x float> %mask_ps, <8 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ; one 4-bit result per half
+  %bits_lo = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lanes_lo) nounwind readnone
+  %bits_hi = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lanes_hi) nounwind readnone
+
+  ; the upper half lands in bits 4-7 of the merged value
+  %bits_hi_up = shl i32 %bits_hi, 4
+  %bits_all = or i32 %bits_lo, %bits_hi_up
+  %bits_wide = zext i32 %bits_all to i64
+  ret i64 %bits_wide
+}
+
+; True when at least one of the 8 mask lanes is on.  The lane bits are gathered
+; with two 4-wide movmskps calls, the upper half is moved up by 4 positions, and
+; the merged 8-bit value is tested against zero.
+define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
+  %mask_ps = bitcast <8 x i32> %0 to <8 x float>
+
+  ; lanes 0-3 and lanes 4-7
+  %lanes_lo = shufflevector <8 x float> %mask_ps, <8 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %lanes_hi = shufflevector <8 x float> %mask_ps, <8 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ; one 4-bit result per half
+  %bits_lo = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lanes_lo) nounwind readnone
+  %bits_hi = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lanes_hi) nounwind readnone
+
+  ; the upper half lands in bits 4-7 of the merged value
+  %bits_hi_up = shl i32 %bits_hi, 4
+  %bits_all = or i32 %bits_lo, %bits_hi_up
+  %some_on = icmp ne i32 %bits_all, 0
+  ret i1 %some_on
+}
+
+; True when every one of the 8 mask lanes is on.  The lane bits are gathered
+; with two 4-wide movmskps calls, the upper half is moved up by 4 positions, and
+; the merged value is compared with 255, i.e. all 8 low bits set.
+define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
+  %mask_ps = bitcast <8 x i32> %0 to <8 x float>
+
+  ; lanes 0-3 and lanes 4-7
+  %lanes_lo = shufflevector <8 x float> %mask_ps, <8 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %lanes_hi = shufflevector <8 x float> %mask_ps, <8 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ; one 4-bit result per half
+  %bits_lo = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lanes_lo) nounwind readnone
+  %bits_hi = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lanes_hi) nounwind readnone
+
+  ; the upper half lands in bits 4-7 of the merged value
+  %bits_hi_up = shl i32 %bits_hi, 4
+  %bits_all = or i32 %bits_lo, %bits_hi_up
+  %every_on = icmp eq i32 %bits_all, 255
+  ret i1 %every_on
+}
+
+; True when not a single one of the 8 mask lanes is on.  The lane bits are
+; gathered with two 4-wide movmskps calls, the upper half is moved up by 4
+; positions, and the merged value is compared with zero.
+define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
+  %mask_ps = bitcast <8 x i32> %0 to <8 x float>
+
+  ; lanes 0-3 and lanes 4-7
+  %lanes_lo = shufflevector <8 x float> %mask_ps, <8 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %lanes_hi = shufflevector <8 x float> %mask_ps, <8 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ; one 4-bit result per half
+  %bits_lo = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lanes_lo) nounwind readnone
+  %bits_hi = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lanes_hi) nounwind readnone
+
+  ; the upper half lands in bits 4-7 of the merged value
+  %bits_hi_up = shl i32 %bits_hi, 4
+  %bits_all = or i32 %bits_lo, %bits_hi_up
+  %all_off = icmp eq i32 %bits_all, 0
+  ret i1 %all_off
+}
+
+; Horizontal minimum over the 8 float lanes.  The entire body, return included,
+; comes from the helper macro below (defined outside this chunk); it is handed a
+; 4-wide vector op and a scalar op, presumably to go from 8 lanes to 4 and then
+; from 4 lanes to a single value -- TODO confirm against the macro definition.
+define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
+  reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
+}
+
+; Horizontal maximum over the 8 float lanes.  The entire body, return included,
+; comes from the helper macro below (defined outside this chunk); it is handed a
+; 4-wide vector op and a scalar op, presumably to go from 8 lanes to 4 and then
+; from 4 lanes to a single value -- TODO confirm against the macro definition.
+define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
+  reduce8by4(float, @llvm.x86.sse.max.ps, @__max_uniform_float)
+}
+
+; helper function for reduce_add_int32
+; Lane-wise sum of two 4-wide i32 vectors; exists so that the i32 add reduction
+; has a callable 4-wide vector op.
+define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
+                                   <4 x i32> %v1) nounwind readnone alwaysinline {
+  %lane_sums = add <4 x i32> %v0, %v1
+  ret <4 x i32> %lane_sums
+}
+
+; helper function for reduce_add_int32
+; Scalar i32 sum; exists so that the i32 add reduction has a callable scalar op.
+define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
+  %scalar_sum = add i32 %0, %1
+  ret i32 %scalar_sum
+}
+
+; Horizontal sum over the 8 i32 lanes.  The entire body, return included, comes
+; from the helper macro below (defined outside this chunk), which is handed the
+; 4-wide and scalar add helpers defined just before this function.
+define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8by4(i32, @__vec4_add_int32, @__add_int32)
+}
+
+; Horizontal signed minimum over the 8 i32 lanes.  The entire body, return
+; included, comes from the helper macro below (defined outside this chunk),
+; which is handed the 4-wide pminsd intrinsic and the scalar min routine.
+define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8by4(i32, @llvm.x86.sse41.pminsd, @__min_uniform_int32)
+}
+
+; Horizontal signed maximum over the 8 i32 lanes.  The entire body, return
+; included, comes from the helper macro below (defined outside this chunk),
+; which is handed the 4-wide pmaxsd intrinsic and the scalar max routine.
+define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
+}
+
+; Horizontal unsigned minimum over the 8 i32 lanes.  The entire body, return
+; included, comes from the helper macro below (defined outside this chunk),
+; which is handed the 4-wide pminud intrinsic and the scalar unsigned min routine.
+define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
+}
+
+; Horizontal unsigned maximum over the 8 i32 lanes.  The entire body, return
+; included, comes from the helper macro below (defined outside this chunk),
+; which is handed the 4-wide pmaxud intrinsic and the scalar unsigned max routine.
+define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
+}
+
+; Lane-wise sum of two 4-wide double vectors; serves as the vector op of the
+; double add reduction.
+define <4 x double> @__add_varying_double(<4 x double>,
+                                     <4 x double>) nounwind readnone alwaysinline {
+  %lane_sums = fadd <4 x double> %0, %1
+  ret <4 x double> %lane_sums
+}
+
+; Scalar double sum; serves as the scalar op of the double add reduction.
+define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
+  %scalar_sum = fadd double %0, %1
+  ret double %scalar_sum
+}
+
+; Horizontal sum over the 8 double lanes.  The entire body, return included,
+; comes from the helper macro below (defined outside this chunk), which is
+; handed the 4-wide and scalar double add helpers defined just before this.
+define double @__reduce_add_double(<8 x double>) nounwind readnone {
+  reduce8by4(double, @__add_varying_double, @__add_uniform_double)
+}
+
+; Horizontal minimum over the 8 double lanes.  The entire body, return included,
+; comes from the helper macro below (defined outside this chunk), which is
+; handed the 8-wide and scalar double min routines.
+define double @__reduce_min_double(<8 x double>) nounwind readnone {
+  reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+; Horizontal maximum over the 8 double lanes.  The entire body, return included,
+; comes from the helper macro below (defined outside this chunk), which is
+; handed the 8-wide and scalar double max routines.
+define double @__reduce_max_double(<8 x double>) nounwind readnone {
+  reduce8(double, @__max_varying_double, @__max_uniform_double)
+}
+
+; Lane-wise sum of two 4-wide i64 vectors; serves as the vector op of the i64
+; add reduction.
+define <4 x i64> @__add_varying_int64(<4 x i64>,
+                                      <4 x i64>) nounwind readnone alwaysinline {
+  %lane_sums = add <4 x i64> %0, %1
+  ret <4 x i64> %lane_sums
+}
+
+; Scalar i64 sum; serves as the scalar op of the i64 add reduction.
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %scalar_sum = add i64 %0, %1
+  ret i64 %scalar_sum
+}
+
+; Horizontal sum over the 8 i64 lanes.  The entire body, return included, comes
+; from the helper macro below (defined outside this chunk), which is handed the
+; 4-wide and scalar i64 add helpers defined just before this.
+define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
+  reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+; Horizontal signed minimum over the 8 i64 lanes.  The entire body, return
+; included, comes from the helper macro below (defined outside this chunk),
+; which is handed the varying and uniform signed i64 min routines.
+define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+; Horizontal signed maximum over the 8 i64 lanes.  The entire body, return
+; included, comes from the helper macro below (defined outside this chunk),
+; which is handed the varying and uniform signed i64 max routines.
+define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+; Horizontal unsigned minimum over the 8 i64 lanes.  The entire body, return
+; included, comes from the helper macro below (defined outside this chunk),
+; which is handed the varying and uniform unsigned i64 min routines.
+define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+; Horizontal unsigned maximum over the 8 i64 lanes.  The entire body, return
+; included, comes from the helper macro below (defined outside this chunk),
+; which is handed the varying and uniform unsigned i64 max routines.
+define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+; Presumably instantiates the equality reduction routines for 8 lanes; the body
+; comes from a macro defined outside this chunk -- TODO confirm.
+reduce_equal(8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+
+; Instantiate the masked load routines, one per element type.  The bodies come
+; from a macro defined outside this chunk; the second argument equals the
+; element size in bytes here -- TODO confirm what the macro uses it for.
+masked_load(i8,  1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(float, 4)
+masked_load(i64, 8)
+masked_load(double, 8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; Per-type gather routines, generated by a macro defined outside this chunk.
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
+
+; Per-type scatter routines, likewise generated by a macro defined elsewhere.
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float rounding
+
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+; Rounds each of the 8 float lanes to the nearest integral value.  The helper
+; macro below (defined outside this chunk) supplies the body and the return; it
+; receives the input vector and the roundps mode immediate.
+define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  round4to8(%0, 8)
+}
+
+; Rounds each of the 8 float lanes down.  The helper macro below (defined
+; outside this chunk) supplies the body and the return; it receives the input
+; vector and the roundps mode immediate.
+define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  round4to8(%0, 9)
+}
+
+; Rounds each of the 8 float lanes up.  The helper macro below (defined outside
+; this chunk) supplies the body and the return; it receives the input vector
+; and the roundps mode immediate.
+define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  round4to8(%0, 10)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+
+; Rounds each of the 8 double lanes to the nearest integral value.  The helper
+; macro below (defined outside this chunk) supplies the body and the return; it
+; receives the input vector and the roundpd mode immediate.
+define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round mode nearest 0b00 | do not signal precision exceptions 0b1000 = 8
+  round2to8double(%0, 8)
+}
+
+; Rounds each of the 8 double lanes down.  The helper macro below (defined
+; outside this chunk) supplies the body and the return; it receives the input
+; vector and the roundpd mode immediate.
+define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  round2to8double(%0, 9)
+}
+
+; Rounds each of the 8 double lanes up.  The helper macro below (defined outside
+; this chunk) supplies the body and the return; it receives the input vector
+; and the roundpd mode immediate.
+define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  round2to8double(%0, 10)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+; Horizontal sum over the 8 float lanes.  The two 4-wide halves are added lane
+; by lane, one hadd.ps pass folds neighbouring lanes, and the two remaining
+; partial sums are added as scalars.
+define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
+  ; lanes 0-3 and lanes 4-7
+  %half_lo = shufflevector <8 x float> %0, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %half_hi = shufflevector <8 x float> %0, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ; 8 lanes down to 4
+  %pair_sums = fadd <4 x float> %half_lo, %half_hi
+
+  ; 4 lanes down to 2: elements 0 and 1 hold the sums of lanes 0+1 and 2+3
+  %hadd_out = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %pair_sums, <4 x float> %pair_sums)
+  %sum_01 = extractelement <4 x float> %hadd_out, i32 0
+  %sum_23 = extractelement <4 x float> %hadd_out, i32 1
+
+  ; 2 down to 1
+  %total = fadd float %sum_01, %sum_23
+  ret float %total
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+; Per-type masked store routines for the integer types, generated by a macro
+; defined outside this chunk.
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)
+
+; Judging by the macro name, the blend-based masked store variants for the 8
+; and 16 bit types at 8 lanes -- TODO confirm against the macro definition.
+masked_store_blend_8_16_by_8()
+
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
+                                             <4 x float>) nounwind readnone
+
+; Masked store of 8 i32 lanes done as read, blend, write.
+;   %0    : destination, read in full and then rewritten in full
+;   %1    : new values
+;   %mask : per-lane selector handed to blendvps
+; blendvps is 4 wide, so each half of the vectors is blended separately.
+define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, 
+                                      <8 x i32> %mask) nounwind alwaysinline {
+  ; view current contents, replacement values and selector as floats
+  %cur_i = load <8 x i32>* %0, align 4
+  %cur_f = bitcast <8 x i32> %cur_i to <8 x float>
+  %repl_f = bitcast <8 x i32> %1 to <8 x float>
+  %sel_f = bitcast <8 x i32> %mask to <8 x float>
+
+  ; lanes 0-3
+  %cur_lo = shufflevector <8 x float> %cur_f, <8 x float> undef,
+                <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %repl_lo = shufflevector <8 x float> %repl_f, <8 x float> undef,
+                <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %sel_lo = shufflevector <8 x float> %sel_f, <8 x float> undef,
+                <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %out_lo = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %cur_lo, <4 x float> %repl_lo,
+                                                      <4 x float> %sel_lo)
+
+  ; lanes 4-7
+  %cur_hi = shufflevector <8 x float> %cur_f, <8 x float> undef,
+                <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %repl_hi = shufflevector <8 x float> %repl_f, <8 x float> undef,
+                <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %sel_hi = shufflevector <8 x float> %sel_f, <8 x float> undef,
+                <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %out_hi = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %cur_hi, <4 x float> %repl_hi,
+                                                      <4 x float> %sel_hi)
+
+  ; reassemble the 8-wide value and write it back
+  %out_f = shufflevector <4 x float> %out_lo, <4 x float> %out_hi,
+               <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %out_i = bitcast <8 x float> %out_f to <8 x i32>
+  store <8 x i32> %out_i, <8 x i32>* %0, align 4
+  ret void
+}
+
+; Masked store of 8 i64 lanes done as read, blend, write.
+;   %ptr  : destination, read in full and then rewritten in full
+;   %new  : new values
+;   %mask : per-lane selector, one i32 per 64-bit lane
+; Two 64-bit lanes at a time are reinterpreted as <4 x float> and run through
+; blendvps.  Every selector element is duplicated so that both 32-bit halves of
+; a 64-bit lane see the same selector.
+define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+                                      <8 x i32> %mask) nounwind alwaysinline {
+  %sel_f = bitcast <8 x i32> %mask to <8 x float>
+  %cur = load <8 x i64>* %ptr, align 8
+
+  ; 64-bit lanes 0 and 1
+  %sel01 = shufflevector <8 x float> %sel_f, <8 x float> undef,
+                         <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %cur01 = shufflevector <8 x i64> %cur, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %cur01_f = bitcast <2 x i64> %cur01 to <4 x float>
+  %repl01 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %repl01_f = bitcast <2 x i64> %repl01 to <4 x float>
+  %out01_f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %cur01_f,
+                                                       <4 x float> %repl01_f,
+                                                       <4 x float> %sel01)
+  %out01 = bitcast <4 x float> %out01_f to <2 x i64>
+
+  ; 64-bit lanes 2 and 3
+  %sel23 = shufflevector <8 x float> %sel_f, <8 x float> undef,
+                         <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  %cur23 = shufflevector <8 x i64> %cur, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %cur23_f = bitcast <2 x i64> %cur23 to <4 x float>
+  %repl23 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %repl23_f = bitcast <2 x i64> %repl23 to <4 x float>
+  %out23_f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %cur23_f,
+                                                       <4 x float> %repl23_f,
+                                                       <4 x float> %sel23)
+  %out23 = bitcast <4 x float> %out23_f to <2 x i64>
+
+  ; 64-bit lanes 4 and 5
+  %sel45 = shufflevector <8 x float> %sel_f, <8 x float> undef,
+                         <4 x i32> <i32 4, i32 4, i32 5, i32 5>
+  %cur45 = shufflevector <8 x i64> %cur, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+  %cur45_f = bitcast <2 x i64> %cur45 to <4 x float>
+  %repl45 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+  %repl45_f = bitcast <2 x i64> %repl45 to <4 x float>
+  %out45_f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %cur45_f,
+                                                       <4 x float> %repl45_f,
+                                                       <4 x float> %sel45)
+  %out45 = bitcast <4 x float> %out45_f to <2 x i64>
+
+  ; 64-bit lanes 6 and 7
+  %sel67 = shufflevector <8 x float> %sel_f, <8 x float> undef,
+                         <4 x i32> <i32 6, i32 6, i32 7, i32 7>
+  %cur67 = shufflevector <8 x i64> %cur, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+  %cur67_f = bitcast <2 x i64> %cur67 to <4 x float>
+  %repl67 = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+  %repl67_f = bitcast <2 x i64> %repl67 to <4 x float>
+  %out67_f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %cur67_f,
+                                                       <4 x float> %repl67_f,
+                                                       <4 x float> %sel67)
+  %out67 = bitcast <4 x float> %out67_f to <2 x i64>
+
+  ; glue the four 2-wide pieces back into one 8-wide value and write it out
+  %out0123 = shufflevector <2 x i64> %out01, <2 x i64> %out23,
+       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %out4567 = shufflevector <2 x i64> %out45, <2 x i64> %out67,
+       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %out_all = shufflevector <4 x i64> %out0123, <4 x i64> %out4567,
+       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %out_all, <8 x i64> * %ptr, align 8
+  ret void
+}
+
+; Presumably provides the float and double masked store entry points on top of
+; the integer versions -- TODO confirm against the macro definition, which is
+; outside this chunk.
+masked_store_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+; Square root of an 8-wide double vector.  The helper macro below (defined
+; outside this chunk) is given the 2-wide sqrt.pd intrinsic and is expected to
+; produce the 8-wide result %ret.
+define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
+  unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ret <8 x double> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision float min/max
+
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+; Per-lane minimum of two 8-wide double vectors.  The helper macro below
+; (defined outside this chunk) is given the 2-wide min.pd intrinsic and both
+; operands and is expected to produce the 8-wide result %ret.
+define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+  binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+  ret <8 x double> %ret
+}
+
+; Per-lane maximum of two 8-wide double vectors.  The helper macro below
+; (defined outside this chunk) is given the 2-wide max.pd intrinsic and both
+; operands and is expected to produce the 8-wide result %ret.
+define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+  binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+  ret <8 x double> %ret
+}
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -0,0 +1,505 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Define common 4-wide stuff: the vector width and the mask lane type are
+; set first, then the shared macro library is pulled in.
+define(`WIDTH',`4')
+define(`MASK',`i32')
+include(`util.m4')
+
+; Expand building blocks from the macro library; their bodies are defined
+; outside this file.
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+; Pull in the definitions kept in the common SSE4 file.
+include(`target-sse4-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+; Prototypes only: no bodies for these half <-> float conversions appear in
+; this section.  NOTE(review): presumably they are defined in another
+; module and resolved when the builtins are linked -- confirm.
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+; Reciprocal of each lane: the hardware estimate refined by one
+; Newton-Raphson step, r = est * (2 - x * est).
+define <4 x float> @__rcp_varying_float(<4 x float> %x) nounwind readonly alwaysinline {
+  ; initial approximation from rcpps
+  %est = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %x)
+  ; x * est
+  %x_est = fmul <4 x float> %x, %est
+  ; 2 - x * est
+  %corr = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %x_est
+  ; est * (2 - x * est)
+  %refined = fmul <4 x float> %est, %corr
+  ret <4 x float> %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+; Reciprocal square root of each lane: the hardware estimate refined by one
+; Newton-Raphson step, r = 0.5 * est * (3 - (v * est) * est).
+define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
+  ; initial approximation from rsqrtps
+  %est = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
+  ; (v * est) * est
+  %v_est = fmul <4 x float> %v, %est
+  %v_est_est = fmul <4 x float> %v_est, %est
+  ; 3 - (v * est) * est
+  %three_minus = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_est_est
+  ; est * (3 - ...), then scale by one half
+  %scaled = fmul <4 x float> %est, %three_minus
+  %refined = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %scaled
+  ret <4 x float> %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+; Square root of each float lane, forwarded directly to sqrtps.
+define <4 x float> @__sqrt_varying_float(<4 x float> %x) nounwind readonly alwaysinline {
+  %root = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %x)
+  ret <4 x float> %root
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+; SSE2 packed double-precision square root; each call handles two lanes.
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+; Square root of every lane of a 4-wide double vector.  The m4 helper on
+; the next line is given the result name, the element type, the 2-wide
+; intrinsic and the input %0; the value it binds to %ret is returned.
+define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
+  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ret <4 x double> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+; Round every float lane using the round-to-nearest mode of roundps.
+define <4 x float> @__round_varying_float(<4 x float> %x) nounwind readonly alwaysinline {
+  ; immediate 8 = 0b1000: rounding mode nearest (0b00) with precision
+  ; exceptions not signalled (0b1000)
+  %rounded = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %x, i32 8)
+  ret <4 x float> %rounded
+}
+
+; Round every float lane downward using roundps.
+define <4 x float> @__floor_varying_float(<4 x float> %x) nounwind readonly alwaysinline {
+  ; immediate 9 = 0b1001: rounding mode down (0b01) with precision
+  ; exceptions not signalled (0b1000)
+  %floored = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %x, i32 9)
+  ret <4 x float> %floored
+}
+
+; Round every float lane upward using roundps.
+define <4 x float> @__ceil_varying_float(<4 x float> %x) nounwind readonly alwaysinline {
+  ; immediate 10 = 0b1010: rounding mode up (0b10) with precision
+  ; exceptions not signalled (0b1000)
+  %ceiled = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %x, i32 10)
+  ret <4 x float> %ceiled
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+; SSE4.1 packed double rounding, two lanes per call; the i32 operand is the
+; rounding-control immediate.
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+
+; Round every double lane to nearest.  The m4 helper below emits the whole
+; body, including the return, from the input %0 and the immediate.
+define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  round2to4double(%0, 8)
+}
+
+; Round every double lane downward.  The m4 helper below emits the whole
+; body, including the return, from the input %0 and the immediate.
+define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  round2to4double(%0, 9)
+}
+
+; Round every double lane upward.  The m4 helper below emits the whole
+; body, including the return, from the input %0 and the immediate.
+define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  round2to4double(%0, 10)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+; Lane-wise float maximum, forwarded directly to maxps.
+define <4 x float> @__max_varying_float(<4 x float> %a, <4 x float> %b) nounwind readonly alwaysinline {
+  %larger = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %larger
+}
+
+; Lane-wise float minimum, forwarded directly to minps.
+define <4 x float> @__min_varying_float(<4 x float> %a, <4 x float> %b) nounwind readonly alwaysinline {
+  %smaller = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %smaller
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int32 min/max
+
+; Lane-wise signed 32-bit minimum via the SSE4.1 pminsd intrinsic, which is
+; not declared in this section.
+define <4 x i32> @__min_varying_int32(<4 x i32> %a, <4 x i32> %b) nounwind readonly alwaysinline {
+  %smaller = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %smaller
+}
+
+; Lane-wise signed 32-bit maximum via the SSE4.1 pmaxsd intrinsic, which is
+; not declared in this section.
+define <4 x i32> @__max_varying_int32(<4 x i32> %a, <4 x i32> %b) nounwind readonly alwaysinline {
+  %larger = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %larger
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unsigned int min/max
+
+; Lane-wise unsigned 32-bit minimum via the SSE4.1 pminud intrinsic, which
+; is not declared in this section.
+define <4 x i32> @__min_varying_uint32(<4 x i32> %a, <4 x i32> %b) nounwind readonly alwaysinline {
+  %smaller = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %smaller
+}
+
+; Lane-wise unsigned 32-bit maximum via the SSE4.1 pmaxud intrinsic, which
+; is not declared in this section.
+define <4 x i32> @__max_varying_uint32(<4 x i32> %a, <4 x i32> %b) nounwind readonly alwaysinline {
+  %larger = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %larger
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+; SSE2 packed double-precision max and min; each call handles two lanes.
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+; Lane-wise minimum of two 4-wide double vectors.  The m4 helper below is
+; given the result name, the element type, the 2-wide min intrinsic and the
+; two inputs; the value it binds to %ret is returned.
+define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+  ret <4 x double> %ret
+}
+
+; Lane-wise maximum of two 4-wide double vectors.  The m4 helper below is
+; given the result name, the element type, the 2-wide max intrinsic and the
+; two inputs; the value it binds to %ret is returned.
+define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+  ret <4 x double> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; svml stuff
+
+; External 4-wide float math routines; no bodies appear in this file.
+declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
+; The sincos routine takes a pointer as its first argument, which the
+; wrapper in this file passes as a destination.  A routine that accesses
+; memory through a pointer must not be declared readnone, otherwise the
+; optimizer may assume the pointed-to memory is untouched by the call.
+declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind
+declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+
+
+; Sine of each lane, forwarded to the 4-wide library routine.
+define <4 x float> @__svml_sin(<4 x float> %x) nounwind readnone alwaysinline {
+  %r = call <4 x float> @__svml_sinf4(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+; Cosine of each lane, forwarded to the 4-wide library routine.
+define <4 x float> @__svml_cos(<4 x float> %x) nounwind readnone alwaysinline {
+  %r = call <4 x float> @__svml_cosf4(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+; Sine and cosine of each lane in one call.  The value returned by the
+; library routine is stored through %1; the routine itself is handed %2 as
+; its pointer argument (presumably where it writes the cosine -- confirm).
+; This function stores to memory, so it is deliberately not marked
+; readnone: with that attribute the optimizer may treat a call as having
+; no effect on memory and remove it or its store.
+define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind alwaysinline {
+  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
+  store <4 x float> %s, <4 x float> * %1
+  ret void
+}
+
+; Tangent of each lane, forwarded to the 4-wide library routine.
+define <4 x float> @__svml_tan(<4 x float> %x) nounwind readnone alwaysinline {
+  %r = call <4 x float> @__svml_tanf4(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+; Arctangent of each lane, forwarded to the 4-wide library routine.
+define <4 x float> @__svml_atan(<4 x float> %x) nounwind readnone alwaysinline {
+  %r = call <4 x float> @__svml_atanf4(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+; Two-argument arctangent of each lane; both operands are passed to the
+; 4-wide library routine in the order received.
+define <4 x float> @__svml_atan2(<4 x float> %a, <4 x float> %b) nounwind readnone alwaysinline {
+  %r = call <4 x float> @__svml_atan2f4(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %r
+}
+
+; Exponential of each lane, forwarded to the 4-wide library routine.
+define <4 x float> @__svml_exp(<4 x float> %x) nounwind readnone alwaysinline {
+  %r = call <4 x float> @__svml_expf4(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+; Logarithm of each lane, forwarded to the 4-wide library routine.
+define <4 x float> @__svml_log(<4 x float> %x) nounwind readnone alwaysinline {
+  %r = call <4 x float> @__svml_logf4(<4 x float> %x)
+  ret <4 x float> %r
+}
+
+; Power function of each lane; both operands are passed to the 4-wide
+; library routine in the order received.
+define <4 x float> @__svml_pow(<4 x float> %a, <4 x float> %b) nounwind readnone alwaysinline {
+  %r = call <4 x float> @__svml_powf4(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+; Collapse the 4-lane mask into the movmskps bit pattern, widened to i64.
+define i64 @__movmsk(<4 x i32> %lanes) nounwind readnone alwaysinline {
+  ; movmskps takes float lanes, so reinterpret the integer mask first
+  %asfloat = bitcast <4 x i32> %lanes to <4 x float>
+  %bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %asfloat) nounwind readnone
+  %wide = zext i32 %bits to i64
+  ret i64 %wide
+}
+
+; True when the movmskps bit pattern of the mask is non-zero.
+define i1 @__any(<4 x i32> %lanes) nounwind readnone alwaysinline {
+  %asfloat = bitcast <4 x i32> %lanes to <4 x float>
+  %bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %asfloat) nounwind readnone
+  %nonzero = icmp ne i32 %bits, 0
+  ret i1 %nonzero
+}
+
+; True when the movmskps bit pattern of the mask equals 15, i.e. 0b1111.
+define i1 @__all(<4 x i32> %lanes) nounwind readnone alwaysinline {
+  %asfloat = bitcast <4 x i32> %lanes to <4 x float>
+  %bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %asfloat) nounwind readnone
+  %full = icmp eq i32 %bits, 15
+  ret i1 %full
+}
+
+; True when the movmskps bit pattern of the mask is zero.
+define i1 @__none(<4 x i32> %lanes) nounwind readnone alwaysinline {
+  %asfloat = bitcast <4 x i32> %lanes to <4 x float>
+  %bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %asfloat) nounwind readnone
+  %empty = icmp eq i32 %bits, 0
+  ret i1 %empty
+}
+
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+; Sum of the four float lanes: two rounds of haddps on the vector with
+; itself, then lane 0 of the result is returned.
+define float @__reduce_add_float(<4 x float> %x) nounwind readonly alwaysinline {
+  %pairs = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %x, <4 x float> %x)
+  %total = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %pairs, <4 x float> %pairs)
+  %sum = extractelement <4 x float> %total, i32 0
+  ret float %sum
+}
+
+; Minimum over the lanes of a 4-wide float vector.  The m4 helper below
+; emits the whole body, including the return; it is handed the element
+; type and the varying and uniform min routines.
+define float @__reduce_min_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__min_varying_float, @__min_uniform_float)
+}
+
+; Maximum over the lanes of a 4-wide float vector.  The m4 helper below
+; emits the whole body, including the return; it is handed the element
+; type and the varying and uniform max routines.
+define float @__reduce_max_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__max_varying_float, @__max_uniform_float)
+}
+
+; Sum of the four i32 lanes: lanes 2 and 3 are moved down onto lanes 0 and
+; 1, added vector-wise, and the two partial sums are added as scalars.
+define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
+  ; upper half moved into the lower two lanes; the other lanes are unused
+  %upper = shufflevector <4 x i32> %v, <4 x i32> undef,
+                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %partial = add <4 x i32> %upper, %v
+  %p0 = extractelement <4 x i32> %partial, i32 0
+  %p1 = extractelement <4 x i32> %partial, i32 1
+  %total = add i32 %p0, %p1
+  ret i32 %total
+}
+
+; Minimum over the lanes of a 4-wide signed i32 vector.  The m4 helper
+; below emits the whole body, including the return; it is handed the
+; element type and the varying and uniform min routines.
+define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+; Maximum over the lanes of a 4-wide signed i32 vector.  The m4 helper
+; below emits the whole body, including the return; it is handed the
+; element type and the varying and uniform max routines.
+define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+; Minimum over the lanes of a 4-wide i32 vector using the unsigned min
+; routines.  The m4 helper below emits the whole body, including the
+; return.
+define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+; Maximum over the lanes of a 4-wide i32 vector using the unsigned max
+; routines.  The m4 helper below emits the whole body, including the
+; return.
+define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+
+; Sum of the four double lanes: the lower and upper halves are added as
+; 2-wide vectors, then the two partial sums are added as scalars.
+define double @__reduce_add_double(<4 x double> %x) nounwind readnone {
+  %lower = shufflevector <4 x double> %x, <4 x double> undef,
+                         <2 x i32> <i32 0, i32 1>
+  %upper = shufflevector <4 x double> %x, <4 x double> undef,
+                         <2 x i32> <i32 2, i32 3>
+  %partial = fadd <2 x double> %lower, %upper
+  %p0 = extractelement <2 x double> %partial, i32 0
+  %p1 = extractelement <2 x double> %partial, i32 1
+  %total = fadd double %p0, %p1
+  ret double %total
+}
+
+; Minimum over the lanes of a 4-wide double vector.  The m4 helper below
+; emits the whole body, including the return; it is handed the element
+; type and the varying and uniform min routines.
+define double @__reduce_min_double(<4 x double>) nounwind readnone {
+  reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+; Maximum over the lanes of a 4-wide double vector.  The m4 helper below
+; emits the whole body, including the return; it is handed the element
+; type and the varying and uniform max routines.
+define double @__reduce_max_double(<4 x double>) nounwind readnone {
+  reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+; Sum of the four i64 lanes: the lower and upper halves are added as
+; 2-wide vectors, then the two partial sums are added as scalars.
+define i64 @__reduce_add_int64(<4 x i64> %x) nounwind readnone {
+  %lower = shufflevector <4 x i64> %x, <4 x i64> undef,
+                         <2 x i32> <i32 0, i32 1>
+  %upper = shufflevector <4 x i64> %x, <4 x i64> undef,
+                         <2 x i32> <i32 2, i32 3>
+  %partial = add <2 x i64> %lower, %upper
+  %p0 = extractelement <2 x i64> %partial, i32 0
+  %p1 = extractelement <2 x i64> %partial, i32 1
+  %total = add i64 %p0, %p1
+  ret i64 %total
+}
+
+; Minimum over the lanes of a 4-wide signed i64 vector.  The m4 helper
+; below emits the whole body, including the return; it is handed the
+; element type and the varying and uniform min routines.
+define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+; Maximum over the lanes of a 4-wide signed i64 vector.  The m4 helper
+; below emits the whole body, including the return; it is handed the
+; element type and the varying and uniform max routines.
+define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+; Minimum over the lanes of a 4-wide i64 vector using the unsigned min
+; routines.  The m4 helper below emits the whole body, including the
+; return.
+define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+; Maximum over the lanes of a 4-wide i64 vector using the unsigned max
+; routines.  The m4 helper below emits the whole body, including the
+; return.
+define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+; Expand an m4 helper defined outside this section; the argument 4 matches
+; the vector width of this target.  NOTE(review): judging by its name it
+; provides the equality-reduction routines -- confirm.
+reduce_equal(4)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
+                                             <4 x float>) nounwind readnone
+
+
+; Masked store implemented as load / blend / store: the vector currently in
+; memory is loaded, blended with the new value under the mask by blendvps,
+; and the blended vector is written back.
+define void @__masked_store_blend_i32(<4 x i32>* nocapture %ptr, <4 x i32> %new,
+                                      <4 x i32> %mask) nounwind alwaysinline {
+  ; blendvps works on float lanes, so every operand is reinterpreted
+  %maskF = bitcast <4 x i32> %mask to <4 x float>
+  %cur = load <4 x i32>* %ptr, align 4
+  %curF = bitcast <4 x i32> %cur to <4 x float>
+  %newF = bitcast <4 x i32> %new to <4 x float>
+  %mixedF = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %curF,
+                                                      <4 x float> %newF,
+                                                      <4 x float> %maskF)
+  %mixed = bitcast <4 x float> %mixedF to <4 x i32>
+  store <4 x i32> %mixed, <4 x i32>* %ptr, align 4
+  ret void
+}
+
+
+; Masked store of four 64-bit values as load / blend / store.  blendvps
+; works on 32-bit lanes, so the vector is handled as two halves of two
+; 64-bit elements each; every half is viewed as four 32-bit lanes and each
+; mask lane is duplicated to cover both 32-bit parts of its element.
+define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
+                                      <4 x i32> %i32mask) nounwind alwaysinline {
+  %cur = load <4 x i64>* %ptr, align 8
+  %maskF = bitcast <4 x i32> %i32mask to <4 x float>
+
+  ; ---- elements 0 and 1 ----
+  %curLo  = shufflevector <4 x i64> %cur, <4 x i64> undef,
+                          <2 x i32> <i32 0, i32 1>
+  %curLoF = bitcast <2 x i64> %curLo to <4 x float>
+  %newLo  = shufflevector <4 x i64> %new, <4 x i64> undef,
+                          <2 x i32> <i32 0, i32 1>
+  %newLoF = bitcast <2 x i64> %newLo to <4 x float>
+  ; mask lanes 0 and 1, each repeated twice
+  %maskLo = shufflevector <4 x float> %maskF, <4 x float> undef,
+                          <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %mixLoF = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %curLoF,
+                                                      <4 x float> %newLoF,
+                                                      <4 x float> %maskLo)
+  %mixLo = bitcast <4 x float> %mixLoF to <2 x i64>
+
+  ; ---- elements 2 and 3 ----
+  %curHi  = shufflevector <4 x i64> %cur, <4 x i64> undef,
+                          <2 x i32> <i32 2, i32 3>
+  %curHiF = bitcast <2 x i64> %curHi to <4 x float>
+  %newHi  = shufflevector <4 x i64> %new, <4 x i64> undef,
+                          <2 x i32> <i32 2, i32 3>
+  %newHiF = bitcast <2 x i64> %newHi to <4 x float>
+  ; mask lanes 2 and 3, each repeated twice
+  %maskHi = shufflevector <4 x float> %maskF, <4 x float> undef,
+                          <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  %mixHiF = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %curHiF,
+                                                      <4 x float> %newHiF,
+                                                      <4 x float> %maskHi)
+  %mixHi = bitcast <4 x float> %mixHiF to <2 x i64>
+
+  ; join the two halves back into one <4 x i64> and write it out
+  %joined = shufflevector <2 x i64> %mixLo, <2 x i64> %mixHi,
+                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i64> %joined, <4 x i64> * %ptr, align 8
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+; The remaining masked-store routines come from m4 helpers defined outside
+; this file.  NOTE(review): the first expansion presumably supplies the
+; 8- and 16-bit blend variants for 4-wide vectors -- confirm.
+masked_store_blend_8_16_by_4()
+
+; One expansion per integer element type.
+gen_masked_store(i8)
+gen_masked_store(i16)
+gen_masked_store(i32)
+gen_masked_store(i64)
+
+; NOTE(review): presumably the float and double entry points -- confirm.
+masked_store_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+
+; One expansion per element type.  The second argument equals the element
+; size in bytes in every case here.  NOTE(review): presumably it is used
+; as the size or alignment by the helper -- confirm against its definition.
+masked_load(i8,  1)
+masked_load(i16, 2)
+masked_load(i32, 4)
+masked_load(float, 4)
+masked_load(i64, 8)
+masked_load(double, 8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; define these with the macros from util.m4 (pulled in at the top of this
+; file); one expansion per element type
+
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
+
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
--- a/builtins/util.m4
+++ b/builtins/util.m4
--- a/cbackend.cpp
+++ b/cbackend.cpp
--- a/contrib/ispc.vim
+++ b/contrib/ispc.vim
@@ -0,0 +1,32 @@
+" Vim syntax file
+" Language:	ISPC
+" Maintainer:	Andreas Wendleder <andreas.wendleder@gmail.com>
+" Last Change:	2011 Aug 3
+
+" Quit when a syntax file was already loaded
+if exists("b:current_syntax")
+  finish
+endif
+
+" Read the C syntax to start with
+runtime! syntax/c.vim
+" Clear the flag again so that it can be set to "ispc" at the end of this
+" file (presumably the C syntax file sets it while loading).
+unlet b:current_syntax
+
+" New keywords, grouped by the highlight class each group is linked to below
+syn keyword	ispcStatement	cbreak ccontinue creturn launch print reference soa sync task
+syn keyword	ispcConditional	cif
+syn keyword	ispcRepeat	cdo cfor cwhile
+syn keyword	ispcBuiltin	programCount programIndex	
+syn keyword	ispcType	export uniform varying int8 int16 int32 int64
+
+" Default highlighting
+" HiLink is a temporary shorthand for 'hi def link'; it is removed again
+" once the links have been set up.
+command -nargs=+ HiLink hi def link <args>
+HiLink ispcStatement	Statement
+HiLink ispcConditional	Conditional
+HiLink ispcRepeat	Repeat
+HiLink ispcBuiltin	Statement
+HiLink ispcType		Type
+delcommand HiLink
+
+" Mark this buffer as using the ispc syntax
+let b:current_syntax = "ispc"
+
--- a/contrib/ispc.vim.README
+++ b/contrib/ispc.vim.README
@@ -0,0 +1,8 @@
+To install vim syntax highlighting for ispc files:
+
+1) Copy ispc.vim into ~/.vim/syntax/ispc.vim (create the directory if necessary)
+2) Create a filetype for ispc files to correspond to that syntax file.
+   To do this, create ~/.vim/ftdetect/ispc.vim if necessary and append the following line to it:
+
+au BufRead,BufNewFile *.ispc set filetype=ispc
+
--- a/ctx.cpp
+++ b/ctx.cpp
--- a/ctx.h
+++ b/ctx.h
@@ -0,0 +1,704 @@
+/*
+  Copyright (c) 2010-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file ctx.h
+    @brief Declaration of the FunctionEmitContext class
+*/
+
+#ifndef ISPC_CTX_H
+#define ISPC_CTX_H 1
+
+#include "ispc.h"
+#include <map>
+#include <llvm/InstrTypes.h>
+#include <llvm/Instructions.h>
+#if defined(LLVM_3_0) || defined(LLVM_3_1)
+  #include <llvm/Analysis/DebugInfo.h>
+  #include <llvm/Analysis/DIBuilder.h>
+#else
+  #include <llvm/DebugInfo.h>
+  #include <llvm/DIBuilder.h>
+#endif
+
+struct CFInfo;
+
+/** FunctionEmitContext is one of the key classes in ispc; it is used to
+    help with emitting the intermediate representation of a function during
+    compilation.  It carries information about the current program context during
+    IR emission (e.g. the basic block into which instructions should be
+    added; or, the current source file and line number, so debugging
+    symbols can be correctly generated).  This class also provides a number
+    of helper routines that are useful for code that emits IR.
+ */
+class FunctionEmitContext {
+public:
+    /** Create a new FunctionEmitContext.
+        @param function     The Function object representing the function
+        @param funSym       Symbol that corresponds to the function
+        @param llvmFunction LLVM function in the current module that corresponds
+                            to the function
+        @param firstStmtPos Source file position of the first statement in the
+                            function
+     */
+    FunctionEmitContext(Function *function, Symbol *funSym, 
+                        llvm::Function *llvmFunction,
+                        SourcePos firstStmtPos);
+    ~FunctionEmitContext();
+
+    /** Returns the Function * corresponding to the function that we're
+        currently generating code for. */
+    const Function *GetFunction() const;
+
+    /** @name Current basic block management
+        @{
+     */
+    /** Returns the current basic block pointer */ 
+    llvm::BasicBlock *GetCurrentBasicBlock();
+    
+    /** Set the given llvm::BasicBlock to be the basic block to emit
+        forthcoming instructions into. */
+    void SetCurrentBasicBlock(llvm::BasicBlock *bblock);
+    /** @} */
+
+    /** @name Mask management
+        @{
+     */
+    /** Returns the mask value at entry to the current function. */ 
+    llvm::Value *GetFunctionMask();
+
+    /** Returns the mask value corresponding to "varying" control flow
+        within the current function.  (i.e. this doesn't include the effect
+        of the mask at function entry.) */
+    llvm::Value *GetInternalMask();
+
+    /** Returns the complete current mask value--i.e. the logical AND of
+        the function entry mask and the internal mask. */ 
+    llvm::Value *GetFullMask();
+
+    /** Returns a pointer to storage in memory that stores the current full
+        mask. */
+    llvm::Value *GetFullMaskPointer();
+
+    /** Provides the value of the mask at function entry */
+    void SetFunctionMask(llvm::Value *val);
+
+    /** Sets the internal mask to a new value */
+    void SetInternalMask(llvm::Value *val);
+
+    /** Sets the internal mask to (oldMask & val) */
+    void SetInternalMaskAnd(llvm::Value *oldMask, llvm::Value *val);
+
+    /** Sets the internal mask to (oldMask & ~test) */
+    void SetInternalMaskAndNot(llvm::Value *oldMask, llvm::Value *test);
+
+    /** Emits a branch instruction to the basic block btrue if any of the
+        lanes of current mask are on and bfalse if none are on. */
+    void BranchIfMaskAny(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse);
+
+    /** Emits a branch instruction to the basic block btrue if all of the
+        lanes of current mask are on and bfalse otherwise. */
+    void BranchIfMaskAll(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse);
+
+    /** Emits a branch instruction to the basic block btrue if none of the
+        lanes of current mask are on and bfalse otherwise. */
+    void BranchIfMaskNone(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse);
+    /** @} */
+
+    /** @name Control flow management
+        @{
+    */
+    /** Notifies the FunctionEmitContext that we're starting emission of an
+        'if' statement with a uniform test.  */
+    void StartUniformIf();
+
+    /** Notifies the FunctionEmitContext that we're starting emission of an
+        'if' statement with a varying test.  The value of the mask going
+        into the 'if' statement is provided in the oldMask parameter. */
+    void StartVaryingIf(llvm::Value *oldMask);
+
+    /** Notifies the FunctionEmitContext that we're done emitting the IR
+        for an 'if' statement. */
+    void EndIf();
+
+    /** Notifies the FunctionEmitContext that we're starting to emit IR
+        for a loop.  Basic blocks are provided for where 'break' and
+        'continue' statements should jump to (if all running lanes want to
+        break or continue), uniformControlFlow indicates whether the loop
+        condition is 'uniform'. */
+    void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget, 
+                   bool uniformControlFlow);
+
+    /** Informs FunctionEmitContext of the value of the mask at the start
+        of a loop body or switch statement. */
+    void SetBlockEntryMask(llvm::Value *mask);
+
+    /** Informs FunctionEmitContext that code generation for a loop is
+        finished. */
+    void EndLoop();
+
+    /** Indicates that code generation for a 'foreach', 'foreach_tiled',
+        'foreach_active', or 'foreach_unique' loop is about to start. */
+    enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE };
+    void StartForeach(ForeachType ft);
+    void EndForeach();
+
+    /** Emit code for a 'break' statement in a loop.  If doCoherenceCheck
+        is true, then if we're in a 'varying' loop, code will be emitted to
+        see if all of the lanes want to break, in which case a jump to the
+        break target will be taken.  (For 'uniform' loops, the jump is
+        always done). */
+    void Break(bool doCoherenceCheck);
+
+    /** Emit code for a 'continue' statement in a loop.  If
+        doCoherenceCheck is true, then if we're in a 'varying' loop, code
+        will be emitted to see if all of the lanes want to continue, in
+        which case a jump to the continue target will be taken.  (For
+        'uniform' loops, the jump is always done). */
+    void Continue(bool doCoherenceCheck);
+
+    /** This method is called by code emitting IR for a loop at the end of
+        the loop body; it restores the lanes of the mask that executed a
+        'continue' statement when going through the loop body in the
+        previous iteration. */
+    void RestoreContinuedLanes();
+
+    /** Indicates that code generation for a "switch" statement is about to
+        start.  isUniform indicates whether the "switch" value is uniform,
+        and bbAfterSwitch gives the basic block immediately following the
+        "switch" statement.  (For example, if the switch condition is
+        uniform, we jump here upon executing a "break" statement.) */
+    void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
+    /** Indicates the end of code generation for a "switch" statement. */
+    void EndSwitch();
+
+    /** Emits code for a "switch" statement in the program.
+        @param expr         Gives the value of the expression after the "switch"
+        @param defaultBlock Basic block to execute for the "default" case.  This
+                            should be NULL if there is no "default" label inside
+                            the switch.
+        @param caseBlocks   vector that stores the mapping from label values
+                            after "case" statements to basic blocks corresponding
+                            to the "case" labels.
+        @param nextBlocks   For each basic block for a "case" or "default" 
+                            label, this gives the basic block for the 
+                            immediately-following "case" or "default" label (or
+                            the basic block after the "switch" statement for the
+                            last label.)
+    */
+    void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
+                    const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
+                    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);
+
+    /** Generates code for a "default" label after a "switch" statement.
+        The checkMask parameter indicates whether additional code should be
+        generated to check to see if the execution mask is all off after
+        the default label (in which case a jump to the following label will
+        be issued.) */
+    void EmitDefaultLabel(bool checkMask, SourcePos pos);
+
+    /** Generates code for a "case" label after a "switch" statement.  See
+        the documentation for EmitDefaultLabel() for discussion of the
+        checkMask parameter. */
+    void EmitCaseLabel(int value, bool checkMask, SourcePos pos);
+
+    /** Returns the current number of nested levels of 'varying' control
+        flow */
+    int VaryingCFDepth() const;
+
+    bool InForeachLoop() const;
+
+    /** Temporarily disables emission of performance warnings from gathers
+        and scatters from subsequent code. */
+    void DisableGatherScatterWarnings();
+
+    /** Reenables emission of gather/scatter performance warnings. */
+    void EnableGatherScatterWarnings();
+
+    /** Records bb as the continue target, i.e. the basic block to jump to
+        if all of the running lanes have executed a 'continue' statement. */
+    void SetContinueTarget(llvm::BasicBlock *bb) {
+        continueTarget = bb;
+    }
+
+    /** Step through the code and find label statements; create a basic
+        block for each one, so that subsequent calls to
+        GetLabeledBasicBlock() return the corresponding basic block. */
+    void InitializeLabelMap(Stmt *code);
+
+    /** If there is a label in the function with the given name, return the
+        new basic block that it starts. */
+    llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
+
+    /** Returns a vector of all labels in the context. This is
+        simply the key set of the labelMap */
+    std::vector<std::string> GetLabels();
+
+    /** Called to generate code for 'return' statement; value is the
+        expression in the return statement (if non-NULL), and
+        doCoherenceCheck indicates whether instructions should be generated
+        to see if all of the currently-running lanes have returned (if
+        we're under varying control flow).  */
+    void CurrentLanesReturned(Expr *value, bool doCoherenceCheck);
+    /** @} */
+
+    /** @name Small helper/utility routines
+        @{ 
+    */
+    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
+        i1 value that indicates if any of the mask lanes are on. */
+    llvm::Value *Any(llvm::Value *mask);
+
+    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
+        i1 value that indicates if all of the mask lanes are on. */
+    llvm::Value *All(llvm::Value *mask);
+
+    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
+        i1 value that indicates if all of the mask lanes are off. */
+    llvm::Value *None(llvm::Value *mask);
+
+    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
+        i64 value wherein the i'th bit is on if and only if the i'th lane
+        of the mask is on. */
+    llvm::Value *LaneMask(llvm::Value *mask);
+
+    /** Given two masks of type LLVMTypes::MaskType, return an i1 value
+        that indicates whether the two masks are equal. */
+    llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
+
+    /** Given a string, create an anonymous global variable to hold its
+        value and return the pointer to the string. */
+    llvm::Value *GetStringPtr(const std::string &str);
+
+    /** Create a new basic block with given name */
+    llvm::BasicBlock *CreateBasicBlock(const char *name);
+
+    /** Given a vector with element type i1, return a vector of type
+        LLVMTypes::BoolVectorType.  This method handles the conversion for
+        the targets where the bool vector element type is, for example,
+        i32. */
+    llvm::Value *I1VecToBoolVec(llvm::Value *b);
+
+    /** If the user has asked to compile the program with instrumentation,
+        this inserts a callback to the user-supplied instrumentation
+        function at the current point in the code. */
+    void AddInstrumentationPoint(const char *note);
+    /** @} */
+
+    /** @name Debugging support
+        @{
+    */
+    /** Set the current source file position; subsequent emitted
+        instructions will have this position associated with them if
+        debugging information is being generated. */
+    void SetDebugPos(SourcePos pos);
+
+    SourcePos GetDebugPos() const;
+
+    /** Adds debugging metadata to the given instruction.  If pos == NULL,
+        use FunctionEmitContext::currentPos as the source file position for
+        the instruction.  Similarly, if a DIScope is provided, it's used
+        and otherwise the scope is found from a GetDIScope() call.  This
+        takes a llvm::Value for the instruction rather than an
+        llvm::Instruction for convenience; in calling code we often have
+        Instructions stored using Value pointers; the code here returns
+        silently if it's not actually given an instruction. */
+    void AddDebugPos(llvm::Value *instruction, const SourcePos *pos = NULL, 
+                     llvm::DIScope *scope = NULL);
+
+    /** Inform the debugging information generation code that a new scope
+        is starting in the source program. */
+    void StartScope();
+
+    /** Inform the debugging information generation code that the current
+        scope is ending in the source program. */
+    void EndScope();
+
+    /** Returns the llvm::DIScope corresponding to the current program
+        scope. */
+    llvm::DIScope GetDIScope() const;
+
+    /** Emits debugging information for the variable represented by
+        sym.  */
+    void EmitVariableDebugInfo(Symbol *sym);
+
+    /** Emits debugging information for the function parameter represented
+        by sym.  */
+    void EmitFunctionParameterDebugInfo(Symbol *sym, int parameterNum);
+    /** @} */
+
+    /** @name IR instruction emission
+        @brief These methods generally closely correspond to LLVM IR
+        instructions.  See the LLVM assembly language reference manual
+        (http://llvm.org/docs/LangRef.html) and the LLVM doxygen documentation
+        (http://llvm.org/doxygen) for more information.  Here we will only
+        document significant generalizations to the functionality of the 
+        corresponding basic LLVM instructions.
+
+        Beyond actually emitting the instruction, the implementations of
+        these methods in FunctionEmitContext also handle adding debugging
+        metadata if debugging symbols are enabled, adding the instructions
+        to the current basic block, and handling generalizations like
+        'varying' lvalues, arithmetic operations with VectorType operands,
+        etc.
+        @{
+    */
+    /** Emit the binary operator given by the inst parameter.  If
+        llvm::Values corresponding to VectorTypes are given as operands,
+        this also handles applying the given operation to the vector
+        elements. */
+    llvm::Value *BinaryOperator(llvm::Instruction::BinaryOps inst,
+                                llvm::Value *v0, llvm::Value *v1, 
+                                const char *name = NULL);
+
+    /** Emit the "not" operator.  Like BinaryOperator(), this also handles
+        a VectorType-based operand. */
+    llvm::Value *NotOperator(llvm::Value *v, const char *name = NULL);
+
+    /** Emit a comparison instruction.  If the operands are VectorTypes,
+        then a value for the corresponding boolean VectorType is
+        returned. */
+    llvm::Value *CmpInst(llvm::Instruction::OtherOps inst, 
+                         llvm::CmpInst::Predicate pred,
+                         llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
+
+    /** Given a scalar value, return a vector of the same type (or an
+        array, for pointer types). */
+    llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
+
+    llvm::Value *BitCastInst(llvm::Value *value, llvm::Type *type,
+                             const char *name = NULL);
+    llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
+    llvm::Value *PtrToIntInst(llvm::Value *value, llvm::Type *type,
+                              const char *name = NULL);
+    llvm::Value *IntToPtrInst(llvm::Value *value, llvm::Type *type,
+                              const char *name = NULL);
+
+    llvm::Instruction *TruncInst(llvm::Value *value, llvm::Type *type,
+                                 const char *name = NULL);
+    llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
+                                llvm::Type *type, const char *name = NULL);
+    llvm::Instruction *FPCastInst(llvm::Value *value, llvm::Type *type, 
+                                  const char *name = NULL);
+    llvm::Instruction *SExtInst(llvm::Value *value, llvm::Type *type, 
+                                const char *name = NULL);
+    llvm::Instruction *ZExtInst(llvm::Value *value, llvm::Type *type, 
+                                const char *name = NULL);
+
+    /** Given two integer-typed values (but possibly one vector and the
+        other not, and/or of possibly-different bit-widths), update their
+        values as needed so that the two have the same (more general)
+        type. */ 
+    void MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1);
+
+    /** Create a new slice pointer out of the given pointer to an soa type
+        and an integer offset to a slice within that type. */
+    llvm::Value *MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset);
+
+    /** These GEP methods are generalizations of the standard ones in LLVM;
+        they support both uniform and varying basePtr values as well as
+        uniform and varying index values (arrays of indices).  Varying base
+        pointers are expected to come in as vectors of i32/i64 (depending
+        on the target), since LLVM doesn't currently support vectors of
+        pointers.  The underlying type of the base pointer must be provided
+        via the ptrType parameter */
+    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
+                                   const Type *ptrType, const char *name = NULL);
+    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
+                                   llvm::Value *index1, const Type *ptrType,
+                                   const char *name = NULL);
+
+    /** This method returns a new pointer that represents offsetting the
+        given base pointer to point at the given element number of the
+        structure type that the base pointer points to.  (The provided
+        pointer must be a pointer to a structure type.  The ptrType gives
+        the type of the pointer, though it may be NULL if the base pointer
+        is uniform.) */
+    llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
+                                  const Type *ptrType, const char *name = NULL,
+                                  const PointerType **resultPtrType = NULL);
+
+    /** Load from the memory location(s) given by lvalue, using the given
+        mask.  The lvalue may be varying, in which case this corresponds to
+        a gather from the multiple memory locations given by the array of
+        pointer values given by the lvalue.  If the lvalue is not varying,
+        then both the mask pointer and the type pointer may be NULL. */
+    llvm::Value *LoadInst(llvm::Value *ptr, llvm::Value *mask,
+                          const Type *ptrType, const char *name = NULL);
+
+    llvm::Value *LoadInst(llvm::Value *ptr, const char *name = NULL);
+
+    /** Emits an alloca instruction to allocate stack storage for the given
+        type.  If a non-zero alignment is specified, the object is also
+        allocated at the given alignment.  By default, the alloca
+        instruction is added at the start of the function in the entry
+        basic block; if it should be added to the current basic block, then
+        the atEntryBlock parameter should be false. */ 
+    llvm::Value *AllocaInst(llvm::Type *llvmType, 
+                            const char *name = NULL, int align = 0, 
+                            bool atEntryBlock = true);
+
+    /** Standard store instruction; for this variant, the lvalue must be a
+        single pointer, not a varying lvalue. */
+    void StoreInst(llvm::Value *value, llvm::Value *ptr);
+
+    /** In this variant of StoreInst(), the lvalue may be varying.  If so,
+        this corresponds to a scatter.  Whether the lvalue is uniform or
+        varying, the given storeMask is used to mask the stores so that
+        they only execute for the active program instances. */
+    void StoreInst(llvm::Value *value, llvm::Value *ptr,
+                   llvm::Value *storeMask, const Type *valueType,
+                   const Type *ptrType);
+
+    /** Copy count bytes of memory from the location pointed to by src to
+        the location pointed to by dest.  (src and dest must not be
+        overlapping.) */ 
+    void MemcpyInst(llvm::Value *dest, llvm::Value *src, llvm::Value *count,
+                    llvm::Value *align = NULL);
+
+    void BranchInst(llvm::BasicBlock *block);
+    void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
+                    llvm::Value *test);
+
+    /** This convenience method maps to an llvm::ExtractElementInst if the
+        given value is a llvm::VectorType, and to an llvm::ExtractValueInst
+        otherwise. */
+    llvm::Value *ExtractInst(llvm::Value *v, int elt, const char *name = NULL);
+
+    /** This convenience method maps to an llvm::InsertElementInst if the
+        given value is a llvm::VectorType, and to an llvm::InsertValueInst
+        otherwise. */
+    llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, 
+                            const char *name = NULL);
+
+    llvm::PHINode *PhiNode(llvm::Type *type, int count, 
+                           const char *name = NULL);
+    llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
+                                  llvm::Value *val1, const char *name = NULL);
+
+    /** Emits IR to do a function call with the given arguments.  If the
+        function type is a varying function pointer type, its full type
+        must be provided in funcType.  funcType can be NULL if func is a
+        uniform function pointer. */
+    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
+                          const std::vector<llvm::Value *> &args,
+                          const char *name = NULL);
+
+    /** This is a convenience method that issues a call instruction to a
+        function that takes just a single argument. */
+    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
+                          llvm::Value *arg, const char *name = NULL);
+
+    /** This is a convenience method that issues a call instruction to a
+        function that takes two arguments. */
+    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
+                          llvm::Value *arg0, llvm::Value *arg1,
+                          const char *name = NULL);
+
+    /** Launch an asynchronous task to run the given function, passing it
+        the given argument values. */
+    llvm::Value *LaunchInst(llvm::Value *callee, 
+                            std::vector<llvm::Value *> &argVals,
+                            llvm::Value *launchCount);
+
+    void SyncInst();
+
+    llvm::Instruction *ReturnInst();
+    /** @} */
+
+private:
+    /** Pointer to the Function for which we're currently generating code. */
+    Function *function;
+
+    /** LLVM function representation for the current function. */
+    llvm::Function *llvmFunction;
+
+    /** The basic block into which we add any alloca instructions that need
+        to go at the very start of the function. */
+    llvm::BasicBlock *allocaBlock;
+
+    /** The current basic block into which we're emitting new
+        instructions */
+    llvm::BasicBlock *bblock;
+
+    /** Pointer to stack-allocated memory that stores the current value of
+        the full program mask. */
+    llvm::Value *fullMaskPointer;
+
+    /** Pointer to stack-allocated memory that stores the current value of
+        the program mask representing varying control flow within the
+        function. */
+    llvm::Value *internalMaskPointer;
+
+    /** Value of the program mask when the function starts execution.  */
+    llvm::Value *functionMaskValue;
+
+    /** Current source file position; if debugging information is being
+        generated, this position is used to set file/line information for
+        instructions. */
+    SourcePos currentPos;
+
+    /** Source file position where the function definition started.  Used
+        for error messages and debugging symbols. */
+    SourcePos funcStartPos;
+
+    /** If currently in a loop body or switch statement, the value of the
+        mask at the start of it. */
+    llvm::Value *blockEntryMask;
+
+    /** If currently in a loop body or switch statement, this is a pointer
+        to memory to store a mask value that represents which of the lanes
+        have executed a 'break' statement.  If we're not in a loop body or
+        switch, this should be NULL. */
+    llvm::Value *breakLanesPtr;
+
+    /** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
+        to memory to record which of the program instances have executed a
+        'continue' statement. */
+    llvm::Value *continueLanesPtr;
+
+    /** If we're inside a loop or switch statement, this gives the basic
+        block immediately after the current loop or switch, which we will
+        jump to if all of the lanes have executed a break statement or are
+        otherwise done with it. */
+    llvm::BasicBlock *breakTarget;
+
+    /** If we're inside a loop, this gives the block to jump to if all of
+        the running lanes have executed a 'continue' statement. */
+    llvm::BasicBlock *continueTarget;
+
+    /** @name Switch statement state
+
+        These variables store various state that's active when we're
+        generating code for a switch statement.  They should all be NULL
+        outside of a switch.
+        @{
+    */
+
+    /** The value of the expression used to determine which case in the
+        statements after the switch to execute. */
+    llvm::Value *switchExpr;
+
+    /** Map from case label numbers to the basic block that will hold code
+        for that case. */
+    const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;
+
+    /** The basic block of code to run for the "default" label in the
+        switch statement. */
+    llvm::BasicBlock *defaultBlock;
+
+    /** For each basic block for the code for cases (and the default label,
+        if present), this map gives the basic block for the immediately
+        following case/default label. */
+    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;
+
+    /** Records whether the switch condition was uniform; this is a
+        distinct notion from whether the switch represents uniform or
+        varying control flow; we may have varying control flow from a
+        uniform switch condition if there is a 'break' inside the switch
+        that's under varying control flow. */
+    bool switchConditionWasUniform;
+    /** @} */
+
+    /** A pointer to memory that records which of the program instances
+        have executed a 'return' statement (and are thus really truly done
+        running any more instructions in this function). */
+    llvm::Value *returnedLanesPtr;
+
+    /** A pointer to memory to store the return value for the function.
+        Since different program instances may execute 'return' statements
+        at different times, we need to accumulate the return values as they
+        come in until we return for real. */
+    llvm::Value *returnValuePtr;
+
+    /** The CFInfo structure records information about a nesting level of
+        control flow.  This vector lets us see what control flow is going
+        around outside the current position in the function being
+        emitted. */
+    std::vector<CFInfo *> controlFlowInfo;
+
+    /** DIFile object corresponding to the source file where the current
+        function was defined (used for debugging info). */
+    llvm::DIFile diFile;
+
+    /** DISubprogram corresponding to this function (used for debugging
+        info). */
+    llvm::DISubprogram diSubprogram;
+
+    /** These correspond to the current set of nested scopes in the
+        function. */
+    std::vector<llvm::DILexicalBlock> debugScopes;
+
+    /** True if a 'launch' statement has been encountered in the function. */
+    bool launchedTasks;
+
+    /** This is a pointer to a void * that is passed to the ISPCLaunch(),
+        ISPCAlloc(), and ISPCSync() routines as a handle to the group of
+        tasks launched from the current function. */
+    llvm::Value *launchGroupHandlePtr;
+
+    /** Nesting count of the number of times calling code has disabled (and
+        not yet reenabled) gather/scatter performance warnings. */
+    int disableGSWarningCount;
+
+    std::map<std::string, llvm::BasicBlock *> labelMap;
+
+    static bool initLabelBBlocks(ASTNode *node, void *data);
+
+    llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
+    static void addGSMetadata(llvm::Value *inst, SourcePos pos);
+    bool ifsInCFAllUniform(int cfType) const;
+    void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
+    llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
+
+    llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index, 
+                                 const Type *ptrType);
+
+    void restoreMaskGivenReturns(llvm::Value *oldMask);
+    void addSwitchMaskCheck(llvm::Value *mask);
+    bool inSwitchStatement() const;
+    llvm::Value *getMaskAtSwitchEntry();
+
+    CFInfo *popCFState();
+
+    void scatter(llvm::Value *value, llvm::Value *ptr, const Type *valueType,
+                 const Type *ptrType, llvm::Value *mask);
+    void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
+                     llvm::Value *mask);
+    void storeUniformToSOA(llvm::Value *value, llvm::Value *ptr, 
+                           llvm::Value *mask, const Type *valueType,
+                           const PointerType *ptrType);
+    llvm::Value *loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask,
+                                    const PointerType *ptrType, const char *name);
+
+    llvm::Value *gather(llvm::Value *ptr, const PointerType *ptrType,
+                        llvm::Value *mask, const char *name);
+
+    llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
+};
+
+#endif // ISPC_CTX_H
--- a/decl.cpp
+++ b/decl.cpp
@@ -0,0 +1,708 @@
+/*
+  Copyright (c) 2010-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file decl.cpp
+    @brief Implementations of classes related to turning declarations into 
+           symbol names and types.
+*/
+
+#include "decl.h"
+#include "util.h"
+#include "module.h"
+#include "sym.h"
+#include "type.h"
+#include "stmt.h"
+#include "expr.h"
+#include <stdio.h>
+#include <string.h>
+#include <set>
+
+/** Prints to stdout the name of every qualifier whose bit is set in
+    typeQualifiers.  Each name is followed by one space, and the names are
+    always emitted in the order given by the table below.
+*/
+static void
+lPrintTypeQualifiers(int typeQualifiers) {
+    static const struct {
+        int bit;
+        const char *label;
+    } qualifierNames[] = {
+        { TYPEQUAL_INLINE,   "inline "   },
+        { TYPEQUAL_CONST,    "const "    },
+        { TYPEQUAL_UNIFORM,  "uniform "  },
+        { TYPEQUAL_VARYING,  "varying "  },
+        { TYPEQUAL_TASK,     "task "     },
+        { TYPEQUAL_SIGNED,   "signed "   },
+        { TYPEQUAL_UNSIGNED, "unsigned " },
+        { TYPEQUAL_EXPORT,   "export "   },
+        { TYPEQUAL_UNMASKED, "unmasked " }
+    };
+    const size_t count = sizeof(qualifierNames) / sizeof(qualifierNames[0]);
+
+    for (size_t i = 0; i < count; ++i) {
+        if ((typeQualifiers & qualifierNames[i].bit) != 0)
+            printf("%s", qualifierNames[i].label);
+    }
+}
+
+
+/** Returns the type that results from applying the qualifiers encoded in
+    typeQualifiers to the given type.  A NULL type is passed through as
+    NULL.  Illegal combinations are reported with Error() at pos; the type
+    built up to that point is still returned in that case.
+*/
+static const Type *
+lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
+    if (type == NULL)
+        return NULL;
+
+    const bool wantConst    = (typeQualifiers & TYPEQUAL_CONST) != 0;
+    const bool wantUniform  = (typeQualifiers & TYPEQUAL_UNIFORM) != 0;
+    const bool wantVarying  = (typeQualifiers & TYPEQUAL_VARYING) != 0;
+    const bool wantUnsigned = (typeQualifiers & TYPEQUAL_UNSIGNED) != 0;
+    const bool wantSigned   = (typeQualifiers & TYPEQUAL_SIGNED) != 0;
+
+    if (wantConst)
+        type = type->GetAsConstType();
+
+    // "void" cannot take an explicit variability qualifier, and it is
+    // left untouched when no variability qualifier was given at all.
+    const bool isVoid = Type::Equal(type, AtomicType::Void);
+    if (wantUniform) {
+        if (isVoid)
+            Error(pos, "\"uniform\" qualifier is illegal with \"void\" type.");
+        else
+            type = type->GetAsUniformType();
+    }
+    else if (wantVarying) {
+        if (isVoid)
+            Error(pos, "\"varying\" qualifier is illegal with \"void\" type.");
+        else
+            type = type->GetAsVaryingType();
+    }
+    else if (!isVoid)
+        type = type->GetAsUnboundVariabilityType();
+
+    if (wantUnsigned) {
+        if (wantSigned)
+            Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
+                  "qualifiers.");
+
+        const Type *asUnsigned = type->GetAsUnsignedType();
+        if (asUnsigned == NULL) {
+            // Resolve any unbound variability before printing the type
+            // name in the error message.
+            const Type *printable =
+                type->ResolveUnboundVariability(Variability::Varying);
+            Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
+                  printable->GetString().c_str());
+        }
+        else
+            type = asUnsigned;
+    }
+
+    if (wantSigned && type->IsIntType() == false) {
+        const Type *printable =
+            type->ResolveUnboundVariability(Variability::Varying);
+        Error(pos, "\"signed\" qualifier is illegal with non-integer type "
+              "\"%s\".", printable->GetString().c_str());
+    }
+
+    return type;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// DeclSpecs
+
+/** Creates a DeclSpecs holding the given base type, storage class, and
+    type qualifier bits.  The soa width and vector size both start at
+    zero.
+*/
+DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
+    // Values supplied by the caller
+    storageClass = sc;
+    typeQualifiers = tq;
+    baseType = t;
+
+    // Zero for both of these means "not specified"; GetBaseType() only
+    // acts on them when they are positive.
+    vectorSize = 0;
+    soaWidth = 0;
+}
+
+
+/** Computes the base type described by these declaration specifiers.
+    Starting from baseType (or int32 with unbound variability, with a
+    warning, if baseType is NULL), this wraps the type in a VectorType if
+    vectorSize is positive, applies the type qualifiers, and finally
+    converts to an SOA type if soaWidth is positive.
+
+    @param pos  Source position used for warnings and errors.
+    @return     The resulting type, or NULL if an error was reported.
+*/
+const Type *
+DeclSpecs::GetBaseType(SourcePos pos) const {
+    const Type *result = baseType;
+    if (result == NULL) {
+        Warning(pos, "No type specified in declaration.  Assuming int32.");
+        result = AtomicType::UniformInt32->GetAsUnboundVariabilityType();
+    }
+
+    if (vectorSize > 0) {
+        const AtomicType *elementType = CastType<AtomicType>(result);
+        if (elementType == NULL) {
+            Error(pos, "Only atomic types (int, float, ...) are legal for vector "
+                  "types.");
+            return NULL;
+        }
+        result = new VectorType(elementType, vectorSize);
+    }
+
+    result = lApplyTypeQualifiers(typeQualifiers, result, pos);
+
+    // Nothing more to do unless an soa<> width was given.
+    if (soaWidth <= 0)
+        return result;
+
+    const StructType *structType = CastType<StructType>(result);
+    if (structType == NULL) {
+        Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
+              "type \"%s\".", soaWidth, result->GetString().c_str());
+        return NULL;
+    }
+
+    // soaWidth is known to be positive here, so only the power-of-two
+    // test remains.
+    if ((soaWidth & (soaWidth - 1)) != 0) {
+        Error(pos, "soa<%d> width illegal.  Value must be positive power "
+              "of two.", soaWidth);
+        return NULL;
+    }
+
+    if (structType->IsUniformType()) {
+        Error(pos, "\"uniform\" qualifier and \"soa<%d>\" qualifier can't "
+              "both be used in a type declaration.", soaWidth);
+        return NULL;
+    }
+    if (structType->IsVaryingType()) {
+        Error(pos, "\"varying\" qualifier and \"soa<%d>\" qualifier can't "
+              "both be used in a type declaration.", soaWidth);
+        return NULL;
+    }
+
+    result = structType->GetAsSOAType(soaWidth);
+
+    if (soaWidth < g->target.vectorWidth)
+        PerformanceWarning(pos, "soa<%d> width smaller than gang size %d "
+                           "currently leads to inefficient code to access "
+                           "soa types.", soaWidth, g->target.vectorWidth);
+
+    return result;
+}
+
+
+/** Returns the source-level spelling of the given storage class; the
+    empty string is returned for SC_NONE.  An unrecognized value is
+    reported through FATAL().
+*/
+static const char *
+lGetStorageClassName(StorageClass storageClass) {
+    const char *spelling = "";
+
+    switch (storageClass) {
+    case SC_NONE:
+        break;
+    case SC_EXTERN:
+        spelling = "extern";
+        break;
+    case SC_EXTERN_C:
+        spelling = "extern \"C\"";
+        break;
+    case SC_STATIC:
+        spelling = "static";
+        break;
+    case SC_TYPEDEF:
+        spelling = "typedef";
+        break;
+    default:
+        FATAL("Unhandled storage class in lGetStorageClassName");
+        break;
+    }
+
+    return spelling;
+}
+
+
+/** Prints a bracketed, single-line description of these declaration
+    specifiers to stdout: the storage class, the soa<> width (if
+    positive), the type qualifiers, the base type, and the vector size (if
+    positive).  No trailing newline is printed.
+
+    baseType is allowed to be NULL (GetBaseType() handles that case by
+    assuming int32), so it is checked here rather than dereferenced
+    unconditionally.
+*/
+void
+DeclSpecs::Print() const {
+    printf("Declspecs: [%s ", lGetStorageClassName(storageClass));
+
+    if (soaWidth > 0) printf("soa<%d> ", soaWidth);
+    lPrintTypeQualifiers(typeQualifiers);
+    if (baseType != NULL)
+        printf("base type: %s", baseType->GetString().c_str());
+    else
+        printf("base type: (none)");
+
+    if (vectorSize > 0) printf("<%d>", vectorSize);
+    printf("]");
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Declarator
+
+/** Creates a declarator of the given kind at the given source position.
+    It starts with no child declarator, no type qualifiers, storage class
+    SC_NONE, an array size of -1, and NULL type and initializer
+    expression.
+*/
+Declarator::Declarator(DeclaratorKind dk, SourcePos p)
+    : pos(p), kind(dk) {
+    // Qualifier / storage state
+    storageClass = SC_NONE;
+    typeQualifiers = 0;
+    arraySize = -1;
+
+    // Pointer members all start out NULL
+    child = NULL;
+    initExpr = NULL;
+    type = NULL;
+}
+
+
+/** Initializes this declarator's type from the given declaration
+    specifiers and copies over their storage class.  If no type results,
+    an error is expected to have been reported already and the declarator
+    is otherwise left alone.  __declspec specifiers attached to a
+    non-function type are reported as an error.
+*/
+void
+Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
+    InitFromType(ds->GetBaseType(pos), ds);
+
+    if (type == NULL) {
+        AssertPos(pos, m->errorCount > 0);
+        return;
+    }
+
+    storageClass = ds->storageClass;
+
+    const bool hasDeclSpecs = (ds->declSpecList.size() > 0);
+    if (hasDeclSpecs && CastType<FunctionType>(type) == NULL)
+        Error(pos, "__declspec specifiers for non-function type \"%s\" are "
+              "not used.", type->GetString().c_str());
+}
+
+
+/** Prints a bracketed debugging description of this declarator to
+    stdout, preceded by a blank field of width indent: source position,
+    type qualifiers, storage class, name (or "(unnamed)"), array size,
+    declarator kind, the initializer expression if there is one, each
+    function parameter, and finally the child declarator.  Parameters and
+    the child are printed with four additional columns of indent.
+
+    @param indent  Width of the leading blank field.
+*/
+void
+Declarator::Print(int indent) const {
+    printf("%*cdeclarator: [", indent, ' ');
+    pos.Print();
+
+    lPrintTypeQualifiers(typeQualifiers);
+    printf("%s ", lGetStorageClassName(storageClass));
+    if (name.size() > 0)
+        printf("%s", name.c_str());
+    else
+        printf("(unnamed)");
+
+    printf(", array size = %d", arraySize);
+
+    printf(", kind = ");
+    switch (kind) {
+    case DK_BASE:      printf("base");      break;
+    case DK_POINTER:   printf("pointer");   break;
+    case DK_REFERENCE: printf("reference"); break;
+    case DK_ARRAY:     printf("array");     break;
+    case DK_FUNCTION:  printf("function");  break;
+    default:           FATAL("Unhandled declarator kind");
+    }
+
+    if (initExpr != NULL) {
+        printf(" = (");
+        initExpr->Print();
+        printf(")");
+    }
+
+    // The index is unsigned, so it is printed with %u rather than %d to
+    // keep the format specifier and the argument type in agreement.
+    for (unsigned int i = 0; i < functionParams.size(); ++i) {
+        printf("\n%*cfunc param %u:\n", indent, ' ', i);
+        functionParams[i]->Print(indent+4);
+    }
+
+    if (child != NULL)
+        child->Print(indent + 4);
+
+    printf("]\n");
+}
+
+
+/** Computes this declarator's type, starting from baseType and working
+    through the chain of child declarators.
+
+    Each declarator kind wraps baseType in the corresponding type (pointer,
+    reference, array, or function returning baseType).  If there is a
+    child declarator, the wrapped type is handed to it and the child's
+    resulting type and name become this declarator's type and name;
+    otherwise the wrapped type is used directly.  A DK_BASE declarator ends
+    the chain and simply adopts baseType.
+
+    If a problem is found, an error is reported and the method returns
+    early, leaving the type member as it was.
+
+    @param baseType  Type to which this declarator's kind is applied.  May
+                     be NULL after an earlier error.
+    @param ds        Declaration specifiers of the enclosing declaration.
+                     For functions they supply the storage class, the
+                     task/export/unmasked qualifiers and the __declspec
+                     list.  May be NULL.
+ */
+void
+Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
+    bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
+    bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
+    bool isTask =         ((typeQualifiers & TYPEQUAL_TASK) != 0);
+    bool isExported =     ((typeQualifiers & TYPEQUAL_EXPORT) != 0);
+    bool isConst =        ((typeQualifiers & TYPEQUAL_CONST) != 0);
+    bool isUnmasked =     ((typeQualifiers & TYPEQUAL_UNMASKED) != 0);
+
+    if (hasUniformQual && hasVaryingQual) {
+        Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
+        return;
+    }
+    if (kind != DK_FUNCTION && isTask) {
+        Error(pos, "\"task\" qualifier illegal in variable declaration.");
+        return;
+    }
+    if (kind != DK_FUNCTION && isUnmasked) {
+        Error(pos, "\"unmasked\" qualifier illegal in variable declaration.");
+        return;
+    }
+    if (kind != DK_FUNCTION && isExported) {
+        Error(pos, "\"export\" qualifier illegal in variable declaration.");
+        return;
+    }
+
+    // Variability stays unbound unless this declarator itself carries an
+    // explicit "uniform" or "varying" qualifier.
+    Variability variability(Variability::Unbound);
+    if (hasUniformQual)
+        variability = Variability::Uniform;
+    else if (hasVaryingQual)
+        variability = Variability::Varying;
+
+    if (kind == DK_BASE) {
+        // All of the type qualifiers should be in the DeclSpecs for the
+        // base declarator
+        AssertPos(pos, typeQualifiers == 0);
+        AssertPos(pos, child == NULL);
+        type = baseType;
+    }
+    else if (kind == DK_POINTER) {
+        // The base type may be missing after an earlier error (the
+        // function case below checks for that too); bail out here rather
+        // than dereferencing a NULL pointer in the IsSOAType() call.
+        if (baseType == NULL) {
+            AssertPos(pos, m->errorCount > 0);
+            return;
+        }
+
+        /* For now, any pointer to an SOA type gets the slice property; if
+           we add the capability to declare pointers as slices or not,
+           we'll want to set this based on a type qualifier here. */
+        const Type *ptrType = new PointerType(baseType, variability, isConst,
+                                              baseType->IsSOAType());
+        if (child != NULL) {
+            child->InitFromType(ptrType, ds);
+            type = child->type;
+            name = child->name;
+        }
+        else
+            type = ptrType;
+    }
+    else if (kind == DK_REFERENCE) {
+        if (hasUniformQual) {
+            Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
+            return;
+        }
+        if (hasVaryingQual) {
+            Error(pos, "\"varying\" qualifier is illegal to apply to references.");
+            return;
+        }
+        if (isConst) {
+            Error(pos, "\"const\" qualifier is illegal to apply to references.");
+            return;
+        }
+        // The parser should disallow this already, but double check.
+        if (CastType<ReferenceType>(baseType) != NULL) {
+            Error(pos, "References to references are illegal.");
+            return;
+        }
+
+        const Type *refType = new ReferenceType(baseType);
+        if (child != NULL) {
+            child->InitFromType(refType, ds);
+            type = child->type;
+            name = child->name;
+        }
+        else
+            type = refType;
+    }
+    else if (kind == DK_ARRAY) {
+        if (Type::Equal(baseType, AtomicType::Void)) {
+            Error(pos, "Arrays of \"void\" type are illegal.");
+            return;
+        }
+        if (CastType<ReferenceType>(baseType)) {
+            Error(pos, "Arrays of references (type \"%s\") are illegal.",
+                  baseType->GetString().c_str());
+            return;
+        }
+
+        const Type *arrayType = new ArrayType(baseType, arraySize);
+        if (child != NULL) {
+            child->InitFromType(arrayType, ds);
+            type = child->type;
+            name = child->name;
+        }
+        else
+            type = arrayType;
+    }
+    else if (kind == DK_FUNCTION) {
+        llvm::SmallVector<const Type *, 8> args;
+        llvm::SmallVector<std::string, 8> argNames;
+        llvm::SmallVector<Expr *, 8> argDefaults;
+        llvm::SmallVector<SourcePos, 8> argPos;
+
+        // Loop over the function arguments and store the name, type,
+        // default value (if any), and source file position of each one in
+        // the corresponding vector.
+        for (unsigned int i = 0; i < functionParams.size(); ++i) {
+            Declaration *d = functionParams[i];
+
+            if (d == NULL) {
+                AssertPos(pos, m->errorCount > 0);
+                continue;
+            }
+            if (d->declarators.size() == 0) {
+                // function declaration like foo(float), w/o a name for the
+                // parameter; wire up a placeholder Declarator for it
+                d->declarators.push_back(new Declarator(DK_BASE, pos));
+                d->declarators[0]->InitFromDeclSpecs(d->declSpecs);
+            }
+
+            AssertPos(pos, d->declarators.size() == 1);
+            Declarator *decl = d->declarators[0];
+            if (decl == NULL || decl->type == NULL) {
+                AssertPos(pos, m->errorCount > 0);
+                continue;
+            }
+
+            if (decl->name == "") {
+                // Give a name to any anonymous parameter declarations.
+                // The prefix is 17 characters and an unsigned index needs
+                // at most 10 digits, so 32 bytes is always enough.
+                char buf[32];
+                sprintf(buf, "__anon_parameter_%u", i);
+                decl->name = buf;
+            }
+            decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
+
+            if (d->declSpecs->storageClass != SC_NONE)
+                Error(decl->pos, "Storage class \"%s\" is illegal in "
+                      "function parameter declaration for parameter \"%s\".",
+                      lGetStorageClassName(d->declSpecs->storageClass),
+                      decl->name.c_str());
+            if (Type::Equal(decl->type, AtomicType::Void)) {
+                Error(decl->pos, "Parameter with type \"void\" illegal in function "
+                      "parameter list.");
+                decl->type = NULL;
+            }
+
+            const ArrayType *at = CastType<ArrayType>(decl->type);
+            if (at != NULL) {
+                // As in C, arrays are passed to functions as pointers to
+                // their element type.  We'll just immediately make this
+                // change now.  (One shortcoming of losing the fact that
+                // it was originally an array is that any warnings or
+                // errors later issued that print the function type will
+                // report this differently than it was originally declared
+                // in the function, but it's not clear that this is a
+                // significant problem.)
+                const Type *targetType = at->GetElementType();
+                if (targetType == NULL) {
+                    AssertPos(pos, m->errorCount > 0);
+                    return;
+                }
+
+                decl->type = PointerType::GetUniform(targetType);
+
+                // Make sure there are no unsized arrays (other than the
+                // first dimension) in function parameter lists.
+                at = CastType<ArrayType>(targetType);
+                while (at != NULL) {
+                    if (at->GetElementCount() == 0)
+                        Error(decl->pos, "Arrays with unsized dimensions in "
+                              "dimensions after the first one are illegal in "
+                              "function parameter lists.");
+                    at = CastType<ArrayType>(at->GetElementType());
+                }
+            }
+
+            args.push_back(decl->type);
+            argNames.push_back(decl->name);
+            argPos.push_back(decl->pos);
+
+            Expr *init = NULL;
+            // Try to find an initializer expression; it may be attached to
+            // this declarator or to any declarator further down its chain
+            // of children.
+            while (decl != NULL) {
+                if (decl->initExpr != NULL) {
+                    decl->initExpr = TypeCheck(decl->initExpr);
+                    decl->initExpr = Optimize(decl->initExpr);
+                    if (decl->initExpr != NULL) {
+                        // Only constants and null pointers are accepted as
+                        // default values.
+                        init = dynamic_cast<ConstExpr *>(decl->initExpr);
+                        if (init == NULL)
+                            init = dynamic_cast<NullPointerExpr *>(decl->initExpr);
+                        if (init == NULL)
+                            Error(decl->initExpr->pos, "Default value for parameter "
+                                  "\"%s\" must be a compile-time constant.",
+                                  decl->name.c_str());
+                    }
+                    break;
+                }
+                else
+                    decl = decl->child;
+            }
+            argDefaults.push_back(init);
+        }
+
+        const Type *returnType = baseType;
+        if (returnType == NULL) {
+            Error(pos, "No return type provided in function declaration.");
+            return;
+        }
+
+        if (CastType<FunctionType>(returnType) != NULL) {
+            Error(pos, "Illegal to return function type from function.");
+            return;
+        }
+
+        returnType = returnType->ResolveUnboundVariability(Variability::Varying);
+
+        // For functions, these properties come from the declaration
+        // specifiers rather than from this declarator's own qualifiers, so
+        // the locals below intentionally shadow the ones computed at the
+        // top of the method.
+        bool isExternC =  ds && (ds->storageClass == SC_EXTERN_C);
+        bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
+        bool isTask =     ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
+        bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
+
+        if (isExported && isTask) {
+            Error(pos, "Function can't have both \"task\" and \"export\" "
+                  "qualifiers");
+            return;
+        }
+        if (isExternC && isTask) {
+            Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
+                  "qualifiers");
+            return;
+        }
+        if (isExternC && isExported) {
+            Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
+                  "qualifiers");
+            return;
+        }
+        if (isUnmasked && isExported)
+            Warning(pos, "\"unmasked\" qualifier is redundant for exported "
+                    "functions.");
+
+        // The function's type and name are taken from the child declarator
+        // below, so there is nothing to build without one.
+        if (child == NULL) {
+            AssertPos(pos, m->errorCount > 0);
+            return;
+        }
+
+        const FunctionType *functionType =
+            new FunctionType(returnType, args, argNames, argDefaults,
+                             argPos, isTask, isExported, isExternC, isUnmasked);
+
+        // handle any explicit __declspecs on the function
+        if (ds != NULL) {
+            for (int i = 0; i < (int)ds->declSpecList.size(); ++i) {
+                std::string str = ds->declSpecList[i].first;
+                // Named specPos so that it doesn't shadow the pos member.
+                SourcePos specPos = ds->declSpecList[i].second;
+
+                if (str == "safe")
+                    (const_cast<FunctionType *>(functionType))->isSafe = true;
+                else if (!strncmp(str.c_str(), "cost", 4)) {
+                    // The cost value is whatever follows the "cost" prefix.
+                    int cost = atoi(str.c_str() + 4);
+                    if (cost < 0)
+                        Error(specPos, "Negative function cost %d is illegal.",
+                              cost);
+                    (const_cast<FunctionType *>(functionType))->costOverride = cost;
+                }
+                else
+                    Error(specPos, "__declspec parameter \"%s\" unknown.",
+                          str.c_str());
+            }
+        }
+
+        child->InitFromType(functionType, ds);
+        type = child->type;
+        name = child->name;
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Declaration
+
+/** Builds a declaration from the shared specifiers and an optional list
+    of declarators (copied from *dlist when it is non-NULL).  Every
+    non-NULL declarator is then completed via InitFromDeclSpecs(). */
+Declaration::Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist)
+    : declSpecs(ds) {
+    if (dlist != NULL)
+        declarators = *dlist;
+
+    typedef std::vector<Declarator *>::iterator DeclaratorIter;
+    for (DeclaratorIter it = declarators.begin(); it != declarators.end(); ++it) {
+        Declarator *d = *it;
+        if (d == NULL)
+            continue;
+        d->InitFromDeclSpecs(declSpecs);
+    }
+}
+
+
+/** Builds a declaration holding at most one declarator.  A non-NULL
+    declarator is completed via InitFromDeclSpecs() and stored; a NULL one
+    leaves the declarator list empty. */
+Declaration::Declaration(DeclSpecs *ds, Declarator *d)
+    : declSpecs(ds) {
+    if (d == NULL)
+        return;
+
+    d->InitFromDeclSpecs(ds);
+    declarators.push_back(d);
+}
+
+
+/** Creates a Symbol for every variable declarator in this declaration,
+    adds it to the module's symbol table, and returns the symbols paired
+    with their initializer expressions.  Declarators with function type
+    are skipped, "void" variables are reported as errors, and declarators
+    left without a type by an earlier error are ignored.  Must not be
+    called for typedef declarations. */
+std::vector<VariableDeclaration>
+Declaration::GetVariableDeclarations() const {
+    Assert(declSpecs->storageClass != SC_TYPEDEF);
+
+    std::vector<VariableDeclaration> result;
+    const unsigned int count = declarators.size();
+    for (unsigned int idx = 0; idx < count; ++idx) {
+        Declarator *d = declarators[idx];
+        if (d == NULL || d->type == NULL) {
+            // Ignore earlier errors
+            Assert(m->errorCount > 0);
+            continue;
+        }
+
+        if (Type::Equal(d->type, AtomicType::Void)) {
+            Error(d->pos, "\"void\" type variable illegal in declaration.");
+            continue;
+        }
+        if (CastType<FunctionType>(d->type) != NULL)
+            // Not a variable.
+            continue;
+
+        d->type = d->type->ResolveUnboundVariability(Variability::Varying);
+        Symbol *sym = new Symbol(d->name, d->pos, d->type, d->storageClass);
+        m->symbolTable->AddVariable(sym);
+        result.push_back(VariableDeclaration(sym, d->initExpr));
+    }
+
+    return result;
+}
+
+
+/** Registers every declarator of function type with the module via
+    AddFunctionDeclaration().  Non-function declarators, and declarators
+    left without a type by an earlier error, are skipped.  Must not be
+    called for typedef declarations. */
+void
+Declaration::DeclareFunctions() {
+    Assert(declSpecs->storageClass != SC_TYPEDEF);
+
+    // "inline" lives in the shared specifiers, so it is the same for every
+    // declarator in this declaration.
+    const bool isInline = ((declSpecs->typeQualifiers & TYPEQUAL_INLINE) != 0);
+
+    const unsigned int count = declarators.size();
+    for (unsigned int idx = 0; idx < count; ++idx) {
+        Declarator *d = declarators[idx];
+        if (d == NULL || d->type == NULL) {
+            // Ignore earlier errors
+            Assert(m->errorCount > 0);
+            continue;
+        }
+
+        const FunctionType *ftype = CastType<FunctionType>(d->type);
+        if (ftype != NULL)
+            m->AddFunctionDeclaration(d->name, ftype, d->storageClass,
+                                      isInline, d->pos);
+    }
+}
+
+
+/** Prints the declaration specifiers followed by each declarator to
+    stdout; declarators are indented four columns further than indent.
+    NULL entries in the declarator list are skipped, as they are by the
+    other methods of this class. */
+void
+Declaration::Print(int indent) const {
+    printf("%*cDeclaration: specs [", indent, ' ');
+    declSpecs->Print();
+    printf("], declarators:\n");
+    for (unsigned int i = 0 ; i < declarators.size(); ++i) {
+        // The list may hold NULL entries, so don't dereference blindly.
+        if (declarators[i] != NULL)
+            declarators[i]->Print(indent+4);
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+
+/** Walks the parsed struct member declarations in sd and appends, for each
+    member declarator in order, its type, name, and source position to the
+    three output vectors.  Member declarations whose type is NULL are
+    skipped.  Errors are reported for "void" members, for duplicate member
+    names, and for unsized arrays anywhere but in the last member. */
+void
+GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
+                             llvm::SmallVector<const Type *, 8> *elementTypes,
+                             llvm::SmallVector<std::string, 8> *elementNames,
+                             llvm::SmallVector<SourcePos, 8> *elementPositions) {
+    std::set<std::string> namesSoFar;
+
+    for (unsigned int i = 0; i < sd.size(); ++i) {
+        const StructDeclaration *structDecl = sd[i];
+        const Type *memberType = structDecl->type;
+        if (memberType == NULL)
+            continue;
+
+        // FIXME: making this fake little DeclSpecs here is really
+        // disgusting
+        DeclSpecs ds(memberType);
+        if (!Type::Equal(memberType, AtomicType::Void)) {
+            if (memberType->IsUniformType())
+                ds.typeQualifiers |= TYPEQUAL_UNIFORM;
+            else if (memberType->IsVaryingType())
+                ds.typeQualifiers |= TYPEQUAL_VARYING;
+            else if (memberType->GetSOAWidth() != 0)
+                ds.soaWidth = memberType->GetSOAWidth();
+            // FIXME: ds.vectorSize?
+        }
+
+        const std::vector<Declarator *> &memberDecls = *structDecl->declarators;
+        for (unsigned int j = 0; j < memberDecls.size(); ++j) {
+            Declarator *d = memberDecls[j];
+            d->InitFromDeclSpecs(&ds);
+
+            if (Type::Equal(d->type, AtomicType::Void))
+                Error(d->pos, "\"void\" type illegal for struct member.");
+
+            // insert() reports through .second whether the name was new.
+            if (!namesSoFar.insert(d->name).second)
+                Error(d->pos, "Struct member \"%s\" has same name as a "
+                      "previously-declared member.", d->name.c_str());
+
+            elementTypes->push_back(d->type);
+            elementNames->push_back(d->name);
+            elementPositions->push_back(d->pos);
+        }
+    }
+
+    // Only the very last member may be an unsized array.
+    const int lastIndex = (int)elementTypes->size() - 1;
+    for (int i = 0; i < lastIndex; ++i) {
+        const ArrayType *at = CastType<ArrayType>((*elementTypes)[i]);
+        if (at == NULL || at->GetElementCount() != 0)
+            continue;
+
+        Error((*elementPositions)[i], "Unsized arrays aren't allowed except "
+              "for the last member in a struct definition.");
+    }
+}
--- a/decl.h
+++ b/decl.h
@@ -0,0 +1,228 @@
+/*
+  Copyright (c) 2010-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file decl.h
+    @brief Declarations related to type declarations; the parser basically
+    creates instances of these classes, which are then turned into actual
+    Types.
+
+    Three classes work together to represent declarations.  As an example,
+    consider a declaration like:
+
+    static uniform int foo, bar[10];
+
+    An instance of the Declaration class represents this entire declaration
+    of two variables, 'foo' and 'bar'.  It holds a single instance of the
+    DeclSpecs class, which represents the common specifiers for all of the
+    variables--here, that the declaration has the 'static' and 'uniform'
+    qualifiers, and that its basic type is 'int'.  Then for each variable
+    declaration, the Declaration class holds an instance of a Declarator,
+    which in turn records the per-variable information like the name, array
+    size (if any), initializer expression, etc.  
+*/
+
+#ifndef ISPC_DECL_H
+#define ISPC_DECL_H
+
+#include "ispc.h"
+#include <llvm/ADT/SmallVector.h>
+
+struct VariableDeclaration;
+
+class Declaration;
+class Declarator;
+
+/* Multiple qualifiers can be provided with types in declarations;
+   therefore, each one is a distinct bit so that they can be ORed together
+   into an int (and tested individually with a bitwise AND). */
+#define TYPEQUAL_NONE           0
+#define TYPEQUAL_CONST      (1<<0)
+#define TYPEQUAL_UNIFORM    (1<<1)
+#define TYPEQUAL_VARYING    (1<<2)
+#define TYPEQUAL_TASK       (1<<3)
+#define TYPEQUAL_SIGNED     (1<<4)
+#define TYPEQUAL_UNSIGNED   (1<<5)
+#define TYPEQUAL_INLINE     (1<<6)
+#define TYPEQUAL_EXPORT     (1<<7)
+#define TYPEQUAL_UNMASKED   (1<<8)
+
+/** @brief Representation of the declaration specifiers in a declaration.
+
+    In other words, this represents all of the stuff that applies to all of
+    the (possibly multiple) variables in a declaration.
+ */
+class DeclSpecs {
+public:
+    /** NOTE(review): the constructor body is not visible from this header;
+        presumably t, sc and tq initialize baseType, storageClass and
+        typeQualifiers respectively -- confirm against decl.cpp. */
+    DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE,
+              int tq = TYPEQUAL_NONE);
+
+    /** Prints the specifiers; Declaration::Print() calls this to print
+        the "specs [...]" part of its output. */
+    void Print() const;
+
+    /** Storage class of the declaration.  Declarator::InitFromDeclSpecs()
+        copies it into each declarator. */
+    StorageClass storageClass;
+
+    /** Zero or more of the TYPEQUAL_* values, ORed together. */
+    int typeQualifiers;
+
+    /** The basic type provided in the declaration; this should be an
+        AtomicType, EnumType, StructType, or VectorType; other types (like
+        ArrayTypes) will end up being created if a particular declaration
+        has an array size, etc.
+    */
+    const Type *baseType;
+
+    /** Returns the type that Declarator::InitFromDeclSpecs() passes to
+        Declarator::InitFromType() as the starting point for a declarator
+        at the given position.  NOTE(review): callers appear to treat a
+        NULL result as the sign of an already-reported error -- confirm
+        against the implementation. */
+    const Type *GetBaseType(SourcePos pos) const;
+
+    /** If this is a declaration with a vector type, this gives the vector
+        width.  For non-vector types, this is zero.
+     */
+    int vectorSize;
+
+    /** If this is a declaration with an "soa<n>" qualifier, this gives the
+        SOA width specified.  Otherwise this is zero.
+     */
+    int soaWidth;
+
+    /** The __declspec parameters given with the declaration, each paired
+        with its source position.  Declarator::InitFromType() recognizes
+        "safe" and "cost<n>" for functions and reports anything else as an
+        error. */
+    std::vector<std::pair<std::string, SourcePos> > declSpecList;
+};
+
+
+/** The kinds of node that can appear in a chain of Declarators; see
+    Declarator::InitFromType() for how each one transforms the type it is
+    given. */
+enum DeclaratorKind {
+    DK_BASE,       ///< end of the chain; adopts the incoming type as-is
+    DK_POINTER,    ///< pointer to the incoming type
+    DK_REFERENCE,  ///< reference to the incoming type
+    DK_ARRAY,      ///< array of the incoming type
+    DK_FUNCTION    ///< function returning the incoming type
+};
+
+/** @brief Representation of the declaration of a single variable.  
+
+    In conjunction with an instance of the DeclSpecs, this gives us
+    everything we need for a full variable declaration.
+ */
+class Declarator {
+public:
+    /** Creates a declarator of the given kind at source position p.  The
+        remaining members start out empty/NULL (arraySize starts at -1). */
+    Declarator(DeclaratorKind dk, SourcePos p);
+
+    /** Once a DeclSpecs instance is available, this method completes the
+        initialization of the type member.  It also copies the storage
+        class from the DeclSpecs.
+     */
+    void InitFromDeclSpecs(DeclSpecs *ds);
+
+    /** Computes the type member by applying this declarator's kind to the
+        given base type and then recursing into the child declarator (if
+        any); when there is a child, the name is taken from it as well.
+        If an error is reported, type is left unchanged. */
+    void InitFromType(const Type *base, DeclSpecs *ds);
+
+    /** Prints a description of this declarator, its function parameters,
+        and its child declarator to stdout. */
+    void Print(int indent) const;
+
+    /** Position of the declarator in the source program. */
+    const SourcePos pos;
+
+    /** The kind of this declarator; complex declarations are assembled as
+        a hierarchy of Declarators.  (For example, a pointer to an int
+        would have a root declarator with kind DK_POINTER and with the
+        Declarator::child member pointing to a DK_BASE declarator for the
+        int). */
+    const DeclaratorKind kind;
+
+    /** Child pointer if needed; this can only be non-NULL if the
+        declarator's kind isn't DK_BASE. */
+    Declarator *child;
+
+    /** Type qualifiers provided with the declarator (TYPEQUAL_* values
+        ORed together). */
+    int typeQualifiers;
+
+    /** Storage class; copied from the DeclSpecs by InitFromDeclSpecs(). */
+    StorageClass storageClass;
+
+    /** For array declarators, this gives the declared size of the array.
+        Unsized arrays have arraySize == 0.  The constructor sets this to
+        -1; presumably the parser overwrites it for array declarators. */ 
+    int arraySize;
+
+    /** Name associated with the declarator. */
+    std::string name;
+
+    /** Initialization expression for the variable.  May be NULL. */
+    Expr *initExpr;
+
+    /** Type of the declarator.  This is NULL until InitFromDeclSpecs() or
+        InitFromType() is called. */
+    const Type *type;
+
+    /** For function declarations, this holds the Declaration *s for the
+        function's parameters. */
+    std::vector<Declaration *> functionParams;
+};
+
+
+/** @brief Representation of a full declaration of one or more variables,
+    including the shared DeclSpecs as well as the per-variable Declarators.
+ */
+class Declaration {
+public:
+    /** Both constructors store ds and call InitFromDeclSpecs(ds) on every
+        non-NULL declarator they are given. */
+    Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
+    Declaration(DeclSpecs *ds, Declarator *d);
+
+    /** Prints the specifiers and then each declarator to stdout. */
+    void Print(int indent) const;
+
+    /** This method walks through all of the Declarators in a declaration
+        and returns a fully-initialized Symbol and (possibly) an
+        initialization expression for each one.  (This allows the rest of
+        the system to not have to worry about the mess of the general
+        Declarator representation.)  Each Symbol is also added to the
+        module's symbol table; declarators of function type are skipped. */
+    std::vector<VariableDeclaration> GetVariableDeclarations() const;
+
+    /** For any function declarations in the Declaration, add the
+        declaration to the module. */
+    void DeclareFunctions();
+
+    /** Specifiers shared by all of the declarators. */
+    DeclSpecs *declSpecs;
+    /** The per-variable (or per-function) declarators. */
+    std::vector<Declarator *> declarators;
+};
+
+
+/** The parser creates instances of StructDeclaration for the members of
+    structs as it's parsing their declarations. */
+struct StructDeclaration {
+    /** Stores the given type and declarator-list pointer as-is. */
+    StructDeclaration(const Type *t, std::vector<Declarator *> *d)
+        : type(t), declarators(d) { }
+
+    /** Type given for the member declaration.  Entries with a NULL type
+        are skipped by GetStructTypesNamesPositions(). */
+    const Type *type;
+    /** Declarators of the members declared with that type.
+        GetStructTypesNamesPositions() dereferences this pointer without
+        checking it for NULL. */
+    std::vector<Declarator *> *declarators;
+};
+
+
+/** Given a set of StructDeclaration instances, this appends the types,
+    names, and source positions of the elements of the corresponding
+    struct to the three output vectors (one entry per member declarator,
+    in declaration order).  Errors are reported for "void" members,
+    duplicate member names, and unsized arrays that aren't the last
+    member. */
+extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
+                                         llvm::SmallVector<const Type *, 8> *elementTypes,
+                                         llvm::SmallVector<std::string, 8> *elementNames,
+                                         llvm::SmallVector<SourcePos, 8> *elementPositions);
+
+#endif // ISPC_DECL_H
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -0,0 +1,606 @@
+=== v1.3.0 === (29 June 2012)
+
+This is a major new release of ispc, with support for more compilation
+targets and a number of additions to the language.  As usual, the quality
+of generated code has also been improved in a number of cases and a number
+of small bugs have been fixed.
+
+New targets:
+
+* This release provides "beta" support for compiling to Intel® Xeon
+  Phi™ processor, code named Knights Corner, the first processor in
+  the Intel® Many Integrated Core Architecture.  See
+  http://ispc.github.com/ispc.html#compiling-for-the-intel-xeon-phi-architecture
+  for more details on this support.
+
+* This release also has an "avx1.1" target, which provides support for the
+  new instructions in the Intel Ivy Bridge microarchitecture. 
+
+New language features:
+
+* The foreach_active statement allows iteration over the active program
+  instances in a gang.  (See
+  http://ispc.github.com/ispc.html#iteration-over-active-program-instances-foreach-active)
+
+* foreach_unique allows iterating over subsets of program instances in a
+  gang that share the same value of a variable.  (See
+  http://ispc.github.com/ispc.html#iteration-over-unique-elements-foreach-unique)
+
+* An "unmasked" function qualifier and statement in the language allow
+  re-activating execution of all program instances in a gang.  (See
+  http://ispc.github.com/ispc.html#re-establishing-the-execution-mask)
+
+Standard library updates:
+
+* The seed_rng() function has been modified to take a "varying" seed value
+  when a varying RNGState is being initialized.
+
+* An isnan() function has been added, to check for floating-point "not a
+  number" values.
+
+* The float_to_srgb8() routine does high performance conversion of
+  floating-point color values to SRGB8 format.
+
+Other changes:
+
+* A number of bugfixes have been made for compiler crashes with malformed
+  programs.
+
+* Floating-point comparisons are now "unordered", so that any comparison
+  where one of the operands is a "not a number" value returns false.  (This
+  matches standard IEEE floating-point behavior.)
+
+* The code generated for 'break' statements in "varying" loops has been
+  improved for some common cases. 
+
+* Compile time and compiler memory use have both been improved,
+  particularly for large input programs.
+
+* A number of bugs have been fixed in the debugging information generated
+  by the compiler when the "-g" command-line flag is used.
+
+=== v1.2.2 === (20 April 2012)
+
+This release includes a number of small additions to functionality and a
+number of bugfixes.  New functionality includes:
+
+* It's now possible to forward declare structures as in C/C++: "struct
+  Foo;".  After such a declaration, structs with pointers to "Foo" and
+  functions that take pointers or references to Foo structs can be declared
+  without the entire definition of Foo being available.
+
+* New built-in types size_t, ptrdiff_t, and [u]intptr_t are now available,
+  corresponding to the equivalent types in C.
+
+* The standard library now provides atomic_swap*() and
+  atomic_compare_exchange*() functions for void * types.
+
+* The C++ backend has seen a number of improvements to the quality and
+  readability of generated code.
+
+A number of bugs have been fixed in this release as well.  The most
+significant are:
+
+* Fixed a bug where nested loops could cause a compiler crash in some
+  circumstances (issues #240, and #229)
+
+* Gathers could access invalid memory (and cause the program to crash) in
+  some circumstances (#235)
+
+* References to temporary values are now handled properly when passed to a
+  function that takes a reference typed parameter.
+
+* A case where incorrect code could be generated for compile-time-constant
+  initializers has been fixed (#234).
+
+=== v1.2.1 === (6 April 2012)
+
+This release contains only minor new functionality and is mostly for many
+small bugfixes and improvements to error handling and error reporting.
+The new functionality that is present is:
+
+* Significantly more efficient versions of the float / half conversion
+  routines are now available in the standard library, thanks to Fabian
+  Giesen.
+
+* The last member of a struct can now be a zero-length array; this allows
+  the trick of dynamically allocating enough storage for the struct and
+  some number of array elements at the end of it.
+
+Significant bugs fixed include:
+
+* Issue #205: When a target ISA isn't specified, use the host system's
+  capabilities to choose a target for which it will be able to run the
+  generated code.
+
+* Issues #215 and #217: Don't allocate storage for global variables that
+  are declared "extern".
+
+* Issue #197: Allow NULL as a default argument value in a function
+  declaration.
+
+* Issue #223: Fix bugs where taking the address of a function wouldn't work
+  as expected.
+
+* Issue #224: When there are overloaded variants of a function that take
+  both reference and const reference parameters, give the non-const
+  reference preference when matching values of that underlying type.
+
+* Issue #225: An error is issued when a varying lvalue is assigned to a
+  reference type (rather than crashing).
+
+* Issue #193: Permit conversions from array types to void *, not just the
+  pointer type of the underlying array element.
+
+* Issue #199: Still evaluate expressions that are cast to (void).
+
+The documentation has also been improved, with FAQs added to clarify some
+aspects of the ispc pointer model.
+
+=== v1.2.0 === (20 March 2012)
+
+This is a major new release of ispc, with a number of significant
+improvements to functionality, performance, and compiler robustness.  It
+does, however, include three small changes to language syntax and semantics
+that may require changes to existing programs:
+
+* Syntax for the "launch" keyword has been cleaned up; it's now no longer
+  necessary to bracket the launched function call with angle brackets.
+  (In other words, now use "launch foo();", rather than "launch < foo() >;".)
+
+* When using pointers, the pointed-to data type is now "uniform" by
+  default.  Use the varying keyword to specify varying pointed-to types when
+  needed.  (i.e. "float *ptr" is a varying pointer to uniform float data,
+  whereas previously it was a varying pointer to varying float values.)
+  Use "varying float *" to specify a varying pointer to varying float data,
+  and so forth.
+
+* The details of "uniform" and "varying" and how they interact with struct
+  types have been cleaned up.  Now, when a struct type is declared, if the
+  struct elements don't have explicit "uniform" or "varying" qualifiers,
+  they are said to have "unbound" variability.  When a struct type is
+  instantiated, any unbound variability elements inherit the variability of
+  the parent struct type. See http://ispc.github.com/ispc.html#struct-types
+  for more details.
+
+ispc has a new language feature that makes it much easier to use the
+efficient "(array of) structure of arrays" (AoSoA, or SoA) memory layout of
+data.  A new "soa<n>" qualifier can be applied to structure types to
+specify an n-wide SoA version of the corresponding type.  Array indexing
+and pointer operations with arrays of SoA types automatically handle the
+two-stage indexing calculation to access the data.  See
+http://ispc.github.com/ispc.html#structure-of-array-types for more details.
+
+For more efficient access of data that is still in "array of structures"
+(AoS) format, ispc has a new "memory coalescing" optimization that
+automatically detects series of strided loads and/or gathers that can be
+transformed into a more efficient set of vector loads and shuffles.  A
+diagnostic is emitted when this optimization is successfully applied. 
+
+Smaller changes in this release:
+
+* The standard library now provides memcpy(), memmove() and memset()
+  functions, as well as single-precision asin() and acos() functions.
+
+* -I can now be specified on the command-line to specify a search path for
+  #include files.
+
+* A number of improvements have been made to error reporting from the
+  parser, and a number of cases where malformed programs could cause the
+  compiler to crash have been fixed.
+
+* A number of small improvements to the quality and performance of generated
+  code have been made, including finding more cases where 32-bit addressing
+  calculations can be safely done on 64-bit systems and generating better
+  code for initializer expressions.
+
+=== v1.1.4 === (4 February 2012)
+
+There are two major bugfixes for Windows in this release.  First, a number
+of failures in AVX code generation on Windows have been fixed; AVX on
+Windows now has no known issues.  Second, a longstanding bug in parsing 64-bit
+integer constants on Windows has been fixed.
+
+This release features a new experimental scalar target, contributed by Gabe
+Weisz <gweisz@cs.cmu.edu>.  This target ("--target=generic-1") compiles
+gangs of single program instances (i.e. programCount == 1); it can be
+useful for debugging ispc programs.
+
+The compiler now supports dynamic memory allocation in ispc programs (with
+"new" and "delete" operators based on C++).  See
+http://ispc.github.com/ispc.html#dynamic-memory-allocation in the
+documentation for more information.
+
+ispc now performs "short circuit" evaluation of the || and && logical
+operators and the ? : selection operator.  (This represents the correction
+of a major incompatibility with C.)  Code like "(index < arraySize &&
+array[index] == 1)" thus now executes as in C, where "array[index]" won't
+be evaluated unless "index" is less than "arraySize".
+
+The standard library now provides "local" atomic operations, which are
+atomic across the gang of program instances (but not across other gangs or
+other hardware threads).  See the updated documentation on atomics for more
+information:
+http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences.
+
+The standard library now offers a clock() function, which returns a uniform
+int64 value that counts processor cycles; it can be used for
+fine-resolution timing measurements.
+
+Finally (of limited interest now): ispc now supports the forthcoming AVX2
+instruction set, due with Haswell-generation CPUs.  All tests and examples
+compile and execute correctly with AVX2.  (Thanks specifically to Craig
+Topper and Nadav Rotem for work on AVX2 support in LLVM, which made this
+possible.)
+ 
+=== v1.1.3 === (20 January 2012)
+
+With this release, the language now supports "switch" statements, with the
+same semantics and syntax as in C.
+
+This release includes fixes for two important performance related issues:
+the quality of code generated for "foreach" statements has been
+substantially improved (https://github.com/ispc/ispc/issues/151), and a
+performance regression with code for "gathers" that was introduced in
+v1.1.2 has been fixed in this release. 
+
+A number of other small bugs were fixed in this release as well, including
+one where invalid memory would sometimes be incorrectly accessed
+(https://github.com/ispc/ispc/issues/160).
+
+Thanks to Jean-Luc Duprat for a number of patches that improve support for
+building on various platforms, and to Pierre-Antoine Lacaze for patches so
+that ispc builds under MinGW.
+
+=== v1.1.2 === (9 January 2012)
+
+The major new feature in this release is support for "generic" C++
+vectorized output; in other words, ispc can emit C++ code that corresponds
+to the vectorized computation that the ispc program represents.  See the
+examples/intrinsics directory in the ispc distribution for two example
+implementations of the set of functions that must be provided to map the
+vector calls generated by ispc to target specific functions.
+
+ispc now has partial support for 'goto' statements; specifically, goto is
+allowed if any enclosing control flow statements (if/for/while/do) have
+'uniform' test expressions, but not if they have 'varying' tests.
+
+A number of improvements have been made to the code generated for gathers
+and scatters--one of them (better matching x86's "free" scale by 2/4/8 for
+addressing calculations) improved the performance of the noise example by
+14%.
+
+Many small bugs have been fixed in this release as well, including issue
+numbers 138, 129, 135, 127, 149, and 142.
+
+=== v1.1.1 === (15 December 2011)
+
+This release doesn't include any significant new functionality, but does
+include small improvements in generated code and a number of bug fixes.
+
+The one user-visible language change is that integer constants may be
+specified with 'u' and 'l' suffixes, like in C.  For example, "1024llu"
+defines the constant with unsigned 64-bit type.
+
+More informative and useful error messages are printed when function
+overload resolution fails.
+
+Masking is avoided in additional cases when the mask can be
+statically-determined to be all on. 
+
+A number of small bugs have been fixed:
+- Under some circumstances, incorrect masks were used when assigning a
+  value to a reference and when doing gathers/scatters.
+- Incorrect code could be generated in some cases when some instances
+  returned part way through a function but others continued executing.
+- Type checking wasn't being performed for calls through function pointers;
+  now an error is issued if the arguments don't match up, etc.
+- Incorrect code was being generated for gather/scatter to structs that had
+  elements with varying short-vector types.
+- Typechecking wasn't being performed for "foreach" statements; this led to
+  problems like function overload resolution not being performed if an
+  overloaded function call was used to determine the iteration range.
+- A number of symbols would be multiply-defined when compiling to multiple
+  targets and using the sse2-x2 target as one of them (issue #131).
+
+=== v1.1.0 === (5 December 2011)
+
+This is a major new release of the compiler, with significant additions to
+language functionality and capabilities.  It includes a number of small
+language syntax changes that will require modification of existing
+programs.  These changes should generally be straightforward and all are
+steps toward eliminating parts of ispc syntax that are incompatible with
+C/C++.  See
+http://ispc.github.com/ispc.html#updating-ispc-programs-for-changes-in-ispc-1-1
+for more information about these changes.
+
+ispc now fully supports pointers, including pointer arithmetic, implicit
+conversions of arrays to pointers, and all of the other capabilities of
+pointers in C.  See http://ispc.github.com/ispc.html#pointer-types for more
+information about pointers in ispc and
+http://ispc.github.com/ispc.html#function-pointer-types for information
+about function pointers in ispc.
+
+Reference types are now declared with C++ syntax (e.g. "const float &foo").
+
+ispc now supports 64-bit addressing.  For performance reasons, this
+capability is disabled by default (even on 64-bit targets), but can be
+enabled with a command-line flag:
+http://ispc.github.com/ispc.html#selecting-32-or-64-bit-addressing.
+
+This release features new parallel "foreach" statements, which make it
+easier in many instances to map program instances to data for data-parallel
+computation than the programIndex/programCount mechanism:
+http://ispc.github.com/ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled.
+
+Finally, all of the system's documentation has been significantly revised.
+The documentation of ispc's parallel execution model has been rewritten:
+http://ispc.github.com/ispc.html#the-ispc-parallel-execution-model, and
+there is now a more specific discussion of similarities and differences
+between ispc and C/C++:
+http://ispc.github.com/ispc.html#relationship-to-the-c-programming-language.
+There is now a separate FAQ (http://ispc.github.com/faq.html), and a
+Performance Guide (http://ispc.github.com/perfguide.html).
+ 
+=== v1.0.12 === (20 October 2011)
+
+This release includes a new "double-pumped" 8-wide target for SSE2,
+"sse2-x2".  Like the sse4-x2 and avx-x2 targets, this target may deliver
+higher performance for some workloads than the regular sse2 target.  (For
+other workloads, it may be slower.)
+
+The ispc language now includes an "assert()" statement.  See
+http://ispc.github.com/ispc.html#assertions for more information.
+
+The compiler now sets a preprocessor #define based on the target ISA; for
+example, ISPC_TARGET_SSE4 is defined for the sse4 targets, and so forth.
+
+The standard library now provides high-performance routines for converting
+between some "array of structures" and "structure of arrays" formats.
+See
+http://ispc.github.com/ispc.html#converting-between-array-of-structures-and-structure-of-arrays-layout
+for more information.
+
+Inline functions now have static linkage.
+
+A number of improvements have been made to the optimization passes that
+detect when gathers and scatters can be transformed into vector loads and
+stores, respectively.  In particular, these passes now handle variables that
+are used as loop induction variables much better.
+
+=== v1.0.11 === (6 October 2011)
+
+The main new feature in this release is support for generating code for
+multiple targets (e.g., SSE2, SSE4, and AVX) and having the compiled code
+select the best variant at execution time.  For more information, see
+http://ispc.github.com/ispc.html#compiling-with-support-for-multiple-instruction-sets.
+
+All of the examples now take advantage of the support for multiple
+compilation targets; thus, if one has an AVX system, it's not necessary to
+recompile the examples to use the AVX target.
+
+Performance of the built-in task system that is used in the examples has
+been improved.
+
+Finally, the print() statement now works on OSX; it had been broken for the
+last few releases.
+
+=== v1.0.10 === (30 September 2011)
+
+This release features an extensive new example showing the application of
+ispc to a deferred shading algorithm for scenes with thousands of lights
+(examples/deferred).  This is an implementation of the algorithm that Johan
+Andersson described at SIGGRAPH 2009 and was implemented by Andrew
+Lauritzen and Jefferson Montgomery.  The basic idea is that a pre-rendered
+G-buffer is partitioned into tiles, and in each tile, the set of lights
+that contribute to the tile is computed.  Then, the pixels in the tile are
+shaded using those light sources. (See slides 19-29 of
+http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
+for more details on the algorithm.)
+
+The mechanism for launching tasks from ispc code has been generalized to
+allow multiple tasks to be launched with a single launch call (see
+http://ispc.github.com/ispc.html#task-parallelism-language-syntax for more
+information.)
+
+A few new functions have been added to the standard library: num_cores()
+returns the number of cores in the system's CPU, and variants of all of the
+atomic operators that take 'uniform' values as parameters have been added.
+
+=== v1.0.9 === (26 September 2011)
+
+The binary release of v1.0.9 is the first that supports AVX code
+generation.  Two targets are provided: "avx", which runs with a
+programCount of 8, and "avx-x2" which runs 16 program instances
+simultaneously.  (This binary is also built using the in-progress LLVM 3.0
+development libraries, while previous ones have been built with the
+released 2.9 version of LLVM.)
+
+This release has no other significant changes beyond a number of small
+bugfixes (https://github.com/ispc/ispc/issues/100,
+https://github.com/ispc/ispc/issues/101, https://github.com/ispc/ispc/issues/103.)
+ 
+=== v1.0.8 === (19 September 2011)
+
+A number of improvements have been made to handling of 'if' statements in
+the language:
+  - A bug was fixed where invalid memory could be incorrectly accessed even
+    if none of the running program instances wanted to execute the
+    corresponding instructions (https://github.com/ispc/ispc/issues/74).
+  - The code generated for 'if' statements is a bit simpler and thus more
+    efficient.
+
+There is now a '--pic' command-line argument that causes position-independent
+code to be generated (Linux and OSX only).
+
+A number of additional performance improvements:
+  - Loops are now unrolled by default; the --opt=disable-loop-unroll
+    command-line argument can be used to disable this behavior.
+    (https://github.com/ispc/ispc/issues/78)
+  - A few more cases where gathers/scatters could be determined at compile
+    time to actually access contiguous locations have been added.
+    (https://github.com/ispc/ispc/issues/79)
+
+Finally, warnings are now issued (if possible) when it can be determined
+at compile-time that an out-of-bounds array index is being used.
+(https://github.com/ispc/ispc/issues/98).
+
+
+=== v1.0.7 === (3 September 2011)
+
+The various atomic_*_global() standard library functions are generally
+substantially more efficient.  They all previously issued one hardware
+atomic instruction for each running program instance but now locally
+compute a reduction over the operands and issue a single hardware atomic,
+giving the same effect and results in the end (issue #57).
+
+CPU/ISA target handling has been substantially improved.  If no CPU is
+specified, the host CPU type is used, not just a default of "nehalem".  A
+number of bugs were fixed that ensure that LLVM doesn't generate SSE>2
+instructions when using the SSE2 target (fixes issue #82).
+
+Right shifts of unsigned integer types use a logical shift right
+instruction now, not an arithmetic shift right (fixed issue #88).
+
+When emitting header files, 'extern' declarations of globals used in ispc
+code are now outside of the ispc namespace.  Fixes issue #64.
+
+The stencil example has been modified to do runs with and without
+parallelism.
+
+Many other small bugfixes and improvements.
+
+=== v1.0.6 === (17 August 2011)
+
+Some additional cross-program instance operations have been added to the
+standard library.  reduce_equal() checks to see if the given value is the
+same across all running program instances, and exclusive_scan_{add,and,or}()
+computes a scan over the given value in the running program instances.
+See the documentation of these new routines for more information:
+http://ispc.github.com/ispc.html#cross-program-instance-operations.
+
+The simple task system implementations used in the examples have been
+improved.  The Windows version no longer has a hard limit on the number of
+tasks that can be launched, and all versions have less dynamic memory
+allocation and less locking.  More of the examples now have paths that also
+measure performance using tasks along with SPMD vectorization.
+
+Two new examples have been added: one that shows the implementation of a
+ray-marching volume rendering algorithm, and one that shows a 3D stencil
+computation, as might be done for PDE solutions.
+
+Standard library routines to issue prefetches have been added.  See the
+documentation for more details: http://ispc.github.com/ispc.html#prefetches.
+
+Fast versions of the float to half-precision float conversion routines have
+been added.  For more details, see:
+http://ispc.github.com/ispc.html#conversions-to-and-from-half-precision-floats.
+
+There is the usual set of small bug fixes.  Notably, a number of details
+related to handling 32 versus 64 bit targets have been fixed, which in turn
+has fixed a bug related to tasks having incorrect values for pointers
+passed to them.
+
+=== v1.0.5 === (1 August 2011)
+
+Multi-element vector swizzles are supported; for example, given a 3-wide
+vector "foo", then expressions like "foo.zyx" and "foo.yz" can be used to
+construct other short vectors.  See
+http://ispc.github.com/ispc.html#short-vector-types
+for more details.  (Thanks to Pete Couperus for implementing this code!).
+
+int8 and int16 datatypes are now supported.  It is still generally more
+efficient to use int32 for intermediate computations, even if the in-memory
+format is int8 or int16.
+
+There are now standard library routines to convert to and from 'half'-format
+floating-point values (half_to_float() and float_to_half()).
+
+There is a new example with an implementation of Perlin's Noise function
+(examples/noise).  It shows a speedup of approximately 4.2x versus a C
+implementation on OSX and a 2.9x speedup versus C on Windows.
+
+=== v1.0.4 === (18 July 2011)
+
+enums are now supported in ispc; see the section on enumeration types in
+the documentation (http://ispc.github.com/ispc.html#enumeration-types) for
+more information.
+
+bools are converted to integers with zero extension, not sign extension as
+before (i.e. a 'true' bool converts to the value one, not 'all bits on'.)
+For cases where sign extension is still desired, there is a
+sign_extend(bool) function in the standard library.
+
+Support for 64-bit types in the standard library is much more complete than
+before.
+
+64-bit integer constants are now supported by the parser.
+
+Storage for parameters to tasks is now allocated dynamically on Windows,
+rather than on the stack; with this fix, all tests now run correctly on
+Windows.
+
+There is now support for atomic swap and compare/exchange with float and
+double types.
+
+A number of additional small bugs have been fixed and a number of cases
+where the compiler would crash given a malformed program have been fixed.
+
+=== v1.0.3 === (4 July 2011)
+
+ispc now has a built-in pre-processor (from LLVM's clang compiler).
+(Thanks to Pete Couperus for this patch!)  It is therefore no longer
+necessary to use cl.exe for preprocessing on Windows; the MSVC project
+files for the examples have been updated accordingly.
+
+There is another variant of the shuffle() function in the standard
+library: "<type> shuffle(<type> v0, <type> v1, int permute)", where the
+permutation vector indexes over the concatenation of the two vectors
+(e.g. the value 0 corresponds to the first element of v0, the value
+2*programCount-1 corresponds to the last element of v1, etc.)
+
+ispc now supports the usual range of atomic operations (add, subtract, min,
+max, and, or, and xor) as well as atomic swap and atomic compare and
+exchange.  There is also a facility for inserting memory fences.  See the
+"Atomic Operations and Memory Fences" section of the user's guide
+(http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences) for
+more information.
+ 
+There are now both 'signed' and 'unsigned' variants of the standard library
+functions like packed_load_active() that take references to arrays of
+signed int32s and unsigned int32s respectively.  (The
+{load_from,store_to}_{int8,int16}() functions have similarly been augmented
+to have both 'signed' and 'unsigned' variants.)
+
+In initializer expressions with variable declarations, it is no longer
+legal to initialize arrays and structs with single scalar values that then
+initialize their members; they now must be initialized with initializer
+lists in braces (or initialized after the declaration with a loop over
+array elements, etc.)
+
+=== v1.0.2 === (1 July 2011)
+
+Floating-point hexadecimal constants are now parsed correctly on Windows
+(fixes issue #16).
+
+SSE2 is now the default target if --cpu=atom is given in the command line
+arguments and another target isn't explicitly specified.
+
+The standard library now provides broadcast(), rotate(), and shuffle()
+routines for efficient communication between program instances.
+
+The MSVC solution files to build the examples on Windows now use
+/fpmath:fast when building.
+
+=== v1.0.1 === (24 June 2011)
+
+ispc no longer requires that pointers to memory that are passed in to ispc
+have alignment equal to the target's vector width; now alignment just has to
+be the regular element alignment (e.g. 4 bytes for floats, etc.)  This
+change also fixed a number of cases where it previously incorrectly
+generated aligned load/store instructions in cases where the address wasn't
+actually aligned (even if the base address passed into ispc code was).
+
+=== v1.0 === (21 June 2011)
+
+Initial Release
--- a/docs/build.sh
+++ b/docs/build.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+for i in ispc perfguide faq; do
+    rst2html --template=template.txt --link-stylesheet \
+        --stylesheet-path=css/style.css $i.rst > $i.html
+done
+
+rst2html --template=template-news.txt --link-stylesheet \
+    --stylesheet-path=css/style.css news.rst > news.html
+
+rst2html --template=template-perf.txt --link-stylesheet \
+        --stylesheet-path=css/style.css perf.rst > perf.html
+
+#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
+#pdflatex ispc.tex
+#/bin/rm -f ispc.aux ispc.log ispc.out ispc.tex
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -0,0 +1,879 @@
+=====================================
+Frequently Asked Questions About ispc
+=====================================
+
+This document includes a number of frequently (and not frequently) asked
+questions about ispc, the Intel® SPMD Program Compiler.  The source to this
+document is in the file ``docs/faq.rst`` in the ``ispc`` source
+distribution.
+
+* Understanding ispc's Output
+
+  + `How can I see the assembly language generated by ispc?`_
+  + `How can I have the assembly output be printed using Intel assembly syntax?`_
+  + `Why are there multiple versions of exported ispc functions in the assembly output?`_
+  + `How can I more easily see gathers and scatters in generated assembly?`_
+
+* Running The Compiler
+
+  + `Why is it required to use one of the "generic" targets with C++ output?`_
+  + `Why won't the compiler generate an object file or assembly output with the "generic" targets?`_
+
+* Language Details
+
+  + `What is the difference between "int *foo" and "int foo[]"?`_
+  + `Why are pointed-to types "uniform" by default?`_
+  + `What am I getting an error about assigning a varying lvalue to a reference type?`_ 
+  
+* Interoperability
+
+  + `How can I supply an initial execution mask in the call from the application?`_
+  + `How can I generate a single binary executable with support for multiple instruction sets?`_
+  + `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
+  + `Is it possible to inline ispc functions in C/C++ code?`_
+  + `Why is it illegal to pass "varying" values from C/C++ to ispc functions?`_ 
+
+* Programming Techniques
+
+  + `What primitives are there for communicating between SPMD program instances?`_
+  + `How can a gang of program instances generate variable amounts of output efficiently?`_
+  + `Is it possible to use ispc for explicit vector programming?`_
+  + `How can I debug my ispc programs using Valgrind?`_
+  + `foreach statements generate more complex assembly than I'd expect; what's going on?`_
+  + `How do I launch an individual task for each active program instance?`_
+
+Understanding ispc's Output
+===========================
+
+How can I see the assembly language generated by ispc?
+------------------------------------------------------
+
+The ``--emit-asm`` flag causes assembly output to be generated.  If the
+``-o`` command-line flag is also supplied, the assembly is stored in the
+given file, or printed to standard output if ``-`` is specified for the
+filename.  For example, given the simple ``ispc`` program:
+
+::
+
+    export uniform int foo(uniform int a, uniform int b) {
+        return a+b;
+    }
+
+If the SSE4 target is used, then the following assembly is printed:
+
+::
+
+    _foo:
+            addl    %esi, %edi
+            movl    %edi, %eax
+            ret
+
+
+How can I have the assembly output be printed using Intel assembly syntax?
+--------------------------------------------------------------------------
+
+The ``ispc`` compiler is currently only able to emit assembly with AT&T
+syntax, where the destination operand is the last operand after an
+instruction.  If you'd prefer Intel assembly output, one option is to use
+Agner Fog's ``objconv`` tool: have ``ispc`` emit a native object file and
+then use ``objconv`` to disassemble it, specifying the assembler syntax
+that you prefer.  ``objconv`` `is available for download here`_.
+
+.. _is available for download here: http://www.agner.org/optimize/#objconv
+
+Why are there multiple versions of exported ispc functions in the assembly output?
+----------------------------------------------------------------------------------
+
+Two versions of all functions qualified with ``export`` are generated:
+one of them is to be called by other ``ispc`` functions, and the
+other is to be called by the application.  The application callable
+function has the original function's name, while the ``ispc``-callable
+function has a mangled name that encodes the types of the function's
+parameters.
+
+The crucial difference between these two functions is that the
+application-callable function doesn't take a parameter encoding the current
+execution mask, while ``ispc``-callable functions have a hidden mask
+parameter.  An implication of this difference is that the ``export``
+function starts with the execution mask "all on".  This allows a number of
+improvements in the generated code, particularly on architectures that
+don't have support for masked load and store instructions.
+
+As an example, consider this short function, which loads a vector's worth
+of values from two arrays in memory, adds them, and writes the result to an
+output array.
+
+::
+
+    export void foo(uniform float a[], uniform float b[],
+                    uniform float result[]) {
+        float aa = a[programIndex], bb = b[programIndex];
+        result[programIndex] = aa+bb;
+    }
+
+Here is the assembly code for the application-callable instance of the
+function.
+
+::
+
+    _foo:
+            movups        (%rsi), %xmm1
+            movups        (%rdi), %xmm0
+            addps         %xmm1, %xmm0
+            movups        %xmm0, (%rdx)
+            ret
+
+
+And here is the assembly code for the ``ispc``-callable instance of the
+function.
+
+::
+
+    "_foo___uptr<Uf>uptr<Uf>uptr<Uf>":
+            movmskps      %xmm0, %eax
+            cmpl          $15, %eax
+            je            LBB0_3
+            testl         %eax, %eax
+            jne           LBB0_4
+            ret
+    LBB0_3:
+            movups        (%rsi), %xmm1
+            movups        (%rdi), %xmm0
+            addps         %xmm1, %xmm0
+            movups        %xmm0, (%rdx)
+            ret
+    LBB0_4:
+    ####
+    ####  Code elided; handle mixed mask case..
+    ####
+            ret
+
+There are a few things to notice in this code.  First, the current program
+mask is coming in via the ``%xmm0`` register and the initial few
+instructions in the function essentially check to see if the mask is all on
+or all off.  If the mask is all on, the code at the label LBB0_3 executes;
+it's the same as the code that was generated for ``_foo`` above.  If the
+mask is all off, then there's nothing to be done, and the function can
+return immediately.
+
+In the case of a mixed mask, a substantial amount of code is generated to
+load from and then store to only the array elements that correspond to
+program instances where the mask is on.  (This code is elided above.)  This
+general pattern of having two code paths for the "all on" and "mixed" mask
+cases is used in the code generated for all but the simplest
+functions (where the overhead of the test isn't worthwhile).
+
+How can I more easily see gathers and scatters in generated assembly?
+---------------------------------------------------------------------
+
+Because CPU vector ISAs don't have native gather and scatter instructions,
+these memory operations are turned into sequences of
+instructions in the code that ``ispc`` generates.  In some cases, it can be
+useful to see where gathers and scatters actually happen in code; there is
+an otherwise undocumented command-line flag that provides this information.
+
+Consider this simple program:
+
+::
+
+    void set(uniform int a[], int value, int index) {
+        a[index] = value;
+    }
+
+When compiled normally to the SSE4 target, this program generates this
+extensive code sequence, which makes it more difficult to see what the
+program is actually doing.
+
+::
+
+    "_set___uptr<Ui>ii":
+            pmulld        LCPI0_0(%rip), %xmm1
+            movmskps      %xmm2, %eax
+            testb         $1, %al
+            je            LBB0_2
+            movd          %xmm1, %ecx
+            movd          %xmm0, (%rcx,%rdi)
+    LBB0_2:
+            testb         $2, %al
+            je            LBB0_4
+            pextrd        $1, %xmm1, %ecx
+            pextrd        $1, %xmm0, (%rcx,%rdi)
+    LBB0_4:
+            testb         $4, %al
+            je            LBB0_6
+            pextrd        $2, %xmm1, %ecx
+            pextrd        $2, %xmm0, (%rcx,%rdi)
+    LBB0_6:
+            testb        $8, %al
+            je            LBB0_8
+            pextrd        $3, %xmm1, %eax
+            pextrd        $3, %xmm0, (%rax,%rdi)
+    LBB0_8:
+            ret
+
+If this program is compiled with the
+``--opt=disable-handle-pseudo-memory-ops`` command-line flag, then the
+scatter is left as an unresolved function call.  The resulting program
+won't link because of the unresolved symbol, but the assembly output is much
+easier to understand:
+
+::
+
+    "_set___uptr<Ui>ii":
+            movaps        %xmm0, %xmm3
+            pmulld        LCPI0_0(%rip), %xmm1
+            movdqa        %xmm1, %xmm0
+            movaps        %xmm3, %xmm1
+            jmp        ___pseudo_scatter_base_offsets32_32 ## TAILCALL
+
+
+Running The Compiler
+====================
+
+Why is it required to use one of the "generic" targets with C++ output?
+-----------------------------------------------------------------------
+
+The C++ output option transforms the provided ``ispc`` program source into
+C++ code where each basic operation in the program (addition, comparison,
+etc.) is represented as a function call to an as-yet-undefined function,
+chaining the results of these calls together to perform the required
+computations.  It is then expected that the user will provide the
+implementation of these functions via a header file with ``inline``
+functions defined for each of these functions and then use a C++ compiler
+to generate a final object file.  (Examples of these headers include
+``examples/intrinsics/sse4.h`` and ``examples/intrinsics/knc.h`` in the
+``ispc`` distribution.)
+
+If a target other than one of the "generic" ones is used with C++ output,
+then the compiler will transform certain operations into particular code
+sequences that may not be desired for the actual final target; for example,
+SSE targets that don't have hardware "gather" instructions will transform a
+gather into a sequence of scalar load instructions.  When this in turn is
+transformed to C++ code, the fact that the loads were originally a gather
+is lost, and the header file of function definitions wouldn't have a chance
+to map the "gather" to a target-specific operation, as the ``knc.h`` header
+does, for example.  Thus, the "generic" targets exist to provide basic
+targets of various vector widths, without imposing any limitations on the
+final target's capabilities.
+
+Why won't the compiler generate an object file or assembly output with the "generic" targets?
+---------------------------------------------------------------------------------------------
+
+As described in the above FAQ entry, when compiling to the "generic"
+targets, ``ispc`` generates vector code for the source program that
+transforms every basic operation in the program (addition, comparison,
+etc.) into a separate function call.
+
+While there is no fundamental reason that the compiler couldn't generate
+target-specific object code with a function call to an undefined function
+for each primitive operation, doing so wouldn't actually be useful in
+practice--providing definitions of these functions in a separate object
+file and actually performing function calls for each of them (versus
+turning them into inline function calls) would be a highly inefficient way
+to run the program.
+
+Therefore, in the interests of encouraging the efficient use of the system,
+these types of output are disallowed.
+
+
+Language Details
+================
+
+What is the difference between "int \*foo" and "int foo[]"?
+-----------------------------------------------------------
+
+In C and C++, declaring a function to take a parameter ``int *foo`` and
+``int foo[]`` results in the same type for the parameter.  Both are
+pointers to integers.  In ``ispc``, these are different types.  The first
+one is a varying pointer to a uniform integer value in memory, while the
+second results in a uniform pointer to the start of an array of varying
+integer values in memory.
+
+To understand why the first is a varying pointer to a uniform integer,
+first recall that types without explicit rate qualifiers (``uniform``,
+``varying``, or ``soa<>``) are ``varying`` by default.  Second, recall from
+the `discussion of pointer types in the ispc User's Guide`_ that pointed-to
+types without rate qualifiers are ``uniform`` by default.  (This second
+rule is discussed further below, in `Why are pointed-to types "uniform" by
+default?`_.)  The type of ``int *foo`` follows from these.
+
+.. _discussion of pointer types in the ispc User's Guide: ispc.html#pointer-types 
+
+Conversely, in a function body, ``int foo[10]`` represents a declaration of
+a 10-element array of varying ``int`` values.  In that we'd certainly like
+to be able to pass such an array to a function that takes a ``int []``
+parameter, the natural type for an ``int []`` parameter is a uniform
+pointer to varying integer values.
+
+In terms of compatibility with C/C++, it's unfortunate that this
+distinction exists, though any other set of rules seems to introduce more
+awkwardness than this one.  (Though we're interested to hear ideas to
+improve these rules!).
+
+Why are pointed-to types "uniform" by default?
+----------------------------------------------
+
+In ``ispc``, types without rate qualifiers are "varying" by default, but
+types pointed to by pointers without rate qualifiers are "uniform" by
+default.  Why this difference?
+
+::
+
+    int foo;  // no rate qualifier, "varying int".
+    uniform int *foo;  // pointer type has no rate qualifier, pointed-to does.
+                       // "varying pointer to uniform int".
+    int *foo;  // neither pointer type nor pointed-to type ("int") have
+               // rate qualifiers. Pointer type is varying by default,
+               // pointed-to is uniform. "varying pointer to uniform int".
+    varying int *foo;   // varying pointer to varying int
+
+The first rule, having types without rate qualifiers be varying by default,
+is a default that keeps the number of "uniform" or "varying" qualifiers in
+``ispc`` programs low.  Most ``ispc`` programs use mostly "varying"
+variables, so this rule allows most variables to be declared without also
+requiring rate qualifiers.
+
+On a related note, this rule allows many C/C++ functions to be used to
+define equivalent functions in the SPMD execution model that ``ispc``
+provides with little or no modification:
+
+::
+
+    // scalar add in C/C++, SPMD/vector add in ispc
+    int add(int a, int b) { return a + b; }
+
+This motivation also explains why ``uniform int *foo`` represents a varying
+pointer; having pointers be varying by default if they don't have rate
+qualifiers similarly helps with porting code from C/C++ to ``ispc``.
+
+The trickier issue is why pointed-to types are "uniform" by default.  In our
+experience, data in memory that is accessed via pointers is most often
+uniform; this generally includes all data that has been allocated and
+initialized by the C/C++ application code. In practice, "varying" types are
+more generally (but not exclusively) used for local data in ``ispc``
+functions.  Thus, making the pointed-to type uniform by default leads to
+more concise code for the most common cases.
+
+
+What am I getting an error about assigning a varying lvalue to a reference type?
+--------------------------------------------------------------------------------
+
+Given code like the following:
+
+::
+
+    uniform float a[...];
+    int index = ...;
+    float &r = a[index];
+
+``ispc`` issues the error "Initializer for reference-type variable "r" must
+have a uniform lvalue type.".  The underlying issue stems from how
+references are represented in the code generated by ``ispc``.  Recall that
+``ispc`` supports both uniform and varying pointer types--a uniform pointer
+points to the same location in memory for all program instances in the
+gang, while a varying pointer allows each program instance to have its own
+pointer value.
+
+References are represented as a pointer in the code generated by ``ispc``,
+though this is generally opaque to the user; in ``ispc``, they are
+specifically uniform pointers.  This design decision was made so that given
+code like this:
+
+::
+
+    extern void func(float &val);
+    float foo = ...;
+    func(foo);
+
+Then the reference would be handled efficiently as a single pointer, rather
+than unnecessarily being turned into a gang-size of pointers.
+
+However, an implication of this decision is that it's not possible for
+references to refer to completely different things for each of the program
+instances.  (And hence the error that is issued).  In cases where a unique
+per-program-instance pointer is needed, a varying pointer should be used
+instead of a reference.
+
+
+Interoperability
+================
+
+How can I supply an initial execution mask in the call from the application?
+----------------------------------------------------------------------------
+
+Recall that when execution transitions from the application code to an
+``ispc`` function, all of the program instances are initially executing.
+In some cases, it may be desired that only some of them are running, based on
+a data-dependent condition computed in the application program.  This
+situation can easily be handled via an additional parameter from the
+application.
+
+As a simple example, consider a case where the application code has an
+array of ``float`` values and we'd like the ``ispc`` code to update
+just specific values in that array, where the application has determined
+which of those values are to be updated.  In C++ code, we might
+have:
+
+::
+
+    int count = ...;
+    float *array = new float[count];
+    bool *shouldUpdate = new bool[count];
+    // initialize array and shouldUpdate
+    ispc_func(array, shouldUpdate, count);
+
+Then, the ``ispc`` code could process this update as:
+
+::
+
+    export void ispc_func(uniform float array[], uniform bool update[],
+                          uniform int count) {
+        foreach (i = 0 ... count) {
+            cif (update[i] == true)
+                // update array[i]...
+        }
+    }
+
+(In this case a "coherent" if statement is likely to be worthwhile if the
+``update`` array will tend to have sections that are either all-true or
+all-false.)
+
+How can I generate a single binary executable with support for multiple instruction sets?
+-----------------------------------------------------------------------------------------
+
+``ispc`` can also generate output that supports multiple target instruction
+sets, also generating code that chooses the most appropriate one at runtime
+if multiple targets are specified with the ``--target`` command-line
+argument.
+
+For example, if you run the command:
+
+::
+
+   ispc foo.ispc -o foo.o --target=sse2,sse4-x2,avx-x2
+
+Then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
+``foo_avx.o``, and ``foo.o``.[#]_  Link all of these into your executable, and
+when you call a function in ``foo.ispc`` from your application code,
+``ispc`` will determine which instruction sets are supported by the CPU the
+code is running on and will call the most appropriate version of the
+function available.  
+
+.. [#] Similarly, if you choose to generate assembly language output or
+   LLVM bitcode output, multiple versions of those files will be created.
+
+In general, the version of the function that runs will be the one in the
+most capable instruction set that is supported by the system.  If you only
+compile SSE2 and SSE4 variants and run on a system that supports AVX, for
+example, then the SSE4 variant will be executed.  If the system
+is not able to run any of the available variants of the function (for
+example, trying to run a function that only has SSE4 and AVX variants on a
+system that only supports SSE2), then the standard library ``abort()``
+function will be called.
+
+One subtlety is that all non-static global variables (if any) must have the
+same size and layout with all of the targets used.  For example, if you
+have the global variables:
+
+::
+
+   uniform int foo[2*programCount];
+   int bar;
+
+and compile to both SSE2 and AVX targets, both of these variables will have
+different sizes (the first due to program count having the value 4 for SSE2
+and 8 for AVX, and the second due to ``varying`` types having different
+numbers of elements with the two targets--essentially the same issue as the
+first.)  ``ispc`` issues an error in this case.
+
+
+How can I determine at run-time which vector instruction set's instructions were selected to execute?
+-----------------------------------------------------------------------------------------------------
+
+``ispc`` doesn't provide any API that allows querying which vector ISA's
+instructions are running when multi-target compilation was used.  However,
+this can be solved in "user space" by writing a small helper function.
+Specifically, if you implement a function like this
+
+::
+
+    export uniform int isa() {
+    #if defined(ISPC_TARGET_SSE2)
+        return 0;
+    #elif defined(ISPC_TARGET_SSE4)
+        return 1;
+    #elif defined(ISPC_TARGET_AVX)
+        return 2;
+    #else
+        return -1;
+    #endif
+    }
+
+And then call it from your application code at runtime, it will return 0,
+1, or 2, depending on which target's instructions are running.
+
+The way this works is a little surprising, but it's a useful trick.  Of
+course the preprocessor ``#if`` checks are all compile-time only
+operations.  What's actually happening is that the function is compiled
+multiple times, once for each target, with the appropriate ``ISPC_TARGET``
+preprocessor symbol set.  Then, a small dispatch function is generated for
+the application to actually call.  This dispatch function in turn calls the
+appropriate version of the function based on the CPU of the system it's
+executing on, which in turn returns the appropriate value.
+
+In a similar fashion, it's possible to find out at run-time the value of
+``programCount`` for the target that's actually being used.
+
+::
+
+    export uniform int width() { return programCount; }
+
+
+Is it possible to inline ispc functions in C/C++ code?
+------------------------------------------------------
+
+If you're willing to use the ``clang`` C/C++ compiler that's part of the
+LLVM tool suite, then it is possible to inline ``ispc`` code with C/C++
+(and conversely, to inline C/C++ calls in ``ispc``).  Doing so can provide
+performance advantages when calling out to short functions written in the
+"other" language.  Note that you don't need to use ``clang`` to compile all
+of your C/C++ code, but only for the files where you want to be able to
+inline.  In order to do this, you must have a full installation of LLVM
+version 3.0 or later, including the ``clang`` compiler.
+
+The basic approach is to have the various compilers emit LLVM intermediate
+representation (IR) code and to then use tools from LLVM to link together
+the IR from the compilers and then re-optimize it, which gives the LLVM
+optimizer the opportunity to do additional inlining and cross-function
+optimizations.  If you have source files ``foo.ispc`` and ``foo.cpp``,
+first emit LLVM IR:
+
+::
+
+   ispc --emit-llvm -o foo_ispc.bc foo.ispc
+   clang -O2 -c -emit-llvm -o foo_cpp.bc foo.cpp
+
+Next, link the two IR files into a single file and run the LLVM optimizer
+on the result:
+
+::
+  
+    llvm-link foo_ispc.bc foo_cpp.bc -o - | opt -O3 -o foo_opt.bc
+
+And finally, generate a native object file:
+
+::
+
+   llc -filetype=obj foo_opt.bc -o foo.o
+
+This file can in turn be linked in with the rest of your object files when
+linking your application.
+
+(Note that if you're using the AVX instruction set, you must provide the
+``-mattr=+avx`` flag to ``llc``.)
+    
+
+Why is it illegal to pass "varying" values from C/C++ to ispc functions?
+------------------------------------------------------------------------
+
+If any of the types in the parameter list to an exported function is
+"varying" (including recursively, and members of structure types, etc.),
+then ``ispc`` will issue an error and refuse to compile the function:
+
+::
+
+    % echo "export int add(int x) { return ++x; }" | ispc
+    <stdin>:1:12: Error: Illegal to return a "varying" type from exported function "add"
+    <stdin>:1:20: Error: Varying parameter "x" is illegal in an exported function. 
+
+While there's no fundamental reason why this isn't possible, recall the
+definition of "varying" variables: they have one value for each program
+instance in the gang.  As such, the number of values and amount of storage
+required to represent a varying variable depends on the gang size
+(i.e. ``programCount``), which can have different values depending on the
+compilation target.
+
+``ispc`` therefore prohibits passing "varying" values between the
+application and the ``ispc`` program so that the application-side
+code cannot depend on a particular gang size, which in turn
+encourages portability to different gang sizes.  (A generally desirable
+programming practice.)
+
+For cases where the size of data is actually fixed from the application
+side, the value can be passed via a pointer to a short ``uniform`` array,
+as follows:
+
+::
+
+    export void add4(uniform int ptr[4]) {
+        foreach (i = 0 ... 4)
+            ptr[i]++;
+    }
+
+On the 4-wide SSE instruction set, this compiles to a single vector add
+instruction (and associated move instructions), while it still also
+efficiently computes the correct result on 8-wide AVX targets.
+
+
+Programming Techniques
+======================
+
+What primitives are there for communicating between SPMD program instances?
+---------------------------------------------------------------------------
+
+The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library
+routines provide a variety of mechanisms for the running program instances
+to communicate values to each other during execution.  Note that there's no
+need to synchronize the program instances before communicating between
+them, due to the synchronized execution model of gangs of program instances
+in ``ispc``.
+
+How can a gang of program instances generate variable amounts of output efficiently?
+------------------------------------------------------------------------------------
+
+It's not unusual to have a gang of program instances where each program
+instance generates a variable amount of output (perhaps some generate no
+output, some generate one output value, some generate many output values
+and so forth), and where one would like to have the output densely packed
+in an output array.  The ``exclusive_scan_add()`` function from the
+standard library is quite useful in this situation.
+
+Consider the following function:
+
+::
+
+    uniform int func(uniform float outArray[], ...) {
+       int numOut = ...;  // figure out how many to be output
+       float outLocal[MAX_OUT]; // staging area
+
+       // each program instance in the gang puts its results in
+       //  outLocal[0], ..., outLocal[numOut-1]
+
+       int startOffset = exclusive_scan_add(numOut);
+       for (int i = 0; i < numOut; ++i)
+           outArray[startOffset + i] = outLocal[i];
+       return reduce_add(numOut);
+    }
+
+Here, each program instance has computed a number, ``numOut``, of values to
+output, and has stored them in the ``outLocal`` array.  Assume that four
+program instances are running and that the first one wants to output one
+value, the second two values, and the third and fourth three values each.
+In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
+to the four program instances, respectively.  
+
+The first program instance will then write its one result to
+``outArray[0]``, the second will write its two values to ``outArray[1]``
+and ``outArray[2]``, and so forth.  The ``reduce_add()`` call at the end
+returns the total number of values that all of the program instances have
+written to the array.
+
+FIXME: add discussion of foreach_active as an option here once that's in
+
+Is it possible to use ispc for explicit vector programming?
+-----------------------------------------------------------
+
+The typical model for programming in ``ispc`` is an *implicit* parallel
+model, where one writes a program that is apparently doing scalar
+computation on values and the program is then vectorized to run in parallel
+across the SIMD lanes of a processor.  However, ``ispc`` also has some
+support for explicit vector unit programming, where the vectorization is
+explicit.  Some computations may be more effectively described in the
+explicit model rather than the implicit model.
+
+This support is provided via ``uniform`` instances of short vectors.
+Specifically, if this short program
+
+::
+
+    export uniform float<8> madd(uniform float<8> a, uniform float<8> b,
+                                 uniform float<8> c) {
+        return a + b * c;
+    }
+
+is compiled with the AVX target, ``ispc`` generates the following assembly:
+
+::
+
+    _madd:
+	vmulps	%ymm2, %ymm1, %ymm1
+	vaddps	%ymm0, %ymm1, %ymm0
+	ret
+
+(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
+``addps`` instructions are generated, and so forth.)
+
+Note that ``ispc`` doesn't currently support control-flow based on
+``uniform`` short vector types; it is thus not possible to write code like:
+
+::
+
+    export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
+        uniform int<8> sum = 0;
+        while (a++ < b)
+            ++sum;
+    }
+
+
+How can I debug my ispc programs using Valgrind?
+------------------------------------------------
+
+The `valgrind`_ memory checker is an extremely useful tool for
+Linux and OSX; it detects a range of memory errors, including accessing
+memory after it has been freed, accessing memory beyond the end of an
+array, accessing uninitialized stack variables, and so forth.
+In general, applications that use ``ispc`` code run with ``valgrind``
+without modification and ``valgrind`` will detect the same range of memory
+errors in ``ispc`` code that it does in C/C++ code.  
+
+.. _valgrind: http://valgrind.org
+
+One issue to be aware of is that until recently, ``valgrind`` only
+supported the SSE2 vector instructions; if you are using a version of
+``valgrind`` older than the 3.7.0 release (5 November 2011), you should
+compile your ``ispc`` programs with ``--target=sse2`` before running them
+through ``valgrind``.  (Note that if no target is specified, then ``ispc``
+chooses a target based on the capabilities of the system you're running
+``ispc`` on.)  If you run an ``ispc`` program that uses instructions that
+``valgrind`` doesn't support, you'll see an error message like:
+
+::
+
+    vex amd64->IR: unhandled instruction bytes: 0xC5 0xFA 0x10 0x0 0xC5 0xFA 0x11 0x84
+    ==46059== valgrind: Unrecognised instruction at address 0x100002707.
+
+The just-released valgrind 3.7.0 adds support for the SSE4.2 instruction
+set; if you're using that version (and your system supports SSE4.2), then
+you can use ``--target=sse4`` when compiling to run with ``valgrind``.
+
+Note that ``valgrind`` does not yet support programs that use the AVX
+instruction set.
+
+foreach statements generate more complex assembly than I'd expect; what's going on?
+-----------------------------------------------------------------------------------
+
+Given a simple ``foreach`` loop like the following:
+
+::
+
+    void foo(uniform float a[], uniform int count) {
+        foreach (i = 0 ... count)
+            a[i] *= 2;
+    }
+
+
+the ``ispc`` compiler generates approximately 40 instructions--why isn't
+the generated code simpler?
+
+There are two main components to the code: one handles
+``programCount``-sized chunks of elements of the array, and the other
+handles any excess elements at the end of the array that don't completely
+fill a gang.  The code for the main loop is essentially what one would
+expect: a vector of values is loaded from the array, the multiply is done,
+and the result is stored.
+
+::
+
+    LBB0_2:                                 ## %foreach_full_body
+	movslq	%edx, %rdx
+	vmovups	(%rdi,%rdx), %ymm1
+	vmulps	%ymm0, %ymm1, %ymm1
+	vmovups	%ymm1, (%rdi,%rdx)
+	addl	$32, %edx
+	addl	$8, %eax
+	cmpl	%ecx, %eax
+	jl	LBB0_2
+
+
+Then, there is a sequence of instructions that handles any additional
+elements at the end of the array.  (These instructions don't execute if
+there aren't any left-over values to process, but they do lengthen the
+amount of generated code.)
+
+::
+
+  ## BB#4:                                ## %partial_inner_only
+	vmovd	%eax, %xmm0
+	vinsertf128	$1, %xmm0, %ymm0, %ymm0
+	vpermilps	$0, %ymm0, %ymm0 ## ymm0 = ymm0[0,0,0,0,4,4,4,4]
+	vextractf128	$1, %ymm0, %xmm3
+	vmovd	%esi, %xmm2
+	vmovaps	LCPI0_1(%rip), %ymm1
+	vextractf128	$1, %ymm1, %xmm4
+	vpaddd	%xmm4, %xmm3, %xmm3
+        # ....
+	vmulps	LCPI0_0(%rip), %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm0, (%rdi,%rax)
+
+
+If you know that the number of elements to be processed will always be an
+exact multiple of 8, 16, etc., then adding a simple assignment to
+``count`` like the one below gives the compiler enough information to be
+able to eliminate the code for the additional array elements.
+
+::
+
+    void foo(uniform float a[], uniform int count) {
+        // This assignment doesn't change the value of count
+        // if it's a multiple of 16, but it gives the compiler
+        // insight into this fact, allowing for simpler code to
+        // be generated for the foreach loop.
+        count = (count & ~(16-1));
+        foreach (i = 0 ... count)
+            a[i] *= 2;
+    }
+
+With this new version of ``foo()``, only the code for the first loop above
+is generated.
+
+
+How do I launch an individual task for each active program instance?
+--------------------------------------------------------------------
+
+Recall from the `discussion of "launch" in the ispc User's Guide`_ that a
+``launch`` statement launches a single task corresponding to a single gang
+of executing program instances, where the indices of the active program
+instances are the same as were active when the ``launch`` statement
+executed.
+
+.. _discussion of "launch" in the ispc User's Guide: ispc.html#task-parallelism-launch-and-sync-statements
+
+In some situations, it's desirable to be able to launch an individual task
+for each executing program instance.  For example, we might be performing
+an iterative computation where a subset of the program instances determine
+that an item they are responsible for requires additional processing.
+
+::
+
+    bool itemNeedsMoreProcessing(int);
+    int itemNum = ...;
+    if (itemNeedsMoreProcessing(itemNum)) {
+        // do additional work 
+    }
+
+For performance reasons, it may be desirable to apply an entire gang's
+worth of computation to each item that needs additional processing;
+there may be available parallelism in this computation such that we'd like
+to process each of the items with SPMD computation.
+
+In this case, the ``foreach_active`` and ``unmasked`` constructs can be
+applied together to accomplish this goal.
+
+::
+
+    // do additional work 
+    task void doWork(uniform int index);
+    foreach_active (index) {
+        unmasked {
+            launch doWork(extract(itemNum, index)); 
+        }
+    }
+
+Recall that the body of the ``foreach_active`` loop runs once for each
+active program instance, with each active program instance's
+``programIndex`` value available in ``index`` in the above.  In the loop,
+we can re-establish an "all on" execution mask, enabling execution in all
+of the program instances in the gang, such that execution in ``doWork()``
+starts with all instances running.  (Alternatively, the ``unmasked`` block
+could be in the definition of ``doWork()``.)
+
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -0,0 +1,71 @@
+=========
+ispc News
+=========
+
+ispc 1.3.0 is Released
+----------------------
+
+A major new version of ``ispc`` has been released.  In addition to a number
+of new language features, this release notably features initial support for
+compiling to the Intel Xeon Phi (Many Integrated Core) architecture.
+
+ispc 1.2.1 is Released
+----------------------
+
+This is a bugfix release, fixing approximately 20 bugs in the system and
+improving error handling and error reporting.  New functionality includes
+very efficient float/half conversion routines thanks to Fabian 
+Giesen.  See the `1.2.1 release notes`_ for details.
+
+.. _1.2.1 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
+
+ispc 1.2.0 is Released
+-----------------------
+
+A new major release was posted on March 20, 2012.  This release includes
+significant new functionality for cleanly handling "structure of arrays"
+(SoA) data layout and a new model for how uniform and varying are handled
+with structure types.  
+
+Paper on ispc To Appear in InPar 2012
+-------------------------------------
+
+A technical paper on ``ispc``, `ispc: A SPMD Compiler for High-Performance
+CPU Programming`_, by Matt Pharr and William R. Mark, has been accepted to
+the `InPar 2012`_ conference. This paper describes a number of the design
+features and key characteristics of the ``ispc`` implementation.
+
+(© 2012 IEEE. Personal use of this material is permitted. Permission from
+IEEE must be obtained for all other uses, in any current or future media,
+including reprinting/republishing this material for advertising or
+promotional purposes, creating new collective works, for resale or
+redistribution to servers or lists, or reuse of any copyrighted component
+of this work in other works.).
+
+.. _ispc\: A SPMD Compiler for High-Performance CPU Programming: https://github.com/downloads/ispc/ispc/ispc_inpar_2012.pdf
+.. _InPar 2012: http://innovativeparallel.org/
+
+ispc 1.1.4 is Released
+----------------------
+
+On February 4, 2012, the 1.1.4 release of ``ispc`` was posted; new features
+include ``new`` and ``delete`` for dynamic memory allocation in ``ispc``
+programs, "local" atomic operations in the standard library, and a new
+scalar compilation target.  See the `1.1.4 release notes`_ for details.
+
+.. _1.1.4 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
+
+
+ispc 1.1.3 is Released
+----------------------
+
+With this release, the language now supports "switch" statements, with the same semantics and syntax as in C.
+
+This release includes fixes for two important performance related issues:
+the quality of code generated for "foreach" statements has been
+substantially improved, and a performance regression with code for "gathers"
+that was introduced in v1.1.2 has been fixed in this release.
+
+Thanks to Jean-Luc Duprat for a number of patches that improve support for
+building on various platforms, and to Pierre-Antoine Lacaze for patches so
+that ispc builds under MinGW.
--- a/docs/perf.rst
+++ b/docs/perf.rst
@@ -0,0 +1,85 @@
+===========
+Performance
+===========
+
+The SPMD programming model that ``ispc`` provides makes it easy to harness the
+computational power available in SIMD vector units on modern CPUs, while
+its basis in C makes it easy for programmers to adopt and use
+productively.  This page summarizes the performance of ``ispc`` with the
+workloads in the ``examples/`` directory of the ``ispc`` distribution.
+
+These results were measured on an Apple iMac with a 4-core 3.4GHz
+Intel® Core-i7 processor using the Intel® AVX instruction set.  The basis
+for comparison is a reference C++ implementation compiled with gcc 4.2.1,
+the version distributed with OS X 10.7.2.  (The reference implementation is
+also included in the ``examples/`` directory.)
+
+.. list-table:: Performance of ``ispc`` with a variety of the workloads
+   from the ``examples/`` directory of the ``ispc`` distribution, compared
+   to a reference C++ implementation compiled with gcc 4.2.1.
+
+  * - Workload
+    - ``ispc``, 1 core
+    - ``ispc``, 4 cores
+  * - `AOBench`_ (512 x 512 resolution)
+    - 6.19x
+    - 28.06x
+  * - `Binomial Options`_ (128k options)
+    - 7.94x
+    - 33.43x
+  * - `Black-Scholes Options`_ (128k options)
+    - 8.45x
+    - 32.48x
+  * - `Deferred Shading`_ (1280p)
+    - 5.02x
+    - 23.06x
+  * - `Mandelbrot Set`_
+    - 6.21x
+    - 20.28x
+  * - `Perlin Noise Function`_
+    - 5.37x
+    - n/a
+  * - `Ray Tracer`_ (Sponza dataset)
+    - 4.31x
+    - 20.29x
+  * - `3D Stencil`_
+    - 4.05x
+    - 15.53x
+  * - `Volume Rendering`_
+    - 3.60x
+    - 17.53x
+
+
+.. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench
+.. _Binomial Options: https://github.com/ispc/ispc/tree/master/examples/options
+.. _Black-Scholes Options: https://github.com/ispc/ispc/tree/master/examples/options
+.. _Deferred Shading: https://github.com/ispc/ispc/tree/master/examples/deferred
+.. _Mandelbrot Set: https://github.com/ispc/ispc/tree/master/examples/mandelbrot_tasks
+.. _Ray Tracer: https://github.com/ispc/ispc/tree/master/examples/rt
+.. _Perlin Noise Function: https://github.com/ispc/ispc/tree/master/examples/noise
+.. _3D Stencil: https://github.com/ispc/ispc/tree/master/examples/stencil
+.. _Volume Rendering: https://github.com/ispc/ispc/tree/master/examples/volume_rendering
+
+
+The following table shows speedups for a number of the examples on a
+2.40GHz, 40-core Intel® Xeon E7-8870 system with the Intel® SSE4
+instruction set, running Microsoft Windows Server 2008 Enterprise.  Here,
+the serial C/C++ baseline code was compiled with MSVC 2010.
+ 
+.. list-table:: Performance of ``ispc`` with a variety of the workloads
+   from the ``examples/`` directory of the ``ispc`` distribution, on 
+   a system with 40 CPU cores.
+
+  * - Workload
+    - ``ispc``, 40 cores
+  * - AOBench (2048 x 2048 resolution)
+    - 182.36x
+  * - Binomial Options (2m options)
+    - 63.85x
+  * - Black-Scholes Options (2m options)
+    - 83.97x
+  * - Ray Tracer (Sponza dataset)
+    - 195.67x
+  * - Volume Rendering
+    - 243.18x
+
--- a/docs/perfguide.rst
+++ b/docs/perfguide.rst
@@ -0,0 +1,829 @@
+==============================================
+Intel® SPMD Program Compiler Performance Guide
+==============================================
+
+The SPMD programming model provided by ``ispc`` naturally delivers
+excellent performance for many workloads thanks to efficient use of CPU
+SIMD vector hardware.  This guide provides more details about how to get
+the most out of ``ispc`` in practice.
+
+* `Key Concepts`_
+
+  + `Efficient Iteration With "foreach"`_
+  + `Improving Control Flow Coherence With "foreach_tiled"`_
+  + `Using Coherent Control Flow Constructs`_
+  + `Use "uniform" Whenever Appropriate`_
+  + `Use "Structure of Arrays" Layout When Possible`_
+
+* `Tips and Techniques`_
+
+  + `Understanding Gather and Scatter`_
+  + `Avoid 64-bit Addressing Calculations When Possible`_
+  + `Avoid Computation With 8 and 16-bit Integer Types`_
+  + `Implementing Reductions Efficiently`_
+  + `Using "foreach_active" Effectively`_
+  + `Using Low-level Vector Tricks`_
+  + `The "Fast math" Option`_
+  + `"inline" Aggressively`_
+  + `Avoid The System Math Library`_
+  + `Declare Variables In The Scope Where They're Used`_
+  + `Instrumenting ISPC Programs To Understand Runtime Behavior`_
+  + `Choosing A Target Vector Width`_
+
+* `Disclaimer and Legal Information`_
+
+* `Optimization Notice`_
+
+Key Concepts
+============
+
+This section describes the five most important concepts to understand and
+keep in mind when writing high-performance ``ispc`` programs.  It assumes
+good familiarity with the topics covered in the ``ispc`` `Users Guide`_.
+
+.. _Users Guide: ispc.html
+
+Efficient Iteration With "foreach"
+----------------------------------
+
+The ``foreach`` parallel iteration construct is semantically equivalent to
+a regular ``for()`` loop, though it offers meaningful performance benefits.
+(See the `documentation on "foreach" in the Users Guide`_ for a review of
+its syntax and semantics.)  As an example, consider this simple function
+that iterates over some number of elements in an array, doing computation
+on each one:
+
+.. _documentation on "foreach" in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
+
+::
+
+    export void foo(uniform int a[], uniform int count) {
+        for (int i = programIndex; i < count; i += programCount) {
+            // do some computation on a[i]
+        }
+    }
+
+Depending on the specifics of the computation being performed, the code
+generated for this function could likely be improved by modifying the code 
+so that the loop only goes as far through the data as is possible to pack
+an entire gang of program instances with computation each time through the
+loop.  Doing so enables the ``ispc`` compiler to generate more efficient
+code for cases where it knows that the execution mask is "all on".  Then,
+an ``if`` statement at the end handles processing the ragged extra bits of
+data that didn't fully fill a gang.
+
+::
+
+    export void foo(uniform int a[], uniform int count) {
+        // First, just loop up to the point where all program instances
+        // in the gang will be active at the loop iteration start
+        uniform int countBase = count & ~(programCount-1);
+        for (uniform int i = 0; i < countBase; i += programCount) {
+            int index = i + programIndex;
+            // do some computation on a[index]
+        }
+        // Now handle the ragged extra bits at the end
+        if (countBase < count) {
+            int index = countBase + programIndex;
+            // do some computation on a[index]
+        }
+    }
+
+While the performance of the above code will likely be better than the
+first version of the function, the loop body code has been duplicated (or
+has been forced to move into a separate utility function).
+
+Using the ``foreach`` looping construct as below provides all of the
+performance benefits of the second version of this function, with the
+compactness of the first.
+
+::
+
+    export void foo(uniform int a[], uniform int count) {
+        foreach (i = 0 ... count) {
+            // do some computation on a[i]
+        }
+    }
+
+Improving Control Flow Coherence With "foreach_tiled"
+-----------------------------------------------------
+
+Depending on the computation being performed, ``foreach_tiled`` may give
+better performance than ``foreach``.  (See the `documentation in the Users
+Guide`_ for the syntax and semantics of ``foreach_tiled``.)  Given a
+multi-dimensional iteration like:
+
+.. _documentation in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
+
+::
+
+    foreach (i = 0 ... width, j = 0 ... height) {
+        // do computation on element (i,j)
+    }
+
+if the ``foreach`` statement is used, elements in the gang of program
+instances will be mapped to values of ``i`` and ``j`` by taking spans of
+``programCount`` elements across ``i`` with a single value of ``j``.  For
+example, the ``foreach`` statement above roughly corresponds to:
+
+::
+
+    for (uniform int j = 0; j < height; ++j)
+        for (int i = 0; i < width; i += programCount) {
+            // do computation 
+    }
+
+When a multi-dimensional domain is being iterated over, the ``foreach_tiled``
+statement maps program instances to data in a way that tries to select
+square n-dimensional segments of the domain.  For example, on a compilation
+target with 8-wide gangs of program instances, it generates code that
+iterates over the domain the same way as the following code (though more
+efficiently):
+
+::
+
+    for (int j = programIndex/4; j < height; j += 2)
+        for (int i = programIndex%4; i < width; i += 4) {
+            // do computation 
+    }
+
+Thus, each gang of program instances operates on a 2x4 tile of the domain.
+With higher-dimensional iteration and different gang sizes, a similar
+mapping is performed--e.g. for 2D iteration with a 16-wide gang size, 4x4
+tiles are iterated over; for 4D iteration with an 8-wide gang, 1x2x2x2 tiles are
+processed, and so forth.  
+
+Performance benefit can come from using ``foreach_tiled`` in that it
+essentially optimizes for the benefit of iterating over *compact* regions
+of the domain (while ``foreach`` iterates over the domain in a way that
+generally allows linear memory access.)  There are two benefits from
+processing compact regions of the domain.  
+
+First, it's often the case that the control flow coherence of the program
+instances in the gang is improved; if data-dependent control flow decisions
+are related to the values of the data in the domain being processed, and if
+the data values have some coherence, iterating with compact regions will
+improve control flow coherence.
+
+Second, processing compact regions may mean that the data accessed by
+program instances in the gang is more coherent, leading to performance
+benefits from better cache hit rates.
+
+As a concrete example, for the ray tracer example in the ``ispc``
+distribution (in the ``examples/rt`` directory), performance is 20% better
+when the pixels are iterated over using ``foreach_tiled`` than ``foreach``,
+because more coherent regions of the scene are accessed by the set of rays
+in the gang of program instances.
+
+
+Using Coherent Control Flow Constructs
+--------------------------------------
+
+Recall from the ``ispc`` Users Guide, in the `SPMD-on-SIMD Execution Model
+section`_ that ``if`` statements with a ``uniform`` test compile to more
+efficient code than ``if`` statements with varying tests.  The coherent ``cif``
+statement can provide many benefits of ``if`` with a uniform test in the
+case where the test is actually varying.
+
+.. _SPMD-on-SIMD Execution Model section: ispc.html#the-spmd-on-simd-execution-model
+
+In this case, the code the compiler generates for the ``cif``
+test is along the lines of the following pseudo-code:
+
+::
+
+   bool expr = /* evaluate cif condition */
+   if (all(expr)) {
+       // run "true" case of if test only
+   } else if (!any(expr)) {
+       // run "false" case of if test only
+   } else {
+       // run both true and false cases, updating mask appropriately
+   }
+
+For ``if`` statements where the different running SPMD program instances
+don't have coherent values for the boolean ``if`` test, using ``cif``
+introduces some additional overhead from the ``all`` and ``any`` tests as
+well as the corresponding branches.  For cases where the program
+instances often do compute the same boolean value, this overhead is
+worthwhile.  If the control flow is in fact usually incoherent, this
+overhead only costs performance.
+
+In a similar fashion, ``ispc`` provides ``cfor``, ``cwhile``, and ``cdo``
+statements.  These statements are semantically the same as the
+corresponding non-"c"-prefixed statements.
+
+Use "uniform" Whenever Appropriate
+----------------------------------
+
+For any variable that will always have the same value across all of the
+program instances in a gang, declare the variable with the  ``uniform``
+qualifier.  Doing so enables the ``ispc`` compiler to emit better code in
+many different ways.
+
+As a simple example, consider a ``for`` loop that always does the same
+number of iterations:
+
+::
+
+    for (int i = 0; i < 10; ++i)
+        // do something ten times
+
+If this is written with ``i`` as a ``varying`` variable, as above, there's
+additional overhead in the code generated for the loop as the compiler
+emits instructions to handle the possibility of not all program instances
+following the same control flow path (as might be the case if the loop
+limit, 10, was itself a ``varying`` value.)
+
+If the above loop is instead written with ``i`` ``uniform``, as:
+
+::
+
+    for (uniform int i = 0; i < 10; ++i)
+        // do something ten times
+
+Then better code can be generated (and the loop possibly unrolled).
+
+In some cases, the compiler may be able to detect simple cases like these,
+but it's always best to provide the compiler with as much help as possible
+to understand the actual form of your computation.
+
+
+Use "Structure of Arrays" Layout When Possible
+----------------------------------------------
+
+In general, memory access performance (for both reads and writes) is best
+when the running program instances access a contiguous region of memory; in
+this case efficient vector load and store instructions can often be used
+rather than gathers and scatters.  As an example of this issue, consider an
+array of a simple point datatype laid out and accessed in conventional
+"array of structures" (AOS) layout:
+
+::
+
+    struct Point { float x, y, z; };
+    uniform Point pts[...];
+    float v = pts[programIndex].x;
+
+In the above code, the access to ``pts[programIndex].x`` accesses
+non-sequential memory locations, due to the ``y`` and ``z`` values between
+the desired ``x`` values in memory.  A "gather" is required to get the
+value of ``v``, with a corresponding decrease in performance.
+
+If ``Point`` was defined as a "structure of arrays" (SOA) type, the access
+can be much more efficient:
+
+::
+
+    struct Point8 { float x[8], y[8], z[8]; };
+    uniform Point8 pts8[...];
+    int majorIndex = programIndex / 8;
+    int minorIndex = programIndex % 8;
+    float v = pts8[majorIndex].x[minorIndex];
+
+In this case, each ``Point8`` has 8 ``x`` values contiguous in memory
+before 8 ``y`` values and then 8 ``z`` values.  If the gang size is 8 or
+less, the access for ``v`` will have the same value of ``majorIndex`` for
+all program instances and will access consecutive elements of the ``x[8]``
+array with a vector load.  (For larger gang sizes, two 8-wide vector loads
+would be issued, which is also quite efficient.)
+
+However, the syntax in the above code is messy; accessing SOA data in this
+fashion is much less elegant than the corresponding code for accessing the
+data with AOS layout.  The ``soa`` qualifier in ``ispc`` can be used to
+cause the corresponding transformation to be made to the ``Point`` type,
+while preserving the clean syntax for data access that comes with AOS
+layout:
+
+::
+
+    soa<8> Point pts[...]; 
+    float v = pts[programIndex].x;
+
+Thanks to SOA layout being a first-class concept in the language's type
+system, it's easy to write functions that convert data between the
+layouts.  For example, the ``aos_to_soa`` function below converts ``count``
+elements of the given ``Point`` type from AOS to 8-wide SOA layout.  (It
+assumes that the caller has pre-allocated sufficient space in the
+``pts_soa`` output array.)
+
+::
+
+    void aos_to_soa(uniform Point pts_aos[], uniform int count,
+                    soa<8> Point pts_soa[]) {
+         foreach (i = 0 ... count)
+             pts_soa[i] = pts_aos[i];
+    }
+
+Analogously, a function could be written to convert back from SOA to AOS if
+needed.
+
+
+Tips and Techniques
+===================
+
+This section introduces a number of additional techniques that are worth
+keeping in mind when writing ``ispc`` programs.
+
+Understanding Gather and Scatter
+--------------------------------
+
+Memory reads and writes from the program instances in a gang that access
+irregular memory locations (rather than a consecutive set of locations, or
+a single location) can be relatively inefficient.  As an example, consider
+the "simple" array indexing calculation below:
+
+::
+
+    int i = ....;
+    uniform float x[10] = { ... };
+    float f = x[i];
+
+Since the index ``i`` is a varying value, the program instances in the gang
+will in general be reading different locations in the array ``x``.  Because
+current CPUs do not have a "gather" instruction, the ``ispc`` compiler has to
+serialize these memory reads, performing a separate memory load for each
+running program instance, packing the result into ``f``.  (The analogous
+case happens for a write into ``x[i]``.)
+
+In many cases, gathers like these are unavoidable; the program instances
+just need to access incoherent memory locations.  However, if the array
+index ``i`` actually has the same value for all of the program instances or
+if it represents an access to a consecutive set of array locations, much
+more efficient load and store instructions can be generated instead of
+gathers and scatters, respectively.
+
+In many cases, the ``ispc`` compiler is able to deduce that the memory
+locations accessed by a varying index are either all the same or form a
+consecutive sequence.  For example, given:
+
+::
+
+  uniform int x = ...;
+  int y = x;
+  return array[y];
+
+The compiler is able to determine that all of the program instances are
+loading from the same location, even though ``y`` is not a ``uniform``
+variable.  In this case, the compiler will transform this load to a regular
+vector load, rather than a general gather.
+
+Sometimes the running program instances will access a linear sequence of
+memory locations; this happens most frequently when array indexing is done
+based on the built-in ``programIndex`` variable.  In many of these cases,
+the compiler is also able to detect this case and then do a vector load.
+For example, given:
+
+::
+
+    for (int i = programIndex; i < count; i += programCount)
+      // process array[i];
+
+Regular vector loads and stores are issued for accesses to ``array[i]``.
+
+In both of these cases, the compiler is able to determine the memory access
+pattern statically, at compile time.  It's often the case that this
+determination can't be made at compile time, even though the index does
+turn out to have the same value for all program instances at run time.  The
+``reduce_equal()`` function from the standard library can be used in this
+case; it checks to see if the given value is the same across all of the
+running program instances, returning true and its ``uniform`` value if so.
+
+The following function shows the use of ``reduce_equal()`` to check for an
+equal index at execution time and then either do a scalar load and
+broadcast or a general gather.
+
+::
+
+    uniform float array[..] = { ... };
+    float value;
+    int i = ...;
+    uniform int ui;
+    if (reduce_equal(i, &ui) == true)
+        value = array[ui]; // scalar load + broadcast
+    else
+        value = array[i];  // gather
+
+For a simple case like the one above, the overhead of doing the
+``reduce_equal()`` check is likely not worthwhile compared to just always
+doing a gather.  In more complex cases, where a number of accesses are done
+based on the index, it can be worth doing.  See the example
+``examples/volume_rendering`` in the ``ispc`` distribution for the use of
+this technique in an instance where it is beneficial to performance.
+
+Understanding Memory Read Coalescing
+------------------------------------
+
+XXXX todo
+
+
+Avoid 64-bit Addressing Calculations When Possible
+--------------------------------------------------
+
+Even when compiling to a 64-bit architecture target, ``ispc`` does many of
+the addressing calculations in 32-bit precision by default--this behavior
+can be overridden with the ``--addressing=64`` command-line argument.  This
+option should only be used if it's necessary to be able to address over 4GB
+of memory in the ``ispc`` code, as it essentially doubles the cost of
+memory addressing calculations in the generated code.
+
+Avoid Computation With 8 and 16-bit Integer Types
+-------------------------------------------------
+
+The code generated for 8 and 16-bit integer types is generally not as
+efficient as the code generated for 32-bit integer types.  It is generally
+worthwhile to use 32-bit integer types for intermediate computations, even
+if the final result will be stored in a smaller integer type.
+
+Implementing Reductions Efficiently
+-----------------------------------
+
+It's often necessary to compute a reduction over a data set--for example,
+one might want to add all of the values in an array, compute their minimum,
+etc.  ``ispc`` provides a few capabilities that make it easy to efficiently
+compute reductions like these.  However, it's important to use these
+capabilities appropriately for best results.
+
+As an example, consider the task of computing the sum of all of the values
+in an array.  In C code, we might have:
+
+::
+
+    /* C implementation of a sum reduction */
+    float sum(const float array[], int count) {
+        float sum = 0;
+        for (int i = 0; i < count; ++i)
+            sum += array[i];
+        return sum;
+    } 
+
+Exactly this computation could also be expressed as a purely uniform
+computation in ``ispc``, though without any benefit from vectorization:
+
+::
+
+    /* inefficient ispc implementation of a sum reduction */
+    uniform float sum(const uniform float array[], uniform int count) {
+        uniform float sum = 0;
+        for (uniform int i = 0; i < count; ++i)
+            sum += array[i];
+        return sum;
+    } 
+
+As a first try, one might try using the ``reduce_add()`` function from the
+``ispc`` standard library; it takes a ``varying`` value and returns the sum
+of that value across all of the active program instances.
+
+::
+
+    /* inefficient ispc implementation of a sum reduction */
+    uniform float sum(const uniform float array[], uniform int count) {
+        uniform float sum = 0;
+        foreach (i = 0 ... count)
+            sum += reduce_add(array[i]);
+        return sum;
+    } 
+
+This implementation loads a gang's worth of values from the array, one for
+each of the program instances, and then uses ``reduce_add()`` to reduce
+across the program instances and then update the sum.  Unfortunately this
+approach loses most benefit from vectorization, as it does more work on the
+cross-program instance ``reduce_add()`` call than it saves from the vector
+load of values.
+
+The most efficient approach is to do the reduction in two phases: rather
+than using a ``uniform`` variable to store the sum, we maintain a varying
+value, such that each program instance is effectively computing a local
+partial sum on the subset of array values that it has loaded from the
+array.  When the loop over array elements concludes, a single call to
+``reduce_add()`` computes the final reduction across each of the program
+instances' elements of ``sum``.  This approach effectively compiles to a
+single vector load and a single vector add for each loop iteration's worth
+of values--very efficient code in the end.
+
+::
+
+    /* good ispc implementation of a sum reduction */
+    uniform float sum(const uniform float array[], uniform int count) {
+        float sum = 0;
+        foreach (i = 0 ... count)
+            sum += array[i];
+        return reduce_add(sum);
+    } 
+
+Using "foreach_active" Effectively
+----------------------------------
+
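+For high-performance code, it can be worthwhile to help the compiler
+generate more efficient code for the body of ``foreach_active`` loops.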
+
+For example, consider this segment of code, from the introduction of
+``foreach_active`` in the ispc User's Guide:
+
+::
+
+    uniform float array[...] = { ... };    
+    int index = ...;
+    foreach_active (i) {
+        ++array[index];
+    }  
+
+Here, ``index`` was assumed to possibly have the same value for multiple
+program instances, so the updates to ``array[index]`` are serialized by the
+``foreach_active`` statement in order to not have undefined results when
+``index`` values do collide.
+
+The code generated by the compiler can be improved  in this case by making
+it clear that only a single element of the array is accessed by
+``array[index]`` and that thus a general gather or scatter isn't required.
+Specifically, by using the ``extract()`` function from the standard library
+to extract the current program instance's value of ``index`` into a
+``uniform`` variable and then using that to index into ``array``, as below,
+more efficient code is generated.
+
+::
+
+    foreach_active (instanceNum) {
+        uniform int unifIndex = extract(index, instanceNum);
+        ++array[unifIndex];
+    }
+
+
+Using Low-level Vector Tricks
+-----------------------------
+
+Many low-level Intel® SSE and AVX coding constructs can be implemented in
+``ispc`` code.  The ``ispc`` standard library functions ``intbits()`` and
+``floatbits()`` are often useful in this context.  Recall that
+``intbits()`` takes a ``float`` value and returns it as an integer where
+the bits of the integer are the same as the bit representation in memory of
+the ``float``.  (In other words, it does *not* perform a floating-point to
+integer conversion.)  ``floatbits()``, then, performs the inverse
+computation.
+
+As an example of the use of these functions, the following code efficiently
+reverses the sign of the given values.
+
+::
+
+  float flipsign(float a) {
+      unsigned int i = intbits(a);
+      i ^= 0x80000000;
+      return floatbits(i);
+  }
+
+This code compiles down to a single XOR instruction.
+
+The "Fast math" Option
+----------------------
+
+``ispc`` has a ``--opt=fast-math`` command-line flag that enables a number of
+optimizations that may be undesirable in code where numerical precision is
+critically important.  For many graphics applications, for example, the
+approximations introduced may be acceptable, however.  The following two
+optimizations are performed when ``--opt=fast-math`` is used.  By default, the
+``--opt=fast-math`` flag is off.
+
+* Expressions like ``x / y``, where ``y`` is a compile-time constant, are
+  transformed to ``x * (1./y)``, where the inverse value of ``y`` is
+  precomputed at compile time.
+
+* Expressions like ``x / y``, where ``y`` is not a compile-time constant,
+  are transformed to ``x * rcp(y)``, where ``rcp()`` maps to the
+  approximate reciprocal instruction from the ``ispc`` standard library.
+
+
+"inline" Aggressively
+---------------------
+
+Inlining functions aggressively is generally beneficial for performance
+with ``ispc``.  Definitely use the ``inline`` qualifier for any short
+functions (a few lines long), and experiment with it for longer functions.
+
+Avoid The System Math Library
+-----------------------------
+
+The default math library for transcendentals and the like that ``ispc`` uses
+has higher error than the system's math library, though it is much more efficient
+due to being vectorized across the program instances and due to the fact
+that the functions can be inlined in the final code.  (It generally has
+errors in the range of 10ulps, while the system math library generally has
+no more than 1ulp of error for transcendentals.)
+
+If the ``--math-lib=system`` command-line option is used when compiling an
+``ispc`` program, then calls to the system math library will be generated
+instead.  This option should only be used if the higher precision is
+absolutely required as the performance impact of using it can be
+significant.
+
+Declare Variables In The Scope Where They're Used
+-------------------------------------------------
+
+Performance is slightly improved by declaring variables at the same block
+scope where they are first used.  For example, in code like the
+following, if the lifetime of ``foo`` is only within the scope of the
+``if`` clause, write the code like this:  
+
+::
+
+    float func() {
+        ....
+        if (x < y) {
+            float foo;
+            ... use foo ...
+        }
+    }
+
+Try not to write code as:
+
+::
+
+    float func() {
+        float foo;
+        ....
+        if (x < y) {
+            ... use foo ...
+        }
+    }
+
+Doing so can reduce the amount of masked store instructions that the
+compiler needs to generate.
+
+Instrumenting ISPC Programs To Understand Runtime Behavior
+----------------------------------------------------------
+
+``ispc`` has an optional instrumentation feature that can help you
+understand performance issues.  If a program is compiled using the
+``--instrument`` flag, the compiler emits calls to a function with the
+following signature at various points in the program (for
+example, at interesting points in the control flow, when scatters or
+gathers happen.)
+
+::
+
+    extern "C" {
+        void ISPCInstrument(const char *fn, const char *note, 
+                            int line, uint64_t mask);
+    }
+
+This function is passed the file name of the ``ispc`` file running, a short
+note indicating what is happening, the line number in the source file, and
+the current mask of active program instances in the gang.  You must provide an
+implementation of this function and link it in with your application.
+
+For example, when the ``ispc`` program runs, this function might be called
+as follows:
+
+::
+
+   ISPCInstrument("foo.ispc", "function entry", 55, 0xfull);
+
+This call indicates that the currently executing program has just
+entered the function defined at line 55 of the file ``foo.ispc``, with a
+mask of all lanes currently executing (assuming a four-wide gang size
+target machine).
+
+For a fuller example of the utility of this functionality, see
+``examples/aobench_instrumented`` in the ``ispc`` distribution.  This
+example includes an implementation of the ``ISPCInstrument()`` function
+that collects aggregate data about the program's execution behavior.
+
+When running this example, you will want to direct the ``ao`` executable
+to generate a low resolution image, because the instrumentation adds
+substantial execution overhead.  For example:
+
+::
+
+    % ./ao 1 32 32
+
+After the ``ao`` program exits, a summary report along the following lines
+will be printed.  In the first few lines, you can see how many times a few
+functions were called, and the average percentage of SIMD lanes that were
+active upon function entry.
+
+:: 
+
+    ao.ispc(0067) - function entry: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
+    ao.ispc(0067) - return: uniform control flow: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
+    ao.ispc(0071) - function entry: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
+    ao.ispc(0075) - return: uniform control flow: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
+    ao.ispc(0079) - function entry: 10072 calls (0 / 0.00% all off!), 45.09% active lanes
+    ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
+    ...
+
+
+Choosing A Target Vector Width
+------------------------------
+
+By default, ``ispc`` compiles to the natural vector width of the target
+instruction set.  For example, for SSE2 and SSE4, it compiles four-wide,
+and for AVX, it compiles 8-wide.  For some programs, higher performance may
+be seen if the program is compiled to a doubled vector width--8-wide for
+SSE and 16-wide for AVX.  
+
+For workloads that don't require many registers, this method can lead to
+significantly more efficient execution thanks to greater instruction level
+parallelism and amortization of various overhead over more program
+instances.  For other workloads, it may lead to a slowdown due to higher
+register pressure; trying both approaches for key kernels may be
+worthwhile.
+
+This option is only available for each of the SSE2, SSE4 and AVX targets.
+It is selected with the ``--target=sse2-x2``, ``--target=sse4-x2`` and
+``--target=avx-x2`` options, respectively.
+
+
+Disclaimer and Legal Information
+================================
+
+INFORMATION IN THIS DOCUMENT IS PROVIDED IN CONNECTION WITH INTEL(R) PRODUCTS.
+NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL
+PROPERTY RIGHTS IS GRANTED BY THIS DOCUMENT. EXCEPT AS PROVIDED IN INTEL'S TERMS
+AND CONDITIONS OF SALE FOR SUCH PRODUCTS, INTEL ASSUMES NO LIABILITY WHATSOEVER,
+AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY, RELATING TO SALE AND/OR USE
+OF INTEL PRODUCTS INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A
+PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT
+OR OTHER INTELLECTUAL PROPERTY RIGHT.
+
+UNLESS OTHERWISE AGREED IN WRITING BY INTEL, THE INTEL PRODUCTS ARE NOT DESIGNED
+NOR INTENDED FOR ANY APPLICATION IN WHICH THE FAILURE OF THE INTEL PRODUCT COULD
+CREATE A SITUATION WHERE PERSONAL INJURY OR DEATH MAY OCCUR.
+
+Intel may make changes to specifications and product descriptions at any time,
+without notice. Designers must not rely on the absence or characteristics of any
+features or instructions marked "reserved" or "undefined." Intel reserves these
+for future definition and shall have no responsibility whatsoever for conflicts
+or incompatibilities arising from future changes to them. The information here
+is subject to change without notice. Do not finalize a design with this
+information.
+
+The products described in this document may contain design defects or errors
+known as errata which may cause the product to deviate from published
+specifications. Current characterized errata are available on request.
+
+Contact your local Intel sales office or your distributor to obtain the latest
+specifications and before placing your product order.
+
+Copies of documents which have an order number and are referenced in this
+document, or other Intel literature, may be obtained by calling 1-800-548-4725,
+or by visiting Intel's Web Site.
+
+Intel processor numbers are not a measure of performance. Processor numbers
+differentiate features within each processor family, not across different
+processor families. See http://www.intel.com/products/processor_number for
+details.
+
+BunnyPeople, Celeron, Celeron Inside, Centrino, Centrino Atom,
+Centrino Atom Inside, Centrino Inside, Centrino logo, Core Inside, FlashFile,
+i960, InstantIP, Intel, Intel logo, Intel386, Intel486, IntelDX2, IntelDX4,
+IntelSX2, Intel Atom, Intel Atom Inside, Intel Core, Intel Inside,
+Intel Inside logo, Intel. Leap ahead., Intel. Leap ahead. logo, Intel NetBurst,
+Intel NetMerge, Intel NetStructure, Intel SingleDriver, Intel SpeedStep,
+Intel StrataFlash, Intel Viiv, Intel vPro, Intel XScale, Itanium,
+Itanium Inside, MCS, MMX, Oplus, OverDrive, PDCharm, Pentium, Pentium Inside,
+skoool, Sound Mark, The Journey Inside, Viiv Inside, vPro Inside, VTune, Xeon,
+and Xeon Inside are trademarks of Intel Corporation in the U.S. and other
+countries.
+
+* Other names and brands may be claimed as the property of others.
+
+Copyright(C) 2011, Intel Corporation. All rights reserved.
+
+
+Optimization Notice
+===================
+
+Intel compilers, associated libraries and associated development tools may
+include or utilize options that optimize for instruction sets that are
+available in both Intel and non-Intel microprocessors (for example SIMD
+instruction sets), but do not optimize equally for non-Intel
+microprocessors.  In addition, certain compiler options for Intel
+compilers, including some that are not specific to Intel
+micro-architecture, are reserved for Intel microprocessors.  For a detailed
+description of Intel compiler options, including the instruction sets and
+specific microprocessors they implicate, please refer to the "Intel
+Compiler User and Reference Guides" under "Compiler Options."  Many library
+routines that are part of Intel compiler products are more highly optimized
+for Intel microprocessors than for other microprocessors.  While the
+compilers and libraries in Intel compiler products offer optimizations for
+both Intel and Intel-compatible microprocessors, depending on the options
+you select, your code and other factors, you likely will get extra
+performance on Intel microprocessors.
+
+Intel compilers, associated libraries and associated development tools may
+or may not optimize to the same degree for non-Intel microprocessors for
+optimizations that are not unique to Intel microprocessors.  These
+optimizations include Intel® Streaming SIMD Extensions 2 (Intel® SSE2),
+Intel® Streaming SIMD Extensions 3 (Intel® SSE3), and Supplemental
+Streaming SIMD Extensions 3 (Intel SSSE3) instruction sets and other
+optimizations.  Intel does not guarantee the availability, functionality,
+or effectiveness of any optimization on microprocessors not manufactured by
+Intel.  Microprocessor-dependent optimizations in this product are intended
+for use with Intel microprocessors.
+
+While Intel believes our compilers and libraries are excellent choices to
+assist in obtaining the best performance on Intel and non-Intel
+microprocessors, Intel recommends that you evaluate other compilers and
+libraries to determine which best meet your requirements.  We hope to win
+your business by striving to offer the best performance of any compiler or
+library; please let us know if you find we do not.
+
--- a/docs/template-news.txt
+++ b/docs/template-news.txt
@@ -0,0 +1,66 @@
+%(head_prefix)s
+%(head)s
+<script type="text/javascript">
+
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-1486404-4']);
+  _gaq.push(['_trackPageview']);
+
+  (function() {
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+  })();
+
+</script>
+%(stylesheet)s
+%(body_prefix)s
+<div id="wrap">
+  <div id="wrap2">
+    <div id="header">
+      <h1 id="logo">Intel SPMD Program Compiler</h1>
+      <div id="slogan">An open-source compiler for high-performance SIMD programming on
+      the CPU</div>
+    </div>
+    <div id="nav">
+      <div id="nbar">
+        <ul>
+          <li><a href="index.html">Overview</a></li>
+          <li id="selected"><a href="news.html">News</a></li>
+          <li><a href="features.html">Features</a></li>
+          <li><a href="downloads.html">Downloads</a></li>
+          <li><a href="documentation.html">Documentation</a></li>
+          <li><a href="perf.html">Performance</a></li>
+          <li><a href="contrib.html">Contributors</a></li>
+        </ul>
+      </div>
+    </div>
+    <div id="content-wrap">
+      <div id="sidebar">
+          <div class="widgetspace">
+            <h1>Resources</h1>
+            <ul class="menu">
+              <li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
+              <li><a href="http://groups.google.com/group/ispc-users/">ispc
+              users mailing list</a></li>
+              <li><a href="http://groups.google.com/group/ispc-dev/">ispc
+              developers mailing list</a></li>
+              <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
+              <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
+              <li><a href="doxygen/index.html">Doxygen</a></li>
+            </ul>
+        </div>
+      </div>
+%(body_pre_docinfo)s
+%(docinfo)s
+<div id="content">
+%(body)s
+</div>
+    <div class="clearfix"></div>
+    <div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue  by: <a href="http://www.themebin.com/">ThemeBin</a>
+      <!-- Please Do Not remove this link, thank u -->
+      </div>
+      </div>
+      </div>
+      </div>
+%(body_suffix)s
--- a/docs/template-perf.txt
+++ b/docs/template-perf.txt
@@ -0,0 +1,66 @@
+%(head_prefix)s
+%(head)s
+<script type="text/javascript">
+
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-1486404-4']);
+  _gaq.push(['_trackPageview']);
+
+  (function() {
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+  })();
+
+</script>
+%(stylesheet)s
+%(body_prefix)s
+<div id="wrap">
+  <div id="wrap2">
+    <div id="header">
+      <h1 id="logo">Intel SPMD Program Compiler</h1>
+      <div id="slogan">An open-source compiler for high-performance SIMD programming on
+      the CPU</div>
+    </div>
+    <div id="nav">
+      <div id="nbar">
+        <ul>
+          <li><a href="index.html">Overview</a></li>
+          <li><a href="news.html">News</a></li>
+          <li><a href="features.html">Features</a></li>
+          <li><a href="downloads.html">Downloads</a></li>
+          <li><a href="documentation.html">Documentation</a></li>
+          <li id="selected"><a href="perf.html">Performance</a></li>
+          <li><a href="contrib.html">Contributors</a></li>
+        </ul>
+      </div>
+    </div>
+    <div id="content-wrap">
+      <div id="sidebar">
+          <div class="widgetspace">
+            <h1>Resources</h1>
+            <ul class="menu">
+              <li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
+              <li><a href="http://groups.google.com/group/ispc-users/">ispc
+              users mailing list</a></li>
+              <li><a href="http://groups.google.com/group/ispc-dev/">ispc
+              developers mailing list</a></li>
+              <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
+              <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
+              <li><a href="doxygen/index.html">Doxygen</a></li>
+            </ul>
+        </div>
+      </div>
+%(body_pre_docinfo)s
+%(docinfo)s
+<div id="content">
+%(body)s
+</div>
+    <div class="clearfix"></div>
+    <div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue  by: <a href="http://www.themebin.com/">ThemeBin</a>
+      <!-- Please Do Not remove this link, thank u -->
+      </div>
+      </div>
+      </div>
+      </div>
+%(body_suffix)s
--- a/docs/template.txt
+++ b/docs/template.txt
@@ -0,0 +1,66 @@
+%(head_prefix)s
+%(head)s
+<script type="text/javascript">
+
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-1486404-4']);
+  _gaq.push(['_trackPageview']);
+
+  (function() {
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+  })();
+
+</script>
+%(stylesheet)s
+%(body_prefix)s
+<div id="wrap">
+  <div id="wrap2">
+    <div id="header">
+      <h1 id="logo">Intel SPMD Program Compiler</h1>
+      <div id="slogan">An open-source compiler for high-performance SIMD programming on
+      the CPU</div>
+    </div>
+    <div id="nav">
+      <div id="nbar">
+        <ul>
+          <li><a href="index.html">Overview</a></li>
+          <li><a href="news.html">News</a></li>
+          <li><a href="features.html">Features</a></li>
+          <li><a href="downloads.html">Downloads</a></li>
+          <li id="selected"><a href="documentation.html">Documentation</a></li>
+          <li><a href="perf.html">Performance</a></li>
+          <li><a href="contrib.html">Contributors</a></li>
+        </ul>
+      </div>
+    </div>
+    <div id="content-wrap">
+      <div id="sidebar">
+          <div class="widgetspace">
+            <h1>Resources</h1>
+            <ul class="menu">
+              <li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
+              <li><a href="http://groups.google.com/group/ispc-users/">ispc
+              users mailing list</a></li>
+              <li><a href="http://groups.google.com/group/ispc-dev/">ispc
+              developers mailing list</a></li>
+              <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
+              <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
+              <li><a href="doxygen/index.html">Doxygen</a></li>
+            </ul>
+        </div>
+      </div>
+%(body_pre_docinfo)s
+%(docinfo)s
+<div id="content">
+%(body)s
+</div>
+    <div class="clearfix"></div>
+    <div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue  by: <a href="http://www.themebin.com/">ThemeBin</a>
+      <!-- Please Do Not remove this link, thank u -->
+      </div>
+      </div>
+      </div>
+      </div>
+%(body_suffix)s
--- a/doxygen.cfg
+++ b/doxygen.cfg
--- a/examples/README.txt
+++ b/examples/README.txt
@@ -0,0 +1,162 @@
+====================
+ISPC Examples README
+====================
+
+This directory has a number of sample ispc programs.  Before building them
+(on any system), install the appropriate ispc compiler binary into a
+directory in your path.  Then, if you're running Windows, open the
+"examples.sln" file and build from there.  For building under Linux/OSX,
+there are makefiles in each directory that build the examples individually.
+
+Almost all of them benchmark ispc implementations of the given computation
+against regular serial C++ implementations, printing out a comparison of
+the runtimes and the speedup delivered by ispc.  It may be instructive to
+do a side-by-side diff of the C++ and ispc implementations of these
+algorithms to learn more about writing ispc code.
+
+ 
+AOBench
+=======
+
+This is an ISPC implementation of the "AO bench" benchmark
+(http://syoyo.wordpress.com/2009/01/26/ao-bench-is-evolving/).  The command
+line arguments are:
+
+ao (num iterations) (xres) (yres)
+
+It executes the program for the given number of iterations, rendering an
+(xres x yres) image each time and measuring the computation time with both
+serial and ispc implementations.
+
+
+AOBench_Instrumented
+====================
+
+This version of AO Bench is compiled with the --instrument ispc compiler
+flag.  This causes the compiler to emit calls to a (user-supplied)
+ISPCInstrument() function at interesting places in the compiled code.  An
+example implementation of this function that counts the number of times the
+callback is made and records some statistics about control flow coherence
+is provided in the instrument.cpp file.
+
+
+Deferred
+========
+
+This is an extensive example of using ispc for efficient
+deferred shading of scenes with thousands of lights; it's an implementation
+of the algorithm that Johan Andersson described at SIGGRAPH 2009,
+implemented by Andrew Lauritzen and Jefferson Montgomery.  The basic idea
+is that a pre-rendered G-buffer is partitioned into tiles, and in each
+tile, the set of lights that contribute to the tile is first computed.
+Then, the pixels in the tile are shaded using just those light
+sources. (See slides 19-29 of
+http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
+for more details on the algorithm.)
+
+This directory includes three implementations of the algorithm:
+
+- An ispc implementation that first does a static partitioning of the
+  screen into tiles to parallelize across the CPU cores.  Within each tile
+  ispc kernels provide highly efficient implementations of the light
+  culling and shading calculations.
+- A "best practices" serial C++ implementation.  This implementation does a
+  dynamic partitioning of the screen, refining tiles with significant Z
+  depth complexity (these tiles often have a large number of lights that
+  affect them).  Within each final tile, the pixels are shaded using
+  regular C++ code.
+- If the Cilk extensions are available in your compiler, an ispc
+  implementation that uses Cilk will also be built.
+  (See http://software.intel.com/en-us/articles/intel-cilk-plus/).  Like 
+  the "best practices" serial implementation, this version does dynamic
+  tile partitioning for better load balancing and then uses ispc for the
+  light culling and shading.
+
+
+GMRES
+=====
+
+An implementation of the generalized minimal residual method for solving
+sparse matrix equations.
+(http://en.wikipedia.org/wiki/Generalized_minimal_residual_method)
+
+
+Mandelbrot
+==========
+
+Mandelbrot set generation.  This example is extensively documented at the
+http://ispc.github.com/example.html page.
+
+
+Mandelbrot_tasks
+================
+
+Implementation of Mandelbrot set generation that also parallelizes across
+cores using tasks.  Under Windows, a simple task system built on
+Microsoft's Concurrency Runtime is used (see tasks_concrt.cpp).  On OSX, a
+task system based on Grand Central Dispatch is used (tasks_gcd.cpp), and on
+Linux, a pthreads-based task system is used (tasks_pthreads.cpp).  When
+using tasks with ispc, no task system is mandated; the user is free to plug
+in any task system they want, for ease of interoperating with existing task
+systems.
+
+
+Noise
+=====
+
+This example has an implementation of Ken Perlin's procedural "noise"
+function, as described in his 2002 "Improving Noise" SIGGRAPH paper.
+
+ 
+Options
+=======
+
+This program implements both the Black-Scholes and Binomial options pricing
+models in both ispc and regular serial C++ code.
+
+
+Perfbench
+=========
+
+This runs a number of microbenchmarks to measure system performance and
+code generation quality.
+
+
+RT
+==
+
+This is a simple ray tracer; it reads in camera parameters and a bounding
+volume hierarchy and renders the scene from the given viewpoint.  The
+command line arguments are:
+
+rt <scene base name>
+
+Where <scene base name> is one of "cornell", "teapot", or "sponza".
+
+The implementation originally derives from the bounding volume hierarchy
+and triangle intersection code from pbrt; see the pbrt source code and/or
+the "Physically Based Rendering" book for more about the basic algorithmic
+details.
+
+
+Simple
+======
+
+This is a simple "hello world" type program that shows a ~10 line
+application program calling out to a ~5 line ispc program to do a simple
+computation.
+
+
+Volume
+======
+
+Ray-marching volume rendering, with single scattering lighting model.  To
+run it, specify a camera parameter file and a volume density file, e.g.:
+
+volume camera.dat density_highres.vol
+
+(See, e.g. Chapters 11 and 16 of "Physically Based Rendering" for
+information about the algorithm implemented here.)  The volume data set
+included here was generated by the example implementation of the "Wavelet
+Turbulence for Fluid Simulation" SIGGRAPH 2008 paper by Kim et
+al. (http://www.cs.cornell.edu/~tedkim/WTURB/)
--- a/examples/aobench/.gitignore
+++ b/examples/aobench/.gitignore
@@ -0,0 +1,2 @@
+ao
+*.ppm
--- a/examples/aobench/Makefile
+++ b/examples/aobench/Makefile
@@ -0,0 +1,7 @@
+
+EXAMPLE=ao
+CPP_SRC=ao.cpp ao_serial.cpp
+ISPC_SRC=ao.ispc
+ISPC_TARGETS=sse2,sse4,avx
+
+include ../common.mk
--- a/examples/aobench/ao.cpp
+++ b/examples/aobench/ao.cpp
@@ -0,0 +1,186 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#ifdef __linux__
+#include <malloc.h>
+#endif
+#include <math.h>
+#include <map>
+#include <string>
+#include <algorithm>
+#include <sys/types.h>
+
+#include "ao_ispc.h"
+using namespace ispc;
+
+#include "../timing.h"
+
+#define NSUBSAMPLES        2
+
+extern void ao_serial(int w, int h, int nsubsamples, float image[]);
+
+static unsigned int test_iterations;
+static unsigned int width, height;
+static unsigned char *img;
+static float *fimg;
+
+
+/** Convert a floating-point intensity to an 8-bit channel value.
+
+    The input is scaled by 255.5 and truncated toward zero; scaled values
+    at or below 0 map to 0 and scaled values of 255 or more map to 255.
+
+    The range check is performed on the scaled floating-point value, not
+    after the conversion to int: converting a NaN or an out-of-range
+    floating-point value to an integer type is undefined behavior, so the
+    value must be known to be representable before the cast.  NaN maps
+    to 0.
+ */
+static unsigned char
+clamp(float f)
+{
+    const double scaled = f * 255.5;
+
+    // Written as !(scaled > 0) rather than (scaled <= 0) so that NaN, for
+    // which every comparison is false, is also caught here.
+    if (!(scaled > 0.0))
+        return 0;
+    if (scaled >= 255.0)
+        return 255;
+
+    // 0 < scaled < 255 here, so the truncating conversion is well defined.
+    return (unsigned char)(int)scaled;
+}
+
+
+/** Quantize the global float image `fimg` into the global byte image
+    `img` and write it to `fname` as a binary ("P6") PPM file.
+
+    @param fname  Path of the file to create or overwrite.
+    @param w      Image width in pixels.
+    @param h      Image height in pixels.
+
+    Both buffers must hold at least w * h * 3 elements.  If the file cannot
+    be opened, or if writing or closing it fails, a message is printed and
+    the process exits with status 1; the "Wrote image file" line is printed
+    only after the data has been written and the file closed successfully.
+ */
+static void
+savePPM(const char *fname, int w, int h)
+{
+    // Number of channel values in the image.  Computed in size_t so the
+    // product cannot overflow int; non-positive dimensions mean "no pixels".
+    const size_t nValues = (w > 0 && h > 0) ? (size_t)w * (size_t)h * 3 : 0;
+
+    // The RGB triples are stored contiguously, so quantizing every value in
+    // index order touches exactly the same elements as a y/x/channel loop.
+    for (size_t i = 0; i < nValues; ++i)
+        img[i] = clamp(fimg[i]);
+
+    FILE *fp = fopen(fname, "wb");
+    if (!fp) {
+        perror(fname);
+        exit(1);
+    }
+
+    fprintf(fp, "P6\n");
+    fprintf(fp, "%d %d\n", w, h);
+    fprintf(fp, "255\n");
+    fwrite(img, nValues, 1, fp);
+
+    // The stream's error indicator covers the header writes and the pixel
+    // data; fclose() can additionally fail while flushing buffered output.
+    const bool writeFailed = ferror(fp) != 0;
+    const bool closeFailed = fclose(fp) != 0;
+    if (writeFailed || closeFailed) {
+        fprintf(stderr, "%s: error writing image file\n", fname);
+        exit(1);
+    }
+
+    printf("Wrote image file %s\n", fname);
+}
+
+
+/** Benchmark driver for the ambient occlusion example.
+
+    Usage: ao [num test iterations] [width] [height]
+
+    Renders the image with ao_ispc(), ao_ispc_tasks() and ao_serial(), each
+    test_iterations times, reports the minimum time (in millions of cycles)
+    observed for each, writes one PPM image per variant, and prints the
+    speedup of the two ispc variants relative to the serial one.
+
+    All three arguments must be positive integers; otherwise an error is
+    printed and the program exits with a non-zero status.
+ */
+int main(int argc, char **argv)
+{
+    if (argc != 4) {
+        printf ("%s\n", argv[0]);
+        printf ("Usage: ao [num test iterations] [width] [height]\n");
+        getchar();
+        exit(-1);
+    }
+
+    // Parse into signed ints first: storing a negative atoi() result
+    // directly into the unsigned globals would wrap to a huge value and
+    // request an enormous allocation.  Zero is rejected as well, since it
+    // would produce no timing data (and, for the sizes, an empty image).
+    const int iterArg   = atoi(argv[1]);
+    const int widthArg  = atoi(argv[2]);
+    const int heightArg = atoi(argv[3]);
+    if (iterArg <= 0 || widthArg <= 0 || heightArg <= 0) {
+        fprintf(stderr, "ao: iterations, width and height must all be "
+                "positive integers\n");
+        exit(-1);
+    }
+    test_iterations = (unsigned int)iterArg;
+    width = (unsigned int)widthArg;
+    height = (unsigned int)heightArg;
+
+    // Three channel values per pixel; computed in size_t so large images
+    // do not overflow 32-bit arithmetic.
+    const size_t nValues = (size_t)width * height * 3;
+
+    // Allocate space for output images
+    img = new unsigned char[nValues];
+    fimg = new float[nValues];
+
+    //
+    // Run the ispc path, test_iterations times, and report the minimum
+    // time for any of them.
+    //
+    double minTimeISPC = 1e30;
+    for (unsigned int i = 0; i < test_iterations; i++) {
+        // The renderers accumulate into the image, so clear it each run.
+        memset((void *)fimg, 0, sizeof(float) * nValues);
+        assert(NSUBSAMPLES == 2);
+
+        reset_and_start_timer();
+        ao_ispc(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_mcycles();
+        minTimeISPC = std::min(minTimeISPC, t);
+    }
+
+    // Report results and save image
+    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%u x %u image)\n",
+           minTimeISPC, width, height);
+    savePPM("ao-ispc.ppm", width, height);
+
+    //
+    // Run the ispc + tasks path, test_iterations times, and report the
+    // minimum time for any of them.
+    //
+    double minTimeISPCTasks = 1e30;
+    for (unsigned int i = 0; i < test_iterations; i++) {
+        memset((void *)fimg, 0, sizeof(float) * nValues);
+        assert(NSUBSAMPLES == 2);
+
+        reset_and_start_timer();
+        ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_mcycles();
+        minTimeISPCTasks = std::min(minTimeISPCTasks, t);
+    }
+
+    // Report results and save image
+    printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%u x %u image)\n",
+           minTimeISPCTasks, width, height);
+    savePPM("ao-ispc-tasks.ppm", width, height);
+
+    //
+    // Run the serial path, again test_iteration times, and report the
+    // minimum time.
+    //
+    double minTimeSerial = 1e30;
+    for (unsigned int i = 0; i < test_iterations; i++) {
+        memset((void *)fimg, 0, sizeof(float) * nValues);
+        reset_and_start_timer();
+        ao_serial(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_mcycles();
+        minTimeSerial = std::min(minTimeSerial, t);
+    }
+
+    // Report more results, save another image...
+    printf("[aobench serial]:\t\t[%.3f] M cycles (%u x %u image)\n", minTimeSerial,
+           width, height);
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
+    savePPM("ao-serial.ppm", width, height);
+
+    delete[] img;
+    delete[] fimg;
+    img = NULL;
+    fimg = NULL;
+
+    return 0;
+}
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -0,0 +1,272 @@
+// -*- mode: c++ -*-
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+/*
+  Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
+*/
+
+#define NAO_SAMPLES		8
+#define M_PI 3.1415926535f
+
+typedef float<3> vec;
+
+// Record of the closest intersection found so far along a ray.  The
+// intersection routines only overwrite it when they find a hit with a
+// smaller (positive) t than the one already stored.
+struct Isect {
+    float      t;    // ray parameter of the recorded hit
+    vec        p;    // hit position, computed as ray.org + t * ray.dir
+    vec        n;    // surface normal at the hit
+    int        hit;  // set to 1 when a hit is recorded; callers initialize it to 0
+};
+
+// Sphere primitive, described by its center point and radius.
+struct Sphere {
+    vec        center;
+    float      radius;
+};
+
+// Plane primitive.  The intersection code uses it as the set of points x
+// with dot(x, n) == dot(p, n).
+struct Plane {
+    vec    p;   // a point on the plane
+    vec    n;   // plane normal; copied unchanged into Isect.n on a hit
+};
+
+// A ray; points along it are org + t * dir.
+struct Ray {
+    vec org;   // origin
+    vec dir;   // direction
+};
+
+// Returns the dot product of the 3-vectors a and b.
+static inline float dot(vec a, vec b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+// Returns the cross product v0 x v1.
+static inline vec vcross(vec v0, vec v1) {
+    vec ret;
+    ret.x = v0.y * v1.z - v0.z * v1.y;
+    ret.y = v0.z * v1.x - v0.x * v1.z;
+    ret.z = v0.x * v1.y - v0.y * v1.x;
+    return ret;
+}
+
+// Normalizes v in place by multiplying it by rsqrt() of its squared
+// length.  There is no guard for a zero-length vector.
+static inline void vnormalize(vec &v) {
+    float len2 = dot(v, v);
+    float invlen = rsqrt(len2);
+    v *= invlen;
+}
+
+
+// Intersects ray with plane and, if the hit is in front of the ray origin
+// (t > 0) and closer than the hit already stored in isect, overwrites
+// isect with the new hit (t, position, the plane normal, hit = 1).
+// Otherwise isect is left unchanged.
+static void
+ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
+    // Plane equation: dot(x, n) + d == 0 for points x on the plane.
+    float d = -dot(plane.p, plane.n);
+    float v = dot(ray.dir, plane.n);
+
+    // A (near-)zero v means the ray is (nearly) parallel to the plane;
+    // bail out rather than divide by it below.
+    cif (abs(v) < 1.0e-17) 
+        return;
+    else {
+        float t = -(dot(ray.org, plane.n) + d) / v;
+
+        cif ((t > 0.0) && (t < isect.t)) {
+            isect.t = t;
+            isect.hit = 1;
+            isect.p = ray.org + ray.dir * t;
+            isect.n = plane.n;
+        }
+    }
+}
+
+
+// Intersects ray with sphere and, if the hit is in front of the ray origin
+// (t > 0) and closer than the hit already stored in isect, overwrites
+// isect with the new hit (t, position, normalized surface normal,
+// hit = 1).  Otherwise isect is left unchanged.
+//
+// Only the root t = -B - sqrt(D) is tested, so a ray whose origin lies
+// inside the sphere (where that root is negative) records no hit.
+// NOTE(review): the quadratic is solved without a dot(dir, dir) term, so
+// this presumably expects ray.dir to be unit length -- confirm at callers.
+static inline void
+ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
+    vec rs = ray.org - sphere.center;
+
+    float B = dot(rs, ray.dir);
+    float C = dot(rs, rs) - sphere.radius * sphere.radius;
+    float D = B * B - C;
+
+    // D <= 0: no real roots (or a single grazing one); treated as a miss.
+    cif (D > 0.) {
+        float t = -B - sqrt(D);
+
+        cif ((t > 0.0) && (t < isect.t)) {
+            isect.t = t;
+            isect.hit = 1;
+            isect.p = ray.org + t * ray.dir;
+            // Normal points from the sphere center out through the hit point.
+            isect.n = isect.p - sphere.center;
+            vnormalize(isect.n);
+        }
+    }
+}
+
+
+// Builds three mutually perpendicular axes around n and stores them in
+// basis: basis[2] is n itself, and basis[0] and basis[1] are normalized
+// vectors perpendicular to it and to each other.
+static void
+orthoBasis(vec basis[3], vec n) {
+    basis[2] = n;
+    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
+
+    // Seed basis[1] with a coordinate axis, choosing the first one along
+    // which n has a component of magnitude below 0.6 so that the seed is
+    // not (nearly) parallel to n.  Falls back to the x axis.
+    if ((n.x < 0.6) && (n.x > -0.6)) {
+        basis[1].x = 1.0;
+    } else if ((n.y < 0.6) && (n.y > -0.6)) {
+        basis[1].y = 1.0;
+    } else if ((n.z < 0.6) && (n.z > -0.6)) {
+        basis[1].z = 1.0;
+    } else {
+        basis[1].x = 1.0;
+    }
+
+    basis[0] = vcross(basis[1], basis[2]);
+    vnormalize(basis[0]);
+
+    // Recompute basis[1] from the other two axes so it is perpendicular to
+    // both (the seed axis generally was not perpendicular to n).
+    basis[1] = vcross(basis[2], basis[0]);
+    vnormalize(basis[1]);
+}
+
+
+// Estimates how open the surface point recorded in isect is to its
+// surroundings.  NAO_SAMPLES * NAO_SAMPLES random rays are traced from
+// just above the surface against the three spheres and the plane; the
+// return value is the fraction of those rays that hit nothing (1.0 =
+// completely unoccluded, 0.0 = every sample ray was blocked).
+//
+// rngstate is advanced by two frandom() calls per sample ray.
+static float
+ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
+                  RNGState &rngstate) {
+    float eps = 0.0001f;
+    vec p, n;
+    vec basis[3];
+    float occlusion = 0.0;
+
+    // Start the sample rays slightly off the surface along the normal so
+    // they do not immediately re-intersect the surface they start from.
+    p = isect.p + eps * isect.n;
+
+    orthoBasis(basis, isect.n);
+
+    static const uniform int ntheta = NAO_SAMPLES;
+    static const uniform int nphi   = NAO_SAMPLES;
+    for (uniform int j = 0; j < ntheta; j++) {
+        for (uniform int i = 0; i < nphi; i++) {
+            Ray ray;
+            Isect occIsect;
+
+            // Random direction in the local frame: (x, y) is a point in
+            // the unit disk and z = sqrt(1 - x^2 - y^2) >= 0, giving a unit
+            // vector in the hemisphere around the local z axis.
+            float theta = sqrt(frandom(&rngstate));
+            float phi   = 2.0f * M_PI * frandom(&rngstate);
+            float x = cos(phi) * theta;
+            float y = sin(phi) * theta;
+            float z = sqrt(1.0 - theta * theta);
+
+            // local . global
+            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
+            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
+            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
+
+            ray.org = p;
+            ray.dir.x = rx;
+            ray.dir.y = ry;
+            ray.dir.z = rz;
+
+            // Large initial t so that any real hit is accepted as closer.
+            occIsect.t   = 1.0e+17;
+            occIsect.hit = 0;
+
+            for (uniform int snum = 0; snum < 3; ++snum)
+                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
+            ray_plane_intersect (occIsect, ray, plane); 
+
+            if (occIsect.hit) occlusion += 1.0;
+        }
+    }
+
+    // Convert the count of blocked rays into the unblocked fraction.
+    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
+    return occlusion;
+}
+
+
+/* Compute the image for the scanlines from [y0,y1), for an overall image
+   of width w and height h.
+
+   The scene (one plane and three spheres) is fixed.  For every pixel and
+   every one of its nsubsamples x nsubsamples subsamples, a ray is traced
+   from the origin; if it hits the scene, the ambient occlusion value at
+   the hit, scaled by 1 / nsubsamples^2, is added to all three channels of
+   that pixel in image (3 floats per pixel, row-major).  Subsamples that
+   miss the scene add nothing, and values are only ever added to image, so
+   the caller is presumably expected to have cleared it beforehand.
+ */
+static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
+                         uniform int h,  uniform int nsubsamples, 
+                         uniform float image[]) {
+    static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
+    static uniform Sphere spheres[3] = {
+        { { -2.0f, 0.0f, -3.5f }, 0.5f },
+        { { -0.5f, 0.0f, -3.0f }, 0.5f },
+        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
+    RNGState rngstate;
+
+    // Seed from both programIndex and the starting scanline, so the seed
+    // varies across program instances and across calls with different y0.
+    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
+    float invSamples = 1.f / nsubsamples;
+
+    foreach_tiled(y = y0 ... y1, x = 0 ... w, 
+                  u = 0 ... nsubsamples, v = 0 ... nsubsamples) {
+        // Subsample offset within the pixel, in [0, 1).
+        float du = (float)u * invSamples, dv = (float)v * invSamples;
+
+        // Figure out x,y pixel in NDC
+        float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
+        float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
+        float ret = 0.f;
+        Ray ray;
+        Isect isect;
+
+        ray.org = 0.f;
+
+        // Poor man's perspective projection
+        ray.dir.x = px;
+        ray.dir.y = py;
+        ray.dir.z = -1.0;
+        vnormalize(ray.dir);
+
+        // Large initial t so that any real hit is accepted as closer.
+        isect.t   = 1.0e+17;
+        isect.hit = 0;
+
+        for (uniform int snum = 0; snum < 3; ++snum)
+            ray_sphere_intersect(isect, ray, spheres[snum]);
+        ray_plane_intersect(isect, ray, plane);
+
+        // Note use of 'coherent' if statement; the set of rays we
+        // trace will often all hit or all miss the scene
+        cif (isect.hit) {
+            ret = ambient_occlusion(isect, plane, spheres, rngstate);
+            // Each subsample contributes 1 / nsubsamples^2 of the pixel.
+            ret *= invSamples * invSamples;
+
+            // The subsamples of one pixel all add into the same three
+            // floats, so the accumulation goes through atomic_add_local().
+            int offset = 3 * (y * w + x);
+            atomic_add_local(&image[offset], ret);
+            atomic_add_local(&image[offset+1], ret);
+            atomic_add_local(&image[offset+2], ret);
+        }
+    }
+}
+
+
+// Exported entry point: renders all h scanlines of the w x h image with a
+// single call to ao_scanlines() (no tasks are launched).
+export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, 
+                    uniform float image[]) {
+    ao_scanlines(0, h, w, h, nsubsamples, image);
+}
+
+
+// Task body: renders the single scanline whose index is taskIndex.
+static void task ao_task(uniform int width, uniform int height, 
+                         uniform int nsubsamples, uniform float image[]) {
+    ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
+}
+
+
+// Exported entry point: renders the w x h image by launching h instances
+// of ao_task, i.e. one task per scanline.
+export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
+                          uniform float image[]) {
+    launch[h] ao_task(w, h, nsubsamples, image);
+}
--- a/examples/aobench/ao_serial.cpp
+++ b/examples/aobench/ao_serial.cpp
@@ -0,0 +1,314 @@
+// -*- mode: c++ -*-
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+/*
+  Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdlib.h>
+#include <math.h>
+
+#ifdef _MSC_VER
+// Minimal stand-ins for srand48()/drand48() on toolchains that lack them.
+// The generator is the 48-bit linear congruential recurrence
+//     X' = (0x5DEECE66D * X + 0xB) mod 2^48
+// The state is kept unsigned and reduced to 48 bits after every step so
+// the multiply can never overflow a signed type.
+static unsigned long long drand48_x = 0x1234ABCD330EULL;
+
+// Seed the generator from x.  The mixing is done on unsigned values so
+// the shift is well defined for every seed.
+static inline void srand48(int x) {
+    const unsigned int ux = (unsigned int)x;
+    const unsigned int mixed = ux ^ (ux << 16);
+
+    drand48_x = mixed;
+    // Earlier versions stored the mixed value as a sign-extended int;
+    // reproduce bits 32..47 of that so existing seeds keep their sequence.
+    if (mixed & 0x80000000u)
+        drand48_x |= 0xFFFF00000000ULL;
+}
+
+// Advance the generator and return the new 48-bit state scaled to [0, 1).
+static inline double drand48() {
+    drand48_x = (drand48_x * 0x5DEECE66DULL + 0xBULL) & 0xFFFFFFFFFFFFULL;
+    return drand48_x * (1.0 / 281474976710656.0);
+}
+#endif // _MSC_VER
+
+// Three-component float vector, padded to four floats and declared with
+// 16-byte alignment.  Every constructor initializes the padding lane so
+// that copies never read an indeterminate value.
+#ifdef _MSC_VER
+__declspec(align(16)) 
+#endif
+struct vec {
+    vec() : x(0.f), y(0.f), z(0.f), pad(0.f) { }
+    vec(float xx, float yy, float zz) : x(xx), y(yy), z(zz), pad(0.f) { }
+
+    // Uniform scale.
+    vec operator*(float f) const { return vec(x*f, y*f, z*f); }
+    // Component-wise sum, difference and product.
+    vec operator+(const vec &f2) const { 
+        return vec(x+f2.x, y+f2.y, z+f2.z); 
+    }
+    vec operator-(const vec &f2) const { 
+        return vec(x-f2.x, y-f2.y, z-f2.z); 
+    }
+    vec operator*(const vec &f2) const { 
+        return vec(x*f2.x, y*f2.y, z*f2.z); 
+    }
+    float x, y, z;
+    float pad;      // unused fourth lane; always zero
+}
+#ifndef _MSC_VER
+__attribute__ ((aligned(16)))
+#endif
+;
+// Scalar-on-the-left form of the uniform scale.
+inline vec operator*(float f, const vec &v) { return vec(f*v.x, f*v.y, f*v.z); }
+
+
+#define NAO_SAMPLES		8
+
+#ifdef M_PI
+#undef M_PI
+#endif
+#define M_PI 3.1415926535f
+
+// Nearest ray/scene intersection found so far.  The intersection routines
+// only overwrite it when they find a hit closer than the current t.
+struct Isect {
+    float      t;    // ray parameter of the closest hit; callers seed it with a large value
+    vec        p;    // hit position, ray.org + ray.dir * t
+    vec        n;    // surface normal at the hit
+    int        hit; // 1 once any intersection has been recorded; callers start it at 0
+};
+
+// Sphere described by its center point and radius.
+struct Sphere {
+    vec        center;
+    float      radius;
+
+};
+
+// Infinite plane described by a point on it and its normal.
+struct Plane {
+    vec    p;   // a point on the plane
+    vec    n;   // plane normal; also reported as the hit normal
+};
+
+// Ray described by an origin and a direction; points along it are
+// org + dir * t.
+struct Ray {
+    vec org;
+    vec dir;
+};
+
+// Dot product of the x, y and z components; the padding lane is ignored.
+static inline float dot(const vec &a, const vec &b) {
+    float sum = a.x * b.x;
+    sum += a.y * b.y;
+    sum += a.z * b.z;
+    return sum;
+}
+
+// Cross product v0 x v1.
+static inline vec vcross(const vec &v0, const vec &v1) {
+    const float cx = v0.y * v1.z - v0.z * v1.y;
+    const float cy = v0.z * v1.x - v0.x * v1.z;
+    const float cz = v0.x * v1.y - v0.y * v1.x;
+
+    vec cross;      // default-constructed, then filled component by component
+    cross.x = cx;
+    cross.y = cy;
+    cross.z = cz;
+    return cross;
+}
+
+// Scale v in place by the reciprocal of its length.  No check is made for
+// a zero-length input.
+static inline void vnormalize(vec &v) {
+    const float scale = 1.f / sqrtf(dot(v, v));
+    v = v * scale;
+}
+
+
+// Intersect ray with plane and update isect when the hit lies in front of
+// the ray origin and is closer than the one already recorded.
+static inline void
+ray_plane_intersect(Isect &isect, Ray &ray, 
+                    Plane &plane) {
+    // A near-zero denominator means the ray runs (almost) parallel to the
+    // plane: nothing to record.
+    const float denom = dot(ray.dir, plane.n);
+    if (fabsf(denom) < 1.0e-17f)
+        return;
+
+    const float planeD = -dot(plane.p, plane.n);
+    const float dist = -(dot(ray.org, plane.n) + planeD) / denom;
+
+    // Reject hits behind the origin or farther than the current nearest.
+    if (!(dist > 0.0) || !(dist < isect.t))
+        return;
+
+    isect.t = dist;
+    isect.hit = 1;
+    isect.p = ray.org + ray.dir * dist;
+    isect.n = plane.n;
+}
+
+
+// Intersect ray with sphere and update isect when the near root lies in
+// front of the ray origin and is closer than the hit already recorded.
+static inline void
+ray_sphere_intersect(Isect &isect, Ray &ray, 
+                     Sphere &sphere) {
+    const vec offset = ray.org - sphere.center;
+
+    const float b = dot(offset, ray.dir);
+    const float c = dot(offset, offset) - sphere.radius * sphere.radius;
+    const float disc = b * b - c;
+
+    // Non-positive discriminant: no hit is recorded.
+    if (!(disc > 0.))
+        return;
+
+    // Only the smaller root is considered.
+    const float dist = -b - sqrtf(disc);
+    if (!(dist > 0.0) || !(dist < isect.t))
+        return;
+
+    isect.t = dist;
+    isect.hit = 1;
+    isect.p = ray.org + dist * ray.dir;
+    isect.n = isect.p - sphere.center;
+    vnormalize(isect.n);
+}
+
+
+// Build a basis around n: basis[2] is n itself, basis[0] and basis[1] are
+// normalized cross products derived from n and a helper axis.
+static inline void
+orthoBasis(vec basis[3], const vec &n) {
+    basis[2] = n;
+    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
+
+    // Choose the helper axis: the first coordinate axis on which n has a
+    // component strictly inside (-0.6, 0.6), falling back to x when none
+    // qualifies.
+    float *axis = &basis[1].x;
+    if (!((n.x < 0.6f) && (n.x > -0.6f))) {
+        if ((n.y < 0.6f) && (n.y > -0.6f))
+            axis = &basis[1].y;
+        else if ((n.z < 0.6f) && (n.z > -0.6f))
+            axis = &basis[1].z;
+    }
+    *axis = 1.0;
+
+    basis[0] = vcross(basis[1], basis[2]);
+    vnormalize(basis[0]);
+
+    basis[1] = vcross(basis[2], basis[0]);
+    vnormalize(basis[1]);
+}
+
+
+/* Estimate the ambient occlusion at the hit described by isect.
+
+   Shoots NAO_SAMPLES * NAO_SAMPLES rays from a point nudged slightly along
+   the surface normal, with directions drawn through drand48() in the
+   hemisphere around that normal, and tests each against the three spheres
+   and the plane.  Returns the fraction of rays that hit nothing: 1 for a
+   fully open point, 0 for a fully occluded one.
+ */
+static float
+ambient_occlusion(Isect &isect, Plane &plane, 
+                  Sphere spheres[3]) {
+    const float eps = 0.0001f;
+    vec basis[3];
+    float occlusion = 0.0;
+
+    // Start the occlusion rays just off the surface so they do not hit
+    // the surface they start from.
+    const vec p = isect.p + eps * isect.n;
+
+    orthoBasis(basis, isect.n);
+
+    static const int ntheta = NAO_SAMPLES;
+    static const int nphi   = NAO_SAMPLES;
+    for (int j = 0; j < ntheta; j++) {
+        for (int i = 0; i < nphi; i++) {
+            Ray ray;
+            Isect occIsect;
+
+            // Two random numbers per ray, drawn in this order.
+            float theta = sqrtf(drand48());
+            float phi   = 2.0f * M_PI * drand48();
+            float x = cosf(phi) * theta;
+            float y = sinf(phi) * theta;
+            float z = sqrtf(1.0f - theta * theta);
+
+            // local -> global: express the sample in the basis around the normal
+            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
+            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
+            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
+
+            ray.org = p;
+            ray.dir.x = rx;
+            ray.dir.y = ry;
+            ray.dir.z = rz;
+
+            occIsect.t   = 1.0e+17f;
+            occIsect.hit = 0;
+
+            for (int snum = 0; snum < 3; ++snum)
+                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
+            ray_plane_intersect (occIsect, ray, plane); 
+
+            if (occIsect.hit) occlusion += 1.f;
+        }
+    }
+
+    // Convert the count of blocked rays into the unoccluded fraction.
+    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
+    return occlusion;
+}
+
+
+/* Render scanlines y0 (inclusive) through y1 (exclusive) of a w-by-h image.
+
+   Each pixel takes nsubsamples * nsubsamples primary rays.  Every sample's
+   occlusion value is added to the pixel's three channels, which are then
+   divided by the sample count -- the existing contents of image therefore
+   take part in the result.  The random number generator is reseeded with
+   y0 on entry.
+ */
+static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples,
+                         float image[]) {
+    static Plane plane = { vec(0.0f, -0.5f, 0.0f), vec(0.f, 1.f, 0.f) };
+    static Sphere spheres[3] = {
+        { vec(-2.0f, 0.0f, -3.5f), 0.5f },
+        { vec(-0.5f, 0.0f, -3.0f), 0.5f },
+        { vec(1.0f, 0.0f, -2.2f), 0.5f } };
+
+    const float halfW = w / 2.0f;
+    const float halfH = h / 2.0f;
+
+    srand48(y0);
+
+    for (int y = y0; y < y1; ++y) {
+        for (int x = 0; x < w; ++x)  {
+            float *pixel = &image[3 * (y * w + x)];
+
+            for (int u = 0; u < nsubsamples; ++u) {
+                for (int v = 0; v < nsubsamples; ++v) {
+                    Ray ray;
+                    Isect isect;
+
+                    // Camera ray from the origin through this subsample.
+                    ray.org = vec(0.f, 0.f, 0.f);
+                    ray.dir.x = (x + (u / (float)nsubsamples) - halfW) / halfW;
+                    ray.dir.y = -(y + (v / (float)nsubsamples) - halfH) / halfH;
+                    ray.dir.z = -1.0f;
+                    vnormalize(ray.dir);
+
+                    isect.t   = 1.0e+17f;
+                    isect.hit = 0;
+
+                    for (int snum = 0; snum < 3; ++snum)
+                        ray_sphere_intersect(isect, ray, spheres[snum]);
+                    ray_plane_intersect(isect, ray, plane);
+
+                    // Rays that miss the scene contribute zero.
+                    const float ret =
+                        isect.hit ? ambient_occlusion(isect, plane, spheres) : 0.f;
+
+                    pixel[0] += ret;
+                    pixel[1] += ret;
+                    pixel[2] += ret;
+                }
+            }
+
+            // Average over the samples taken for this pixel.
+            pixel[0] /= nsubsamples * nsubsamples;
+            pixel[1] /= nsubsamples * nsubsamples;
+            pixel[2] /= nsubsamples * nsubsamples;
+        }
+    }
+}
+
+
+// Serial reference entry point: renders every scanline [0, h) of the
+// w-by-h image in a single ao_scanlines() call.  ao_scanlines() adds into
+// image before normalizing, so the buffer's prior contents affect the
+// result.
+void ao_serial(int w, int h, int nsubsamples, 
+               float image[]) {
+    ao_scanlines(0, h, w, h, nsubsamples, image);
+}
--- a/examples/aobench/aobench.vcxproj
+++ b/examples/aobench/aobench.vcxproj
@@ -0,0 +1,176 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="ao.cpp" />
+    <ClCompile Include="ao_serial.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="ao.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>aobench</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <!-- Search the directory two levels above the project first, exactly as
+         the other three configurations do, so the custom build step finds
+         the same ispc executable in every configuration. -->
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/aobench_instrumented/.gitignore
+++ b/examples/aobench_instrumented/.gitignore
@@ -0,0 +1,2 @@
+ao
+*.ppm
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -0,0 +1,26 @@
+
+# Builds the instrumented aobench example: ao.ispc is compiled with
+# --instrument and linked with ao.cpp, instrument.cpp and ../tasksys.cpp.
+CXX=g++ -m64
+CXXFLAGS=-Iobjs/ -g3 -Wall
+ISPC=ispc
+ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2
+
+default: ao
+
+.PHONY: default dirs clean
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ ao
+
+ao: objs/ao.o objs/instrument.o objs/ao_ispc.o ../tasksys.cpp
+	$(CXX) $(CXXFLAGS) -o $@ $^ -lm -lpthread
+
+# 'dirs' is phony and therefore always out of date; listing it as an
+# order-only prerequisite creates objs/ without forcing a rebuild of every
+# object on every invocation.
+objs/%.o: %.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# ao.cpp includes ao_instrumented_ispc.h, which the ispc rule below writes.
+objs/ao.o: objs/ao_instrumented_ispc.h
+
+# The target names match the files the recipe actually produces, so make
+# can tell when they are up to date.
+objs/%_instrumented_ispc.h objs/%_ispc.o: %.ispc | dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_instrumented_ispc.h
--- a/examples/aobench_instrumented/ao.cpp
+++ b/examples/aobench_instrumented/ao.cpp
@@ -0,0 +1,131 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#ifdef __linux__
+#include <malloc.h>
+#endif
+#include <math.h>
+#include <map>
+#include <string>
+#include <algorithm>
+#include <sys/types.h>
+
+#include "ao_instrumented_ispc.h"
+using namespace ispc;
+
+#include "instrument.h"
+#include "../timing.h"
+
+#define NSUBSAMPLES        2
+
+static unsigned int test_iterations;
+static unsigned int width, height;
+static unsigned char *img;
+static float *fimg;
+
+
+// Convert a float intensity to an 8-bit channel value: scale by 255.5,
+// truncate, and limit to [0, 255].  The range check is done in floating
+// point before the integer conversion, so NaN and values too large for an
+// int are handled (NaN and negatives map to 0, large values to 255)
+// instead of invoking undefined behavior.
+static unsigned char
+clamp(float f)
+{
+    const double scaled = f * 255.5;
+
+    if (!(scaled > 0.0))        // also true for NaN
+        return 0;
+    if (scaled >= 255.0)
+        return 255;
+
+    return (unsigned char)(int)scaled;
+}
+
+
+// Quantize the float image in fimg into the byte buffer img and write it
+// to fname as a binary PPM (P6) of w by h pixels.  Prints a diagnostic
+// and exits with status 1 if the file cannot be opened or written.
+static void
+savePPM(const char *fname, int w, int h)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++)  {
+            const int base = 3 * (y * w + x);
+            img[base + 0] = clamp(fimg[base + 0]);
+            img[base + 1] = clamp(fimg[base + 1]);
+            img[base + 2] = clamp(fimg[base + 2]);
+        }
+    }
+
+    FILE *fp = fopen(fname, "wb");
+    if (!fp) {
+        perror(fname);
+        exit(1);
+    }
+
+    fprintf(fp, "P6\n");
+    fprintf(fp, "%d %d\n", w, h);
+    fprintf(fp, "255\n");
+    fwrite(img, w * h * 3, 1, fp);
+
+    // The stream error flag covers the header and pixel writes; fclose()
+    // reports failures that only show up when the buffer is flushed.
+    const int writeFailed = ferror(fp);
+    const int closeFailed = fclose(fp);
+    if (writeFailed || closeFailed) {
+        perror(fname);
+        exit(1);
+    }
+    printf("Wrote image file %s\n", fname);
+}
+
+
+
+// Usage: ao [num test iterations] [width] [height]
+// Renders one instrumented ispc image of the requested size, writes it to
+// ao-ispc.ppm and prints the collected instrumentation.
+int main(int argc, char **argv)
+{
+    if (argc != 4) {
+        printf ("%s\n", argv[0]);
+        printf ("Usage: ao [num test iterations] [width] [height]\n");
+        getchar();
+        exit(-1);
+    }
+
+    // Parse the dimensions as signed values first: a negative or
+    // non-numeric argument would otherwise wrap to a huge unsigned size.
+    const int w = atoi (argv[2]);
+    const int h = atoi (argv[3]);
+    if (w <= 0 || h <= 0) {
+        fprintf (stderr, "ao: width and height must be positive integers\n");
+        exit(-1);
+    }
+
+    test_iterations = atoi(argv[1]);
+    width = w;
+    height = h;
+
+    // Allocate space for output images; the element count is computed in
+    // size_t so it cannot wrap in 32-bit arithmetic.  Both buffers start
+    // zeroed.
+    const size_t nvalues = (size_t)width * height * 3;
+    img = new unsigned char[nvalues]();
+    fimg = new float[nvalues]();
+
+    ao_ispc(width, height, NSUBSAMPLES, fimg);
+
+    savePPM("ao-ispc.ppm", width, height); 
+
+    ISPCPrintInstrument();
+
+    delete[] fimg;
+    delete[] img;
+    fimg = NULL;
+    img = NULL;
+
+    return 0;
+}
--- a/examples/aobench_instrumented/ao.ispc
+++ b/examples/aobench_instrumented/ao.ispc
@@ -0,0 +1,333 @@
+// -*- mode: c++ -*-
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+/*
+  Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
+*/
+
+#define NAO_SAMPLES		8
+#define M_PI 3.1415926535f
+
+typedef float<3> vec;
+
+// Nearest ray/scene intersection found so far.  The intersection routines
+// only overwrite it when they find a hit closer than the current t.
+struct Isect {
+    float      t;    // ray parameter of the closest hit; callers seed it with a large value
+    vec        p;    // hit position, ray.org + ray.dir * t
+    vec        n;    // surface normal at the hit
+    int        hit; // 1 once any intersection has been recorded; callers start it at 0
+};
+
+// Sphere described by its center point and radius.
+struct Sphere {
+    vec        center;
+    float      radius;
+
+};
+
+// Infinite plane described by a point on it and its normal.
+struct Plane {
+    vec    p;   // a point on the plane
+    vec    n;   // plane normal; also reported as the hit normal
+};
+
+// Ray described by an origin and a direction; points along it are
+// org + dir * t.
+struct Ray {
+    vec org;
+    vec dir;
+};
+
+// Dot product of two 3-vectors.
+static inline float dot(vec a, vec b) {
+    float sum = a.x * b.x;
+    sum += a.y * b.y;
+    sum += a.z * b.z;
+    return sum;
+}
+
+// Cross product v0 x v1.
+static inline vec vcross(vec v0, vec v1) {
+    float cx = v0.y * v1.z - v0.z * v1.y;
+    float cy = v0.z * v1.x - v0.x * v1.z;
+    float cz = v0.x * v1.y - v0.y * v1.x;
+
+    vec cross;
+    cross.x = cx;
+    cross.y = cy;
+    cross.z = cz;
+    return cross;
+}
+
+// Scale v in place by the reciprocal square root of its squared length.
+// No check is made for a zero-length input.
+static inline void vnormalize(vec &v) {
+    float scale = rsqrt(dot(v, v));
+    v = v * scale;
+}
+
+
+// Intersect ray with plane and update isect when the hit lies in front of
+// the ray origin and is closer than the one already recorded.
+static inline void
+ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
+    // A near-zero denominator means the ray runs (almost) parallel to the
+    // plane: nothing to record.
+    float denom = dot(ray.dir, plane.n);
+    cif (abs(denom) < 1.0e-17) 
+        return;
+
+    float planeD = -dot(plane.p, plane.n);
+    float dist = -(dot(ray.org, plane.n) + planeD) / denom;
+
+    cif ((dist > 0.0) && (dist < isect.t)) {
+        isect.t = dist;
+        isect.hit = 1;
+        isect.p = ray.org + ray.dir * dist;
+        isect.n = plane.n;
+    }
+}
+
+
+// Intersect ray with sphere and update isect when the near root lies in
+// front of the ray origin and is closer than the hit already recorded.
+static inline void
+ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
+    vec offset = ray.org - sphere.center;
+
+    float b = dot(offset, ray.dir);
+    float c = dot(offset, offset) - sphere.radius * sphere.radius;
+    float disc = b * b - c;
+
+    // Only a positive discriminant can produce a hit.
+    cif (disc > 0.) {
+        // Only the smaller root is considered.
+        float dist = -b - sqrt(disc);
+
+        cif ((dist > 0.0) && (dist < isect.t)) {
+            isect.t = dist;
+            isect.hit = 1;
+            isect.p = ray.org + dist * ray.dir;
+            isect.n = isect.p - sphere.center;
+            vnormalize(isect.n);
+        }
+    }
+}
+
+
+// Build a basis around n: basis[2] is n itself, basis[0] and basis[1] are
+// normalized cross products derived from n and a helper axis.
+static inline void
+orthoBasis(vec basis[3], vec n) {
+    basis[2] = n;
+    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
+
+    // Choose the helper axis: the first coordinate axis on which n has a
+    // component strictly inside (-0.6, 0.6), falling back to x when none
+    // qualifies.
+    bool xInBand = (n.x < 0.6) && (n.x > -0.6);
+    bool yInBand = (n.y < 0.6) && (n.y > -0.6);
+    bool zInBand = (n.z < 0.6) && (n.z > -0.6);
+
+    if (xInBand || !(yInBand || zInBand)) {
+        basis[1].x = 1.0;
+    } else if (yInBand) {
+        basis[1].y = 1.0;
+    } else {
+        basis[1].z = 1.0;
+    }
+
+    basis[0] = vcross(basis[1], basis[2]);
+    vnormalize(basis[0]);
+
+    basis[1] = vcross(basis[2], basis[0]);
+    vnormalize(basis[1]);
+}
+
+
+/* Estimate the ambient occlusion at the hit described by isect.
+
+   Shoots NAO_SAMPLES * NAO_SAMPLES rays from a point nudged slightly along
+   the surface normal, with directions drawn through frandom(&rngstate) in
+   the hemisphere around that normal, and tests each against the three
+   spheres and the plane.  Returns the fraction of rays that hit nothing:
+   1 for a fully open point, 0 for a fully occluded one.
+ */
+static inline float
+ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3], 
+                  RNGState &rngstate) {
+    float eps = 0.0001f;
+    vec p;
+    vec basis[3];
+    float occlusion = 0.0;
+
+    // Start the occlusion rays just off the surface so they do not hit
+    // the surface they start from.
+    p = isect.p + eps * isect.n;
+
+    orthoBasis(basis, isect.n);
+
+    static const uniform int ntheta = NAO_SAMPLES;
+    static const uniform int nphi   = NAO_SAMPLES;
+    for (uniform int j = 0; j < ntheta; j++) {
+        for (uniform int i = 0; i < nphi; i++) {
+            Ray ray;
+            Isect occIsect;
+
+            // Two random numbers per ray, drawn in this order.
+            float theta = sqrt(frandom(&rngstate));
+            float phi   = 2.0f * M_PI * frandom(&rngstate);
+            float x = cos(phi) * theta;
+            float y = sin(phi) * theta;
+            float z = sqrt(1.0 - theta * theta);
+
+            // local -> global: express the sample in the basis around the normal
+            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
+            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
+            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
+
+            ray.org = p;
+            ray.dir.x = rx;
+            ray.dir.y = ry;
+            ray.dir.z = rz;
+
+            occIsect.t   = 1.0e+17;
+            occIsect.hit = 0;
+
+            for (uniform int snum = 0; snum < 3; ++snum)
+                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
+            ray_plane_intersect (occIsect, ray, plane); 
+
+            if (occIsect.hit) occlusion += 1.0;
+        }
+    }
+
+    // Convert the count of blocked rays into the unoccluded fraction.
+    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
+    return occlusion;
+}
+
+
+/* Compute the image for the scanlines from [y0,y1), for an overall image
+   of width w and height h.
+
+   Each iteration shades programCount samples: four (a 2x2 subsample grid)
+   for each of nx horizontally adjacent pixels.  The four results of a
+   pixel are summed, divided by nsubsamples * nsubsamples and stored (not
+   accumulated) into all three channels of that pixel.  The sample layout
+   is hard-wired to 2x2, so the normalization is only correct for
+   nsubsamples == 2.  Pixels that would fall beyond column w-1 when w is
+   not a multiple of nx are shaded but never written.
+ */
+static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
+                         uniform int h,  uniform int nsubsamples, 
+                         uniform float image[]) {
+    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
+    static Sphere spheres[3] = {
+        { { -2.0f, 0.0f, -3.5f }, 0.5f },
+        { { -0.5f, 0.0f, -3.0f }, 0.5f },
+        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
+    RNGState rngstate;
+
+    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
+
+    // Compute the mapping between the 'programCount'-wide program
+    // instances running in parallel and samples in the image.  
+    //
+    // For now, we'll always take four samples per pixel, so start by
+    // initializing du and dv with offsets into subpixel samples.  We'll
+    // take care of further updating du and dv for the case where we're
+    // doing more than 4 program instances in parallel shortly.
+    uniform float uSteps[4] = { 0, 1, 0, 1 };
+    uniform float vSteps[4] = { 0, 0, 1, 1 };
+    float du = uSteps[programIndex % 4] / nsubsamples;
+    float dv = vSteps[programIndex % 4] / nsubsamples;
+
+    // Now handle the case where we are able to do more than one pixel's
+    // worth of work at once.  nx records the number of pixels in the x
+    // direction we do per iteration and ny the number in y.
+    uniform int nx = 1, ny = 1;
+
+    // FIXME: We actually need ny to be 1 regardless of the decomposition,
+    // since the task decomposition is one scanline high.
+
+    if (programCount == 8) {
+        // Do two pixels at once in the x direction
+        nx = 2;
+        if (programIndex >= 4) 
+            // And shift the offsets for the second pixel's worth of work
+            ++du;
+    }
+    else if (programCount == 16) {
+        nx = 4;
+        ny = 1;
+        if (programIndex >= 4 && programIndex < 8)
+            ++du;
+        if (programIndex >= 8 && programIndex < 12)
+            du += 2;
+        if (programIndex >= 12)
+            du += 3;
+    }
+
+    // Now loop over all of the pixels, stepping in x and y as calculated
+    // above.  ny is always 1; a final partial group in x (w not a multiple
+    // of nx) is handled when the results are written back below.
+    for (uniform int y = y0; y < y1; y += ny) {
+        for (uniform int x = 0; x < w; x += nx)  {
+            // Figure out x,y pixel in NDC
+            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
+            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
+            float ret = 0.f;
+            Ray ray;
+            Isect isect;
+
+            ray.org = 0.f;
+
+            // Poor man's perspective projection
+            ray.dir.x = px;
+            ray.dir.y = py;
+            ray.dir.z = -1.0;
+            vnormalize(ray.dir);
+
+            isect.t   = 1.0e+17;
+            isect.hit = 0;
+
+            for (uniform int snum = 0; snum < 3; ++snum)
+                ray_sphere_intersect(isect, ray, spheres[snum]);
+            ray_plane_intersect(isect, ray, plane);
+
+            // Note use of 'coherent' if statement; the set of rays we
+            // trace will often all hit or all miss the scene
+            cif (isect.hit)
+                ret = ambient_occlusion(isect, plane, spheres, rngstate);
+
+            // This is a little grungy; we have results for
+            // programCount-worth of values.  Because we're doing 2x2
+            // subsamples, we need to peel them off in groups of four,
+            // average the four values for each pixel, and update the
+            // output image.
+            //
+            // Store the varying value to a uniform array of the same size.
+            // See the discussion about communication among program
+            // instances in the ispc user's manual for more discussion on
+            // this idiom.
+            uniform float retArray[programCount];
+            retArray[programIndex] = ret;
+
+            // offset to the first pixel in the image
+            uniform int offset = 3 * (y * w + x);
+            // Group p/4 belongs to pixel x + p/4.  Stop at the end of the
+            // scanline so a trailing partial group never writes into the
+            // next row or past the end of the image.
+            for (uniform int p = 0; p < programCount && x + p / 4 < w;
+                 p += 4, offset += 3) {
+                // Get the four sample values for this pixel
+                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
+                    retArray[p+3];
+
+                // Normalize by number of samples taken
+                sumret /= nsubsamples * nsubsamples; 
+                
+                // Store result in the image
+                image[offset+0] = sumret;
+                image[offset+1] = sumret;
+                image[offset+2] = sumret;
+            }
+        }
+    }
+}
+
+
+// Exported single-call entry point: renders the whole image by running
+// ao_scanlines() once over the scanline range [0, h).
+export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, 
+                    uniform float image[]) {
+    ao_scanlines(0, h, w, h, nsubsamples, image);
+}
+
+
+// Task body: renders exactly one scanline, the row whose index equals
+// this task's taskIndex, i.e. the range [taskIndex, taskIndex+1).
+static void task ao_task(uniform int width, uniform int height, 
+                         uniform int nsubsamples, uniform float image[]) {
+    ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
+}
+
+
+// Exported task-parallel entry point: launches h instances of ao_task,
+// one per image scanline.
+export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
+                          uniform float image[]) {
+    launch[h] ao_task(w, h, nsubsamples, image);
+}
--- a/examples/aobench_instrumented/aobench_instrumented.vcxproj
+++ b/examples/aobench_instrumented/aobench_instrumented.vcxproj
@@ -0,0 +1,174 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="ao.cpp" />
+    <ClCompile Include="instrument.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="ao.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>aobench_instrumented</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+    <PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+    <PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+    <PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+    <PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/aobench_instrumented/instrument.cpp
+++ b/examples/aobench_instrumented/instrument.cpp
@@ -0,0 +1,94 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "instrument.h"
+#include <stdio.h>
+#include <assert.h>
+#include <string>
+#include <map>
+
+// Statistics accumulated for one instrumented callsite.
+struct CallInfo {
+    CallInfo() : count(0), laneCount(0), allOff(0) { }
+    int count;      // number of times the callsite was reached
+    int laneCount;  // sum of the set mask bits over all of those calls
+    int allOff;     // number of calls that arrived with a mask of zero
+};
+
+// Statistics for every callsite seen so far, keyed by the
+// "fn(line) - note" string that ISPCInstrument() builds.
+static std::map<std::string, CallInfo> callInfo;
+
+// Returns the number of set bits in i.
+//
+// The value is processed as an unsigned bit pattern.  Right-shifting a
+// negative int usually replicates the sign bit, so a loop of the form
+// "while (i) i >>= 1" never terminates for a mask whose top bit is set.
+int countbits(int i) {
+    unsigned int bits = static_cast<unsigned int>(i);
+    int ret = 0;
+    while (bits != 0) {
+        bits &= bits - 1;   // clear the lowest set bit
+        ++ret;
+    }
+    return ret;
+}
+
+
+// Callback that the ispc compiler emits calls to when the --instrument
+// command-line flag is given.  Each call is folded into the CallInfo
+// entry keyed by "fn(line) - note": the call count is incremented, the
+// set bits of mask are added to the lane total, and calls with a zero
+// mask are counted separately.
+void
+ISPCInstrument(const char *fn, const char *note, int line, int mask) {
+    // The line number is zero-padded to at least four digits in the key.
+    char lineText[16];
+    sprintf(lineText, "%04d", line);
+
+    std::string key(fn);
+    key += "(";
+    key += lineText;
+    key += ") - ";
+    key += note;
+
+    // operator[] creates a zeroed entry the first time a key is seen.
+    CallInfo &stats = callInfo[key];
+
+    stats.count += 1;
+    stats.laneCount += countbits(mask);
+    if (mask == 0)
+        stats.allOff += 1;
+}
+
+
+// Prints one line per recorded callsite to stdout: the number of calls,
+// how many of them (count and percentage) had a zero mask, and the
+// percentage of active lanes.  The comment in the original notes that
+// ao.cpp calls this once program execution is done.
+void
+ISPCPrintInstrument() {
+    typedef std::map<std::string, CallInfo>::iterator CallIter;
+
+    for (CallIter it = callInfo.begin(); it != callInfo.end(); ++it) {
+        CallInfo &stats = it->second;
+
+        // The 4.f divisor treats every call as covering four lanes --
+        // presumably the width of the sse2 target this example is built
+        // for; confirm before using with another target.
+        float activePct = 100.f * stats.laneCount / (4.f * stats.count);
+        float allOffPct = 100.f * stats.allOff / stats.count;
+
+        printf("%s: %d calls (%d / %.2f%% all off!), %.2f%% active lanes\n",
+               it->first.c_str(), stats.count, stats.allOff, allOffPct,
+               activePct);
+    }
+}
--- a/examples/aobench_instrumented/instrument.h
+++ b/examples/aobench_instrumented/instrument.h
@@ -0,0 +1,45 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifndef INSTRUMENT_H
+#define INSTRUMENT_H 1
+
+#include <stdint.h>
+
+// Declared with C linkage so that the symbol name is not C++-mangled.
+extern "C" { 
+    // Records one hit of an instrumentation point.  fn, note and line
+    // identify the callsite; mask is the bit mask whose set bits are
+    // counted as active lanes (a mask of zero is tallied as "all off").
+    void ISPCInstrument(const char *fn, const char *note, int line, int mask);
+}
+
+// Prints the statistics gathered by ISPCInstrument() to stdout.
+void ISPCPrintInstrument();
+
+#endif // INSTRUMENT_H
--- a/examples/common.mk
+++ b/examples/common.mk
@@ -0,0 +1,74 @@
+
+TASK_CXX=../tasksys.cpp
+TASK_LIB=-lpthread
+TASK_OBJ=objs/tasksys.o
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O2 -m64
+CC=gcc
+CCFLAGS=-Iobjs/ -O2 -m64
+
+LIBS=-lm $(TASK_LIB) -lstdc++
+ISPC=ispc -O2 --arch=x86-64 $(ISPC_FLAGS)
+ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
+	$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
+ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
+
+CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))
+CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))
+OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)
+
+default: $(EXAMPLE)
+
+all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
+
+.PHONY: dirs clean
+
+dirs:
+	/bin/mkdir -p objs/
+
+objs/%.cpp objs/%.o objs/%.h: dirs
+
+# Remove the object directory, editor backups and every binary that the
+# "all" target can produce (including the -scalar variant).
+clean:
+	/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
+
+$(EXAMPLE): $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
+
+objs/%.o: %.cpp dirs $(ISPC_HEADER)
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/%.o: %.c dirs $(ISPC_HEADER)
+	$(CC) $< $(CCFLAGS) -c -o $@
+
+objs/%.o: ../%.cpp dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h
+
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+	$(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+
+objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)
+	$(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h
+
+objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp
+	$(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@
+
+$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
+
+objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC)
+	$(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h
+
+objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
+	$(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@
+
+$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
+
+objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
+	$(ISPC) $< -o $@ --target=generic-1
+
+$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
--- a/examples/deferred/Makefile
+++ b/examples/deferred/Makefile
@@ -0,0 +1,8 @@
+
+EXAMPLE=deferred_shading
+CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
+ISPC_SRC=kernels.ispc
+ISPC_TARGETS=sse2,sse4-x2,avx-x2
+ISPC_FLAGS=--opt=fast-math
+
+include ../common.mk
--- a/examples/deferred/common.cpp
+++ b/examples/deferred/common.cpp
@@ -0,0 +1,210 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <fcntl.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#ifdef ISPC_IS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+#endif
+#ifdef ISPC_IS_LINUX
+  #include <malloc.h>
+#endif
+#include "deferred.h"
+#include "../timing.h"
+
+///////////////////////////////////////////////////////////////////////////
+
+// Returns a block of at least `size` bytes whose address is a multiple of
+// `alignment`, or NULL if the allocation fails.  `alignment` must be a
+// power of two no smaller than sizeof(void*).  Release the block with
+// lAlignedFree().
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#if defined(ISPC_IS_WINDOWS)
+    return _aligned_malloc(size, alignment);
+#elif defined(ISPC_IS_LINUX)
+    return memalign(alignment, size);
+#else
+    // Manual alignment (OS X, and any platform not recognized above):
+    // over-allocate, advance to the next aligned address, and store the
+    // pointer returned by malloc() just below the aligned block so that
+    // lAlignedFree() can recover it.
+    //
+    // The aligned address lies between 1 and `alignment` bytes past
+    // (mem + sizeof(void*)), so `alignment` spare bytes must be reserved;
+    // reserving only alignment-1 leaves the block one byte short whenever
+    // the full step is taken.
+    void *mem = malloc(size + alignment + sizeof(void*));
+    if (mem == NULL)
+        return NULL;
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uintptr_t>(amem) &
+                                        (alignment - 1)));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+// Releases a block obtained from lAlignedMalloc().  Passing NULL is a
+// no-op on every platform.
+static void
+lAlignedFree(void *ptr) {
+#if defined(ISPC_IS_WINDOWS)
+    _aligned_free(ptr);
+#elif defined(ISPC_IS_LINUX)
+    free(ptr);
+#else
+    // The pointer originally returned by malloc() sits just below the
+    // aligned block; there is nothing to look up for a NULL pointer.
+    if (ptr != NULL)
+        free(((void**)ptr)[-1]);
+#endif
+}
+
+
+// Allocates three separate aligned planes (r, g, b) of width*height bytes
+// each.  The plane contents are not initialized here.
+Framebuffer::Framebuffer(int width, int height) {
+    nPixels = width * height;
+
+    const size_t planeBytes = nPixels;
+    r = static_cast<uint8_t *>(lAlignedMalloc(planeBytes, ALIGNMENT_BYTES));
+    g = static_cast<uint8_t *>(lAlignedMalloc(planeBytes, ALIGNMENT_BYTES));
+    b = static_cast<uint8_t *>(lAlignedMalloc(planeBytes, ALIGNMENT_BYTES));
+}
+
+
+// Releases the three channel planes allocated by the constructor.
+Framebuffer::~Framebuffer() {
+    uint8_t *planes[3] = { r, g, b };
+    for (int i = 0; i < 3; ++i)
+        lAlignedFree(planes[i]);
+}
+
+
+// Sets every byte of the r, g and b planes to zero.
+void
+Framebuffer::clear() {
+    uint8_t *planes[3] = { r, g, b };
+    for (int i = 0; i < 3; ++i)
+        memset(planes[i], 0, nPixels);
+}
+
+
+// Loads a deferred-shading input file from `path`.
+//
+// The file is read as an ispc::InputHeader followed by one data chunk of
+// header.inputDataChunkSize bytes.  The chunk is held in a single aligned
+// allocation, and each pointer in input->arrays is set to the address
+// inside the chunk given by the matching header.inputDataArrayOffsets[]
+// entry.
+//
+// Returns a new InputData on success; pass it to DeleteInputData() when
+// finished with it.  Returns NULL if the file cannot be opened, is
+// truncated, or the chunk cannot be allocated; the file handle and any
+// partial allocations are released before returning.
+//
+// NOTE(review): the offsets read from the file are not validated against
+// inputDataChunkSize, so a corrupt file can yield out-of-range pointers.
+InputData *
+CreateInputDataFromFile(const char *path) {
+    FILE *in = fopen(path, "rb");
+    if (!in) return 0;
+
+    InputData *input = new InputData;
+    // new leaves the members uninitialized; the cleanup path below relies
+    // on chunk being NULL until the allocation has actually succeeded.
+    input->chunk = NULL;
+
+    // Load header
+    bool ok = (fread(&input->header, sizeof(ispc::InputHeader), 1, in) == 1);
+    if (!ok)
+        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
+
+    // Load data chunk
+    if (ok) {
+        input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
+                                                 ALIGNMENT_BYTES);
+        if (input->chunk == NULL) {
+            fprintf(stderr, "Out of memory reading file \"%s\"\n", path);
+            ok = false;
+        }
+        else if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
+            fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
+            ok = false;
+        }
+    }
+
+    // The file is no longer needed whether or not loading succeeded.
+    fclose(in);
+
+    if (!ok) {
+        if (input->chunk != NULL)
+            lAlignedFree(input->chunk);
+        delete input;
+        return NULL;
+    }
+
+    // Point each array at its location inside the chunk.
+    input->arrays.zBuffer =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
+    input->arrays.normalEncoded_x =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
+    input->arrays.normalEncoded_y =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
+    input->arrays.specularAmount =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
+    input->arrays.specularPower =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
+    input->arrays.albedo_x =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
+    input->arrays.albedo_y =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
+    input->arrays.albedo_z =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
+    input->arrays.lightPositionView_x =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
+    input->arrays.lightPositionView_y =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
+    input->arrays.lightPositionView_z =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
+    input->arrays.lightAttenuationBegin =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
+    input->arrays.lightColor_x =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
+    input->arrays.lightColor_y =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
+    input->arrays.lightColor_z =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
+    input->arrays.lightAttenuationEnd =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];
+
+    return input;
+}
+
+
+// Releases the data chunk owned by `input`.  A NULL `input` (which is
+// what CreateInputDataFromFile() returns on failure) is ignored, and the
+// chunk pointer is cleared so that a repeated call does not free it twice.
+//
+// NOTE(review): only the chunk is freed; the InputData object itself,
+// which CreateInputDataFromFile() allocates with new, is left to the
+// caller -- confirm whether callers delete it.
+void DeleteInputData(InputData *input) {
+    if (input == NULL || input->chunk == NULL)
+        return;
+    lAlignedFree(input->chunk);
+    input->chunk = NULL;
+}
+
+
+// Writes `framebuffer` to `filename` as a binary PPM ("P6") image whose
+// dimensions are input->header.framebufferWidth x framebufferHeight.
+//
+// The framebuffer stores its channels as three separate planes, so they
+// are first interleaved into a temporary R,G,B byte array.  Failures to
+// allocate that array, to open the file, or to write it are reported on
+// stderr; the function returns nothing either way.
+void WriteFrame(const char *filename, const InputData *input,
+                const Framebuffer &framebuffer) {
+    // Deswizzle and copy to interleaved RGB output
+    // Doesn't need to be fast... only happens once
+    const int nPixels = input->header.framebufferWidth *
+        input->header.framebufferHeight;
+    size_t imageBytes = 3 * (size_t)nPixels;
+
+    uint8_t *framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
+    if (framebufferAOS == NULL) {
+        fprintf(stderr, "Unable to allocate image buffer for \"%s\"\n", filename);
+        return;
+    }
+
+    // Every byte of the buffer is written here, so no prior clear is needed.
+    for (int i = 0; i < nPixels; ++i) {
+        framebufferAOS[3 * i + 0] = framebuffer.r[i];
+        framebufferAOS[3 * i + 1] = framebuffer.g[i];
+        framebufferAOS[3 * i + 2] = framebuffer.b[i];
+    }
+
+    // Write out simple PPM file
+    FILE *out = fopen(filename, "wb");
+    if (out == NULL) {
+        fprintf(stderr, "Unable to open \"%s\" for writing\n", filename);
+    }
+    else {
+        bool ok = fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
+                          input->header.framebufferHeight) > 0;
+        // fwrite() of a zero-sized item reports 0 items, which is not an error.
+        if (imageBytes > 0 && fwrite(framebufferAOS, imageBytes, 1, out) != 1)
+            ok = false;
+        // fclose() flushes, so a failure there also means lost data.
+        if (fclose(out) != 0)
+            ok = false;
+        if (!ok)
+            fprintf(stderr, "Error writing file \"%s\"\n", filename);
+    }
+
+    lAlignedFree(framebufferAOS);
+}
--- a/examples/deferred/data/pp1280x720.bin
+++ b/examples/deferred/data/pp1280x720.bin
--- a/examples/deferred/data/pp1920x1200.bin
+++ b/examples/deferred/data/pp1920x1200.bin
--- a/examples/deferred/deferred.h
+++ b/examples/deferred/deferred.h
@@ -0,0 +1,108 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifndef DEFERRED_H
+#define DEFERRED_H
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+#define MAX_LIGHTS 1024
+
+// Index of each input array within a loaded data file.  common.cpp uses
+// these values to index header.inputDataArrayOffsets[] and so locate the
+// corresponding array inside the data chunk.  The enumerators are
+// numbered consecutively from zero.
+enum InputDataArraysEnum {
+    idaZBuffer = 0,
+    idaNormalEncoded_x,
+    idaNormalEncoded_y,
+    idaSpecularAmount,
+    idaSpecularPower,
+    idaAlbedo_x,
+    idaAlbedo_y,
+    idaAlbedo_z,
+    idaLightPositionView_x,
+    idaLightPositionView_y,
+    idaLightPositionView_z,
+    idaLightAttenuationBegin,
+    idaLightColor_x,
+    idaLightColor_y,
+    idaLightColor_z,
+    idaLightAttenuationEnd,
+
+    // Number of entries above; keep this last.
+    idaNum
+};
+
+#ifndef ISPC
+
+#include <stdint.h>
+#include "kernels_ispc.h"
+
+#define ALIGNMENT_BYTES 64
+
+#define MAX_LIGHTS 1024
+
+#define VISUALIZE_LIGHT_COUNT 0
+
+// One loaded input file, as filled in by CreateInputDataFromFile().
+struct InputData
+{
+    // Header read from the start of the file.
+    ispc::InputHeader header;
+    // Pointers into `chunk`, one per array, set from the header's offsets.
+    ispc::InputDataArrays arrays;
+    // Aligned allocation holding the file's data chunk; freed by
+    // DeleteInputData().
+    uint8_t *chunk;
+};
+
+
+// Planar 8-bit RGB framebuffer: r, g and b each point to a separate
+// aligned buffer of width*height bytes owned by the object.
+struct Framebuffer {
+    // Allocates the three channel planes; their contents are not
+    // initialized until clear() is called or they are written.
+    Framebuffer(int width, int height);
+    // Frees the three channel planes.
+    ~Framebuffer();
+
+    // Sets every byte of all three planes to zero.
+    void clear();
+
+    uint8_t *r, *g, *b;
+
+private:
+    int nPixels;
+    // Copying is disabled by declaring both copy operations private: a
+    // copy would share the planes and free them twice.  The assignment
+    // operator has to take a reference for this to work -- a declaration
+    // taking a pointer does not suppress the compiler-generated copy
+    // assignment.
+    Framebuffer(const Framebuffer &);
+    Framebuffer &operator=(const Framebuffer &);
+};
+
+
+// Loads the input file at `path`; returns NULL if it cannot be opened or
+// read in full.
+InputData *CreateInputDataFromFile(const char *path);
+// Frees the data chunk held by `input`.
+void DeleteInputData(InputData *input);
+// Writes `framebuffer` to `filename` as a binary PPM ("P6") image using
+// the framebuffer dimensions stored in input->header.
+void WriteFrame(const char *filename, const InputData *input,
+                const Framebuffer &framebuffer);
+// The functions below are defined outside this file's visible sources --
+// presumably in dynamic_c.cpp and dynamic_cilk.cpp, which the example's
+// Makefile lists; confirm there for their contracts.
+void InitDynamicC(InputData *input);
+void InitDynamicCilk(InputData *input);
+void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
+void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
+
+#endif // !ISPC
+
+#endif // DEFERRED_H
--- a/examples/deferred/deferred_shading.vcxproj
+++ b/examples/deferred/deferred_shading.vcxproj
@@ -0,0 +1,178 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <!-- Project identity: GUID, project-type keyword and root namespace. -->
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>deferred_shading</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <!-- C++ sources of the example.  tasksys.cpp is not local to this
+       project; it is picked up from the parent examples directory. -->
+  <ItemGroup>
+    <ClCompile Include="common.cpp" />
+    <ClCompile Include="dynamic_c.cpp" />
+    <ClCompile Include="dynamic_cilk.cpp" />
+    <ClCompile Include="main.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+  <!-- Custom build step that runs the ispc compiler on kernels.ispc.
+       Every configuration uses -O2 and the multi-target list
+       sse2,sse4-x2,avx-x2; the Win32 configurations additionally pass
+       arch=x86.  The step writes the object file(s) and the generated
+       header kernels_ispc.h into $(TargetDir), which is also on the C++
+       include path.  The Outputs lists name one dispatch object plus one
+       object per instruction set (_sse2, _sse4, _avx). -->
+  <ItemGroup>
+    <CustomBuild Include="kernels.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/deferred/dynamic_c.cpp
+++ b/examples/deferred/dynamic_c.cpp
@@ -0,0 +1,870 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include <algorithm>
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif // ISPC_IS_LINUX
+
+// MIN_TILE_WIDTH and MIN_TILE_HEIGHT (currently 16x16; tile widths must be a
+// multiple of the SIMD width, i.e. 8 for ispc sse4x2) are defined in
+// deferred.h, which is included above, so they are not repeated here.
+
+// Number of levels in the min/max Z tree created by InitDynamicC().
+#define DYNAMIC_TREE_LEVELS 5
+// If this is set to 1 then the result will be identical to the static version
+#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
+
+// Allocates `size` bytes at an address that is a multiple of `alignment`.
+// `alignment` must be a positive power of two (the portable path masks the
+// address with alignment - 1).  Returns a null pointer when the underlying
+// allocation fails.  The result must be released with lAlignedFree() and
+// nothing else, because the portable path does not return the address that
+// malloc() handed out.
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#if defined(ISPC_IS_WINDOWS)
+    return _aligned_malloc(size, alignment);
+#elif defined(ISPC_IS_LINUX)
+    return memalign(alignment, size);
+#else
+    // Apple, and any platform without a dedicated branch: over-allocate with
+    // malloc(), step past a pointer-sized header, round up to the next
+    // aligned address and remember the original pointer in the header slot
+    // just below the returned block.
+    //
+    // The round-up below always advances by 1..alignment bytes (a full
+    // `alignment` when the address after the header is already aligned), so
+    // the allocation has to reserve `alignment` extra bytes, not
+    // alignment - 1, or the last byte of the block would lie out of bounds.
+    void *mem = malloc(size + alignment + sizeof(void*));
+    if (!mem)
+        return 0;
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uintptr_t>(amem) &
+                                        (alignment - 1)));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+// Releases a block obtained from lAlignedMalloc().  A null pointer is
+// accepted and ignored on every path.
+static void
+lAlignedFree(void *ptr) {
+#if defined(ISPC_IS_WINDOWS)
+    _aligned_free(ptr);
+#elif defined(ISPC_IS_LINUX)
+    free(ptr);
+#else
+    // The pointer malloc() originally returned is stored immediately below
+    // the aligned block; there is no such header to read for a null pointer.
+    if (ptr)
+        free(((void**)ptr)[-1]);
+#endif
+}
+
+
+// Computes the view-space depth range covered by the pixel rectangle
+// [tileStartX, tileEndX) x [tileStartY, tileEndY) of the depth buffer.
+//
+// Each depth-buffer value z is unprojected with
+// cameraProj_43 / (z - cameraProj_33).  Only results inside
+// [cameraNear, cameraFar) take part, which leaves out skybox/background and
+// otherwise invalid pixels.  If no pixel qualifies, *minZ is cameraFar and
+// *maxZ is cameraNear.
+static void
+ComputeZBounds(int tileStartX, int tileEndX,
+               int tileStartY, int tileEndY,
+               // G-buffer data
+               float zBuffer[],
+               int gBufferWidth,
+               // Camera data
+               float cameraProj_33, float cameraProj_43,
+               float cameraNear, float cameraFar,
+               // Output
+               float *minZ, float *maxZ)
+{
+    // Start from an inverted range so the first accepted sample sets both ends.
+    float nearest = cameraFar;
+    float farthest = cameraNear;
+
+    for (int row = tileStartY; row < tileEndY; ++row) {
+        const int rowOffset = row * gBufferWidth;
+        for (int col = tileStartX; col < tileEndX; ++col) {
+            const float depth = zBuffer[rowOffset + col];
+            const float viewZ = cameraProj_43 / (depth - cameraProj_33);
+
+            // Reject anything outside [cameraNear, cameraFar); a NaN fails
+            // the first comparison and is rejected as well.
+            if (!(viewZ < cameraFar) || !(viewZ >= cameraNear))
+                continue;
+
+            nearest = std::min(nearest, viewZ);
+            farthest = std::max(farthest, viewZ);
+        }
+    }
+
+    *minZ = nearest;
+    *maxZ = farthest;
+}
+
+
+// Computes the depth bounds of every tile in tile row `tileY`.
+//
+// Tile tileX covers pixels [tileX * tileWidth, (tileX + 1) * tileWidth) in x
+// and [tileY * tileHeight, (tileY + 1) * tileHeight) in y.  Its bounds are
+// stored in minZArray[tileX] and maxZArray[tileX], so both arrays need at
+// least numTilesX entries.  numTilesY is accepted but not used.
+static void
+ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
+                  int numTilesX, int numTilesY,
+                  // G-buffer data
+                  float zBuffer[],
+                  int gBufferWidth,
+                  // Camera data
+                  float cameraProj_33, float cameraProj_43,
+                  float cameraNear, float cameraFar,
+                  // Output
+                  float minZArray[],
+                  float maxZArray[])
+{
+    // The vertical extent is the same for every tile of the row.
+    const int rowStartY = tileY * tileHeight;
+    const int rowEndY = rowStartY + tileHeight;
+
+    for (int tileX = 0; tileX < numTilesX; ++tileX) {
+        const int startX = tileX * tileWidth;
+        // ComputeZBounds stores its two results only once, on exit, so it
+        // can write straight into the output slots.
+        ComputeZBounds(startX, startX + tileWidth, rowStartY, rowEndY,
+                       zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
+                       cameraNear, cameraFar,
+                       &minZArray[tileX], &maxZArray[tileX]);
+    }
+}
+
+
+// Multi-level grid of per-tile view-space depth bounds.
+//
+// Level 0 holds one (minZ, maxZ) pair per base tile of
+// tileWidth x tileHeight pixels.  Each entry of level N > 0 is the min/max
+// over the (up to) 2x2 block of level N-1 entries below it, so tiles double
+// in size per level while the tile counts halve, rounding up.
+//
+// The object owns its arrays (allocated with lAlignedMalloc) and is
+// therefore not copyable.
+class MinMaxZTree
+{
+public:
+    // Allocates the per-level arrays; their contents are undefined until
+    // Update() has been called.
+    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
+    // Levels must be small enough that neither dimension goes below one tile
+    MinMaxZTree(
+        int tileWidth, int tileHeight, int levels,
+        int gBufferWidth, int gBufferHeight)
+        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
+    {
+        mNumTilesX = gBufferWidth / mTileWidth;
+        mNumTilesY = gBufferHeight / mTileHeight;
+
+        // Allocate arrays
+        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        for (int i = 0; i < mLevels; ++i) {
+            int x = NumTilesX(i);
+            int y = NumTilesY(i);
+            assert(x > 0);
+            assert(y > 0);
+            // NOTE: If the following two asserts fire it probably means that
+            // the base tile dimensions do not evenly divide the G-buffer dimensions
+            assert(x * (mTileWidth << i) >= gBufferWidth);
+            assert(y * (mTileHeight << i) >= gBufferHeight);
+            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+        }
+    }
+
+    // Recomputes every level from the given depth buffer.
+    //   zBuffer                 depth values, one float per pixel
+    //   gBufferPitchInElements  distance in floats between consecutive rows
+    //   cameraProj_33/_43       projection terms used to unproject depth
+    //   cameraNear/cameraFar    only depths in [near, far) are considered
+    void Update(float *zBuffer, int gBufferPitchInElements,
+        float cameraProj_33, float cameraProj_43,
+        float cameraNear, float cameraFar)
+    {
+        // Level 0: bounds of each base tile, one tile row at a time.
+        for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
+            ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
+                              zBuffer, gBufferPitchInElements,
+                              cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                              mMinZArrays[0] + (tileY * mNumTilesX),
+                              mMaxZArrays[0] + (tileY * mNumTilesX));
+        }
+
+        // Generate other levels
+        for (int level = 1; level < mLevels; ++level) {
+            int destTilesX = NumTilesX(level);
+            int destTilesY = NumTilesY(level);
+            int srcLevel = level - 1;
+            int srcTilesX = NumTilesX(srcLevel);
+            int srcTilesY = NumTilesY(srcLevel);
+            for (int y = 0; y < destTilesY; ++y) {
+                for (int x = 0; x < destTilesX; ++x) {
+                    // Top-left entry of the 2x2 source block; it always exists
+                    // because the tile counts round up.
+                    int srcX = x << 1;
+                    int srcY = y << 1;
+                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
+                    // TODO: SSE branchless min/max is probably better...
+                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    if (srcX + 1 < srcTilesX) {
+                        // Right neighbour
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
+                                                                    (srcX + 1)]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
+                                                                    (srcX + 1)]);
+                        if (srcY + 1 < srcTilesY) {
+                            // Diagonal neighbour
+                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                        }
+                    }
+                    if (srcY + 1 < srcTilesY) {
+                        // Neighbour in the next row
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                    }
+                    mMinZArrays[level][y * destTilesX + x] = minZ;
+                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
+                }
+            }
+        }
+    }
+
+    // Releases every per-level array and the two arrays of level pointers.
+    ~MinMaxZTree() {
+        for (int i = 0; i < mLevels; ++i) {
+            lAlignedFree(mMinZArrays[i]);
+            lAlignedFree(mMaxZArrays[i]);
+        }
+        lAlignedFree(mMinZArrays);
+        lAlignedFree(mMaxZArrays);
+    }
+
+    int Levels() const { return mLevels; }
+
+    // These round UP, so beware that the last tile for a given level may not be completely full
+    // TODO: Verify this...
+    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
+    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
+    // Tile size in pixels at the given level (doubles with every level).
+    int TileWidth(int level = 0) const { return (mTileWidth << level); }
+    int TileHeight(int level = 0) const { return (mTileHeight << level); }
+
+    // Depth bounds of tile (tileX, tileY) at `level`, as of the last
+    // Update().  No range checking is performed.
+    float MinZ(int level, int tileX, int tileY) const {
+        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+    float MaxZ(int level, int tileX, int tileY) const {
+        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+
+private:
+    // Copying is disabled (declared, never defined): a copy would share the
+    // raw array pointers and both destructors would free them.
+    MinMaxZTree(const MinMaxZTree &);
+    MinMaxZTree &operator=(const MinMaxZTree &);
+
+    int mTileWidth;
+    int mTileHeight;
+    int mLevels;
+    int mNumTilesX;   // tile counts at level 0
+    int mNumTilesY;
+
+    // One array for each "level" in the tree
+    float **mMinZArrays;
+    float **mMaxZArrays;
+};
+
+static MinMaxZTree *gMinMaxZTree = 0;
+
+// Creates the global min/max Z tree for the framebuffer size given in
+// input->header, using MIN_TILE_WIDTH x MIN_TILE_HEIGHT base tiles and
+// DYNAMIC_TREE_LEVELS levels.  May be called again (e.g. for a different
+// input): the previously created tree is destroyed first instead of leaked.
+void InitDynamicC(InputData *input) {
+    // delete on a null pointer is a no-op, so the first call needs no check.
+    // The pointer is cleared so it does not dangle if the allocation throws.
+    delete gMinMaxZTree;
+    gMinMaxZTree = 0;
+
+    gMinMaxZTree =
+        new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
+                        input->header.framebufferWidth,
+                        input->header.framebufferHeight);
+}
+
+
+/* Distributes a tile's light list over its four sub-tiles.
+
+   The tile is split at pixel (tileMidX, tileMidY).  Sub-tiles are numbered
+   0..3 in the order 00, 10, 01, 11: bit 0 of the number selects the side of
+   the X split plane and bit 1 the side of the Y split plane.  A light is
+   kept for a sub-tile when its sphere of radius attenuationEnd overlaps the
+   sub-tile's [minZ, maxZ] depth range and is not entirely on the other side
+   of either split plane.
+
+   The indices kept for sub-tile s are written to
+   subtileIndices[s * subtileIndicesPitch ...], in input order, and their
+   count to subtileNumLights[s].  No check is made that a count stays within
+   subtileIndicesPitch. */
+static void
+SplitTileMinMax(
+    int tileMidX, int tileMidY,
+    // Subtile data (00, 10, 01, 11)
+    float subtileMinZ[],
+    float subtileMaxZ[],
+    // G-buffer data
+    int gBufferWidth, int gBufferHeight,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    // Light Data
+    int lightIndices[],
+    int numLights,
+    float light_positionView_x_array[],
+    float light_positionView_y_array[],
+    float light_positionView_z_array[],
+    float light_attenuationEnd_array[],
+    // Outputs
+    int subtileIndices[],
+    int subtileIndicesPitch,
+    int subtileNumLights[]
+    )
+{
+    const float halfWidth = 0.5f * (float)gBufferWidth;
+    const float halfHeight = 0.5f * (float)gBufferHeight;
+
+    // Split planes through the eye: entry 0 is the X split (normal in the
+    // x/z plane), entry 1 the Y split (normal in the y/z plane).
+    float planeXY[2] = { -(cameraProj_11 * halfWidth),
+                          (cameraProj_22 * halfHeight) };
+    float planeZ[2] = { tileMidX - halfWidth,
+                        tileMidY - halfHeight };
+
+    // Normalize both plane normals.
+    for (int p = 0; p < 2; ++p) {
+        const float invLen = 1.f / sqrtf(planeXY[p] * planeXY[p] +
+                                         planeZ[p] * planeZ[p]);
+        planeXY[p] *= invLen;
+        planeZ[p] *= invLen;
+    }
+
+    // Next write position inside each sub-tile's slice of subtileIndices.
+    int writePos[4];
+    for (int s = 0; s < 4; ++s)
+        writePos[s] = s * subtileIndicesPitch;
+
+    for (int n = 0; n < numLights; ++n) {
+        const int lightIndex = lightIndices[n];
+
+        const float lightX = light_positionView_x_array[lightIndex];
+        const float lightY = light_positionView_y_array[lightIndex];
+        const float lightZ = light_positionView_z_array[lightIndex];
+        const float radius = light_attenuationEnd_array[lightIndex];
+        const float negRadius = -radius;
+
+        // Depth test against each sub-tile's own z bounds.
+        bool keep[4];
+        for (int s = 0; s < 4; ++s) {
+            keep[s] = (lightZ - subtileMinZ[s] >= negRadius) &&
+                (subtileMaxZ[s] - lightZ >= negRadius);
+        }
+
+        // Signed distances from the light centre to the two split planes.
+        const float distX = lightZ * planeZ[0] +
+            lightX * planeXY[0];
+        const float distY = lightZ * planeZ[1] +
+            lightY * planeXY[1];
+
+        // A sphere that does not reach the X split plane can only touch the
+        // sub-tiles on its own side: 0 and 2 on the positive side, 1 and 3
+        // on the negative side.
+        if (fabsf(distX) > radius) {
+            const bool positiveX = distX > 0.0f;
+            for (int s = 0; s < 4; ++s) {
+                const bool wantPositive = (s & 1) == 0;
+                keep[s] = keep[s] && (positiveX == wantPositive);
+            }
+        }
+        // Same for the Y split plane: 0 and 1 on the positive side, 2 and 3
+        // on the negative side.
+        if (fabsf(distY) > radius) {
+            const bool positiveY = distY > 0.0f;
+            for (int s = 0; s < 4; ++s) {
+                const bool wantPositive = (s & 2) == 0;
+                keep[s] = keep[s] && (positiveY == wantPositive);
+            }
+        }
+
+        for (int s = 0; s < 4; ++s) {
+            if (keep[s])
+                subtileIndices[writePos[s]++] = lightIndex;
+        }
+    }
+
+    // Turn the final write positions back into per-sub-tile counts.
+    for (int s = 0; s < 4; ++s)
+        subtileNumLights[s] = writePos[s] - s * subtileIndicesPitch;
+}
+
+
+// Dot product of the 3-vectors (x, y, z) and (a, b, c).
+static inline float
+dot3(float x, float y, float z, float a, float b, float c) {
+    const float sum = x*a + y*b + z*c;
+    return sum;
+}
+
+
+// Writes the unit-length version of (x, y, z) to (ox, oy, oz).
+// The inputs are taken by value, so the outputs may be the same variables
+// as the inputs.  A zero-length input is not special-cased.
+static inline void
+normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
+    const float invLength = 1.f / sqrtf(x*x + y*y + z*z);
+    ox = x * invLength;
+    oy = y * invLength;
+    oz = z * invLength;
+}
+
+
+// Maps an 8-bit unsigned normalized value (0..255) to a float in [0, 1].
+static inline float
+Unorm8ToFloat32(uint8_t u) {
+    const float scale = 1.0f / 255.0f;
+    return static_cast<float>(u) * scale;
+}
+
+
+// Maps a float in [0, 1] to an 8-bit unsigned normalized value by scaling
+// with 255 and truncating.  Inputs outside [0, 1] saturate to 0 or 255 and
+// NaN maps to 0, because converting a float whose truncated value does not
+// fit the destination integer type is undefined behavior.
+static inline uint8_t
+Float32ToUnorm8(float f) {
+    // Written as a negated comparison so that NaN takes this branch too.
+    if (!(f > 0.0f))
+        return 0;
+    if (f >= 1.0f)
+        return 255;
+    return (uint8_t)(f * 255.0f);
+}
+
+
+// Converts an IEEE half-precision bit pattern to float by moving the sign,
+// exponent and mantissa fields into their single-precision positions.
+// "Fast" means the exponent is simply re-biased: zeros, denormals,
+// infinities and NaNs get no special treatment, so the result is only exact
+// for normalized inputs.
+static inline float
+half_to_float_fast(uint16_t h) {
+    const uint32_t signField = h & 0x8000u;      // Pick off sign bit
+    const uint32_t exponentField = h & 0x7C00u;  // Pick off exponent bits
+    const uint32_t mantissaField = h & 0x03FFu;  // Pick off mantissa bits
+
+    // Sign: bit 15 of the half becomes bit 31 of the single
+    const uint32_t signBits = signField << 16;
+    // Exponent: unbias the half (15), then bias the single (127)
+    const int32_t rebiased = ((int32_t) (exponentField >> 10)) - 15 + 127;
+    const uint32_t exponentBits = (uint32_t) (rebiased << 23);
+    // Mantissa: 10 bits, left-aligned within the 23-bit single mantissa
+    const uint32_t mantissaBits = mantissaField << 13;
+
+    const uint32_t bits = (signBits | exponentBits | mantissaBits);
+
+    // Copy the bit pattern into a float.  Reading the uint32_t through a
+    // float pointer would violate the aliasing rules; memcpy is the portable
+    // way and compiles to a plain register move.
+    float result;
+    memcpy(&result, &bits, sizeof(result));
+    return result;
+}
+
+
+static void
+ShadeTileC(
+    int32_t tileStartX, int32_t tileEndX,
+    int32_t tileStartY, int32_t tileEndY,
+    int32_t gBufferWidth, int32_t gBufferHeight,
+    const ispc::InputDataArrays &inputData,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    float cameraProj_33, float cameraProj_43,
+    // Light list
+    int32_t tileLightIndices[],
+    int32_t tileNumLights,
+    // UI
+    bool visualizeLightCount,
+    // Output
+    uint8_t framebuffer_r[],
+    uint8_t framebuffer_g[],
+    uint8_t framebuffer_b[]
+    )
+{
+    if (tileNumLights == 0 || visualizeLightCount) {
+        uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
+        for (int32_t y = tileStartY; y < tileEndY; ++y) {
+            for (int32_t x = tileStartX; x < tileEndX; ++x) {
+                int32_t framebufferIndex = (y * gBufferWidth + x);
+                framebuffer_r[framebufferIndex] = c;
+                framebuffer_g[framebufferIndex] = c;
+                framebuffer_b[framebufferIndex] = c;
+            }
+        }
+    } else {
+        float twoOverGBufferWidth = 2.0f / gBufferWidth;
+        float twoOverGBufferHeight = 2.0f / gBufferHeight;
+        
+        for (int32_t y = tileStartY; y < tileEndY; ++y) {
+            float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
+
+            for (int32_t x = tileStartX; x < tileEndX; ++x) {
+                int32_t gBufferOffset = y * gBufferWidth + x;
+                
+                // Reconstruct position and (negative) view vector from G-buffer
+                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+                float Vneg_x, Vneg_y, Vneg_z;
+
+                float z = inputData.zBuffer[gBufferOffset];
+
+                // Compute screen/clip-space position
+                // NOTE: Mind DX11 viewport transform and pixel center!
+                float positionScreen_x = (0.5f + (float)(x)) * 
+                    twoOverGBufferWidth - 1.0f;
+
+                // Unproject depth buffer Z value into view space
+                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                    cameraProj_11;
+                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                    cameraProj_22;
+                
+                // We actually end up with a vector pointing *at* the
+                // surface (i.e. the negative view vector)
+                normalize3(surface_positionView_x, surface_positionView_y, 
+                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+                // Reconstruct normal from G-buffer
+                float surface_normal_x, surface_normal_y, surface_normal_z;
+                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
+                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
+                    
+                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+                float m = sqrtf(4.0f * f - 1.0f);
+                    
+                surface_normal_x = m * (4.0f * normal_x - 2.0f);
+                surface_normal_y = m * (4.0f * normal_y - 2.0f);
+                surface_normal_z = 3.0f - 8.0f * f;
+
+                // Load other G-buffer parameters
+                float surface_specularAmount = 
+                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
+                float surface_specularPower  = 
+                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
+                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+                
+                float lit_x = 0.0f;
+                float lit_y = 0.0f;
+                float lit_z = 0.0f;
+                for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                     ++tileLightIndex) {
+                    int32_t lightIndex = tileLightIndices[tileLightIndex];
+                                        
+                    // Gather light data relevant to initial culling
+                    float light_positionView_x = 
+                        inputData.lightPositionView_x[lightIndex];
+                    float light_positionView_y = 
+                        inputData.lightPositionView_y[lightIndex];
+                    float light_positionView_z = 
+                        inputData.lightPositionView_z[lightIndex];
+                    float light_attenuationEnd = 
+                        inputData.lightAttenuationEnd[lightIndex];
+                    
+                    // Compute light vector
+                    float L_x = light_positionView_x - surface_positionView_x;
+                    float L_y = light_positionView_y - surface_positionView_y;
+                    float L_z = light_positionView_z - surface_positionView_z;
+
+                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+                    
+                    // Clip at end of attenuation
+                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+                    if (distanceToLight2 < light_attenutaionEnd2) {                    
+                        float distanceToLight = sqrtf(distanceToLight2);
+
+                        float distanceToLightRcp = 1.f / distanceToLight;
+                        L_x *= distanceToLightRcp;
+                        L_y *= distanceToLightRcp;
+                        L_z *= distanceToLightRcp;
+
+                        // Start computing brdf
+                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                                           surface_normal_z, L_x, L_y, L_z);
+                    
+                        // Clip back facing
+                        if (NdotL > 0.0f) {
+                            float light_attenuationBegin = 
+                                inputData.lightAttenuationBegin[lightIndex];
+
+                            // Light distance attenuation (linstep)
+                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                            float falloffPosition = (light_attenuationEnd - distanceToLight);
+                            float attenuation = std::min(falloffPosition / lightRange, 1.0f);
+
+                            float H_x = (L_x - Vneg_x);
+                            float H_y = (L_y - Vneg_y);
+                            float H_z = (L_z - Vneg_z);
+                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+                    
+                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                                               surface_normal_z, H_x, H_y, H_z);
+                            NdotH = std::max(NdotH, 0.0f);
+
+                            float specular = powf(NdotH, surface_specularPower);
+                            float specularNorm = (surface_specularPower + 2.0f) * 
+                                (1.0f / 8.0f);
+                            float specularContrib = surface_specularAmount * 
+                                specularNorm * specular;
+
+                            float k = attenuation * NdotL * (1.0f + specularContrib);
+                    
+                            float light_color_x = inputData.lightColor_x[lightIndex];
+                            float light_color_y = inputData.lightColor_y[lightIndex];
+                            float light_color_z = inputData.lightColor_z[lightIndex];
+
+                            float lightContrib_x = surface_albedo_x * light_color_x;
+                            float lightContrib_y = surface_albedo_y * light_color_y;
+                            float lightContrib_z = surface_albedo_z * light_color_z;
+
+                            lit_x += lightContrib_x * k;
+                            lit_y += lightContrib_y * k;
+                            lit_z += lightContrib_z * k;
+                        }
+                    }
+                }
+
+                // Gamma correct
+                float gamma = 1.0 / 2.2f;
+                lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
+                lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
+                lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
+                
+                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+            }
+        }
+    }
+}
+
+
+// Shades tile (tileX, tileY) of the given min/max Z tree level using the
+// lights listed in lightIndices[0..numLights).  The tile is shaded directly
+// when the bottom level has been reached or the list holds fewer than
+// DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE lights; otherwise the list is split among
+// the four child tiles one level down and each child is handled recursively.
+void
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
+                        int *lightIndices, int numLights, 
+                        Framebuffer *framebuffer) {
+    const MinMaxZTree *tree = gMinMaxZTree;
+
+    // Leaf case: shade the whole tile in one go.
+    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
+        int tileW = tree->TileWidth(level);
+        int tileH = tree->TileHeight(level);
+        int x0 = tileX * tileW;
+        int y0 = tileY * tileH;
+        int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
+        int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
+
+        // Nothing to do for tiles that lie entirely off screen.
+        if (x1 <= x0 || y1 <= y0)
+            return;
+
+        ShadeTileC(x0, x1, y0, y1,
+                   input->header.framebufferWidth, input->header.framebufferHeight,
+                   input->arrays,
+                   input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+                   input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                   lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
+                   framebuffer->r, framebuffer->g, framebuffer->b);
+        return;
+    }
+
+    // Otherwise descend one level: the four children of this tile start at
+    // (2 * tileX, 2 * tileY) in the finer level.
+    int childLevel = level - 1;
+    int childX = tileX << 1;
+    int childY = tileY << 1;
+
+    // Pixel coordinates of the X and Y planes that separate the children.
+    int midX = (childX + 1) * tree->TileWidth(childLevel);
+    int midY = (childY + 1) * tree->TileHeight(childLevel);
+
+    // With non-pow-2 framebuffer sizes only 1 or 2 of the children may
+    // exist, so check before reading their entries from the tree.
+    const bool hasRight = (childX + 1 < tree->NumTilesX(childLevel));
+    const bool hasBottom = (childY + 1 < tree->NumTilesY(childLevel));
+
+    // Children are ordered 00, 10, 01, 11.  A missing (offscreen) child keeps
+    // minZ = far and maxZ = near so that all lights are culled for it.
+    float minZ[4];
+    float maxZ[4];
+    for (int i = 0; i < 4; ++i) {
+        minZ[i] = input->header.cameraFar;
+        maxZ[i] = input->header.cameraNear;
+    }
+
+    minZ[0] = tree->MinZ(childLevel, childX, childY);
+    maxZ[0] = tree->MaxZ(childLevel, childX, childY);
+    if (hasRight) {
+        minZ[1] = tree->MinZ(childLevel, childX + 1, childY);
+        maxZ[1] = tree->MaxZ(childLevel, childX + 1, childY);
+    }
+    if (hasBottom) {
+        minZ[2] = tree->MinZ(childLevel, childX, childY + 1);
+        maxZ[2] = tree->MaxZ(childLevel, childX, childY + 1);
+    }
+    if (hasRight && hasBottom) {
+        minZ[3] = tree->MinZ(childLevel, childX + 1, childY + 1);
+        maxZ[3] = tree->MaxZ(childLevel, childX + 1, childY + 1);
+    }
+
+    // One light list of MAX_LIGHTS entries per child, filled by the split.
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+        int subtileLightIndices[4][MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+        __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+    int subtileNumLights[4];
+    SplitTileMinMax(midX, midY, minZ, maxZ,
+                    input->header.framebufferWidth, input->header.framebufferHeight,
+                    input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+                    lightIndices, numLights,
+                    input->arrays.lightPositionView_x,
+                    input->arrays.lightPositionView_y,
+                    input->arrays.lightPositionView_z,
+                    input->arrays.lightAttenuationEnd,
+                    subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
+
+    // Visit the children in the same 00, 10, 01, 11 order: bit 0 of the
+    // child index selects the column, bit 1 selects the row.
+    for (int child = 0; child < 4; ++child) {
+        ShadeDynamicTileRecurse(input, childLevel,
+                                childX + (child & 1), childY + (child >> 1),
+                                subtileLightIndices[child], subtileNumLights[child],
+                                framebuffer);
+    }
+}
+
+
+// Culls lights 0..numLights-1 against the frustum of one screen tile.
+//
+// The frustum is bounded in depth by minZ/maxZ and on its sides by four
+// planes through the origin built from the tile's pixel extents and the
+// projection scale factors.  A light is kept when its signed distance to
+// every one of the six planes is at least -light_attenuationEnd.  The
+// indices of the kept lights are written to tileLightIndices in ascending
+// order and their count is returned; the caller must provide room for up
+// to numLights entries.
+static int
+IntersectLightsWithTileMinMax(
+    int tileStartX, int tileEndX,
+    int tileStartY, int tileEndY,
+    // Tile data
+    float minZ,
+    float maxZ,
+    // G-buffer data
+    int gBufferWidth, int gBufferHeight,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    // Light Data
+    int numLights,
+    float light_positionView_x_array[],
+    float light_positionView_y_array[],
+    float light_positionView_z_array[],
+    float light_attenuationEnd_array[],
+    // Output
+    int tileLightIndices[]
+    )
+{
+    const float halfWidth = 0.5f * (float)gBufferWidth;
+    const float halfHeight = 0.5f * (float)gBufferHeight;
+
+    // Side planes, stored as (x-or-y coefficient, z coefficient).  Planes 0
+    // and 1 constrain X, planes 2 and 3 constrain Y.  The x/y coefficients
+    // are constant over the whole screen; only the z ones depend on the tile.
+    float planeXY[4] = { -(cameraProj_11 * halfWidth),
+                         (cameraProj_11 * halfWidth),
+                         (cameraProj_22 * halfHeight),
+                         -(cameraProj_22 * halfHeight) };
+    float planeZ[4] = { tileEndX - halfWidth,
+                        -tileStartX + halfWidth,
+                        tileEndY - halfHeight,
+                        -tileStartY + halfHeight };
+
+    // Scale each plane to unit length so the dot products below are distances.
+    for (int p = 0; p < 4; ++p) {
+        float invLen = 1.f / sqrtf(planeXY[p] * planeXY[p] + 
+                                   planeZ[p] * planeZ[p]);
+        planeXY[p] *= invLen;
+        planeZ[p] *= invLen;
+    }
+
+    int kept = 0;
+    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
+        const float lz = light_positionView_z_array[lightIndex];
+        const float negRadius = -light_attenuationEnd_array[lightIndex];
+
+        // Depth test against the tile's min and max Z first; it is the
+        // cheapest and needs only the light's z coordinate.
+        if (!((lz - minZ >= negRadius) && (maxZ - lz >= negRadius)))
+            continue;
+
+        const float lx = light_positionView_x_array[lightIndex];
+        const float ly = light_positionView_y_array[lightIndex];
+
+        // Then the four side planes; stop at the first one that rejects.
+        bool inside = true;
+        for (int p = 0; p < 4 && inside; ++p) {
+            const float coord = (p < 2) ? lx : ly;
+            float dist = lz * planeZ[p] + coord * planeXY[p];
+            inside = (dist >= negRadius);
+        }
+
+        // Pack and store intersecting lights
+        if (inside)
+            tileLightIndices[kept++] = lightIndex;
+    }
+
+    return kept;
+}
+
+
+// Shades one root tile of the min/max Z tree: culls MAX_LIGHTS lights
+// against the tile's frustum (side planes plus the tile's min/max Z) and
+// passes the resulting list to ShadeDynamicTileRecurse.
+void
+ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
+                 Framebuffer *framebuffer) {
+    const MinMaxZTree *tree = gMinMaxZTree;
+
+    // Pixel rectangle covered by the tile, clamped to the framebuffer
+    int tileW = tree->TileWidth(level);
+    int tileH = tree->TileHeight(level);
+    int x0 = tileX * tileW;
+    int y0 = tileY * tileH;
+    int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
+    int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
+
+    // Depth range recorded for the tile
+    float tileMinZ = tree->MinZ(level, tileX, tileY);
+    float tileMaxZ = tree->MaxZ(level, tileX, tileY);
+
+    // Root tiles get the full 6-plane cull; the recursion below only has to
+    // split the surviving list.
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+        int lightIndices[MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+        __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+    int survivors = IntersectLightsWithTileMinMax(
+        x0, x1, y0, y1, tileMinZ, tileMaxZ,
+        input->header.framebufferWidth, input->header.framebufferHeight,
+        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+        MAX_LIGHTS,
+        input->arrays.lightPositionView_x,
+        input->arrays.lightPositionView_y,
+        input->arrays.lightPositionView_z,
+        input->arrays.lightAttenuationEnd,
+        lightIndices);
+
+    // Start the recursive subdivision from this tile
+    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
+                            survivors, framebuffer);
+}
+
+
+// Shades a whole frame serially: refreshes the global min/max Z tree from
+// the current depth buffer, then shades every tile of the tree's coarsest
+// level in row-major order.
+void
+DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
+{
+    MinMaxZTree *tree = gMinMaxZTree;
+
+    // Rebuild the tree for this frame's depth values
+    tree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
+                 input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                 input->header.cameraNear, input->header.cameraFar);
+
+    // The last level is the root of the tree
+    const int topLevel = tree->Levels() - 1;
+    const int tilesAcross = tree->NumTilesX(topLevel);
+    const int tilesDown = tree->NumTilesY(topLevel);
+
+    for (int row = 0; row < tilesDown; ++row) {
+        for (int col = 0; col < tilesAcross; ++col)
+            ShadeDynamicTile(input, topLevel, col, row, framebuffer);
+    }
+}
--- a/examples/deferred/dynamic_cilk.cpp
+++ b/examples/deferred/dynamic_cilk.cpp
@@ -0,0 +1,398 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef __cilk
+
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include <algorithm>
+#include <assert.h>
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif // ISPC_IS_LINUX
+
+// Dimensions, in pixels, of a tile at level 0 of the min/max Z tree (passed
+// to the MinMaxZTreeCilk constructor by InitDynamicCilk).
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+
+
+// Number of levels in the min/max Z tree; tiles at the topmost level
+// (DYNAMIC_TREE_LEVELS - 1) are the root tiles that get dispatched.
+#define DYNAMIC_TREE_LEVELS 5
+// A tile is subdivided only while it has at least this many lights.
+// If this is set to 1 then the result will be identical to the static version
+#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
+
+// Allocates `size` bytes at an address that is a multiple of `alignment`
+// and returns it; the result must be released with lAlignedFree().
+// `alignment` must be a power of two (the Apple path masks with
+// alignment - 1).  Returns a null pointer if the underlying allocation
+// fails.
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
+    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    // Align by hand: over-allocate, step past a pointer-sized header, round
+    // up to the next aligned address and stash the pointer that malloc()
+    // returned just below it so lAlignedFree() can recover it.
+    //
+    // The round-up below advances by between 1 and `alignment` bytes (a full
+    // `alignment` when the address is already aligned), so a full
+    // `alignment` bytes of slack must be reserved; reserving only
+    // alignment - 1 would leave the block one byte short in that case.
+    void *mem = malloc(size + alignment + sizeof(void*));
+    if (!mem)
+        return 0;
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
+                                        (alignment - 1)));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+// Releases memory obtained from lAlignedMalloc().  Passing a null pointer
+// is a no-op on every platform.
+static void
+lAlignedFree(void *ptr) {
+#ifdef ISPC_IS_WINDOWS
+    _aligned_free(ptr);
+#endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);
+#endif
+#ifdef ISPC_IS_APPLE
+    // lAlignedMalloc() stored the pointer returned by malloc() in the slot
+    // just below the aligned address.  A null pointer has no such slot, so
+    // it must not be dereferenced.
+    if (ptr)
+        free(((void**)ptr)[-1]);
+#endif
+}
+
+
+// Pyramid of per-tile minimum and maximum Z values.
+//
+// Level 0 holds one min and one max entry per (tileWidth x tileHeight) tile
+// of the G-buffer.  Each higher level has half as many tiles in X and Y
+// (rounded up) with tiles twice as large, and stores the min/max over the
+// up to four tiles beneath each of its entries.  Update() rebuilds every
+// level from a depth buffer; MinZ()/MaxZ() read single entries.
+//
+// The object owns its per-level arrays and frees them in the destructor, so
+// it is deliberately not copyable.
+class MinMaxZTreeCilk
+{
+public:
+    // Allocates (but does not fill) the arrays for `levels` levels.
+    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
+    // Levels must be small enough that neither dimension goes below one tile
+    MinMaxZTreeCilk(
+        int tileWidth, int tileHeight, int levels,
+        int gBufferWidth, int gBufferHeight)
+        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
+    {
+        mNumTilesX = gBufferWidth / mTileWidth;
+        mNumTilesY = gBufferHeight / mTileHeight;
+        
+        // Allocate arrays
+        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        for (int i = 0; i < mLevels; ++i) {
+            int x = NumTilesX(i);
+            int y = NumTilesY(i);
+            assert(x > 0);
+            assert(y > 0);
+            // NOTE: If the following two asserts fire it probably means that
+            // the base tile dimensions do not evenly divide the G-buffer dimensions
+            assert(x * (mTileWidth << i) >= gBufferWidth);
+            assert(y * (mTileHeight << i) >= gBufferHeight);
+            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+        }
+    }
+
+    // Recomputes every level of the tree from zBuffer.  Level 0 is filled by
+    // ispc::ComputeZBoundsRow, one call per row of tiles; each higher level
+    // is then reduced from the level below it.
+    void Update(float *zBuffer, int gBufferPitchInElements,
+        float cameraProj_33, float cameraProj_43,
+        float cameraNear, float cameraFar)
+    {
+        // Compute level 0 in parallel. Outer loops is here since we use Cilk
+        _Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
+            ispc::ComputeZBoundsRow(tileY,
+                mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
+                zBuffer, gBufferPitchInElements,
+                cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                mMinZArrays[0] + (tileY * mNumTilesX),
+                mMaxZArrays[0] + (tileY * mNumTilesX));
+        }
+
+        // Generate other levels
+        // NOTE: We currently don't use ispc here since it's sort of an
+        // awkward gather-based reduction Using SSE odd pack/unpack
+        // instructions might actually work here when we need to optimize
+        for (int level = 1; level < mLevels; ++level) {
+            int destTilesX = NumTilesX(level);
+            int destTilesY = NumTilesY(level);
+            int srcLevel = level - 1;
+            int srcTilesX = NumTilesX(srcLevel);
+            int srcTilesY = NumTilesY(srcLevel);
+            _Cilk_for (int y = 0; y < destTilesY; ++y) {
+                for (int x = 0; x < destTilesX; ++x) {
+                    // Top-left of the 2x2 group of source tiles under (x, y)
+                    int srcX = x << 1;
+                    int srcY = y << 1;
+                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
+                    // TODO: SSE branchless min/max is probably better...
+                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    if (srcX + 1 < srcTilesX) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + 
+                                                                    (srcX + 1)]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
+                                                                    (srcX + 1)]);
+                        if (srcY + 1 < srcTilesY) {
+                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                        }
+                    }
+                    if (srcY + 1 < srcTilesY) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                    }
+                    mMinZArrays[level][y * destTilesX + x] = minZ;
+                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
+                }
+            }
+        }
+    }
+
+    // Frees every per-level array and the two arrays of level pointers.
+    ~MinMaxZTreeCilk() {
+        for (int i = 0; i < mLevels; ++i) {
+            lAlignedFree(mMinZArrays[i]);
+            lAlignedFree(mMaxZArrays[i]);
+        }
+        lAlignedFree(mMinZArrays);
+        lAlignedFree(mMaxZArrays); 
+    }
+
+    // Number of levels in the tree; the coarsest one is Levels() - 1.
+    int Levels() const { return mLevels; }
+
+    // These round UP, so beware that the last tile for a given level may not be completely full
+    // TODO: Verify this...
+    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
+    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
+    // Tile dimensions in pixels; they double with every level.
+    int TileWidth(int level = 0) const { return (mTileWidth << level); }
+    int TileHeight(int level = 0) const { return (mTileHeight << level); }
+
+    // Stored bounds for tile (tileX, tileY) of `level`.  No range checking
+    // is done; the caller must pass coordinates inside the level.
+    float MinZ(int level, int tileX, int tileY) const {
+        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+    float MaxZ(int level, int tileX, int tileY) const {
+        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+
+private:
+    // Copying is disabled: the destructor frees the per-level arrays, so a
+    // memberwise copy would end up freeing them twice.  These are declared
+    // but intentionally never defined.
+    MinMaxZTreeCilk(const MinMaxZTreeCilk &);
+    MinMaxZTreeCilk &operator=(const MinMaxZTreeCilk &);
+
+    int mTileWidth;     // level-0 tile width in pixels
+    int mTileHeight;    // level-0 tile height in pixels
+    int mLevels;        // number of levels
+    int mNumTilesX;     // level-0 tile count in X
+    int mNumTilesY;     // level-0 tile count in Y
+
+    // One array for each "level" in the tree
+    float **mMinZArrays;
+    float **mMaxZArrays;
+};
+
+// The min/max Z tree shared by all of the Cilk dispatch code in this file;
+// null until InitDynamicCilk() has been called.
+static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
+
+// Creates the global min/max Z tree for the framebuffer size described by
+// `input`, using MIN_TILE_WIDTH x MIN_TILE_HEIGHT base tiles and
+// DYNAMIC_TREE_LEVELS levels.  Must be called before DispatchDynamicCilk().
+// Calling it again replaces the existing tree.
+void InitDynamicCilk(InputData *input) {
+    // Release the tree from any earlier call so that re-initialising (for
+    // example with a different framebuffer size) does not leak it.  Deleting
+    // a null pointer is a no-op, so the first call needs no special case.
+    delete gMinMaxZTreeCilk;
+    gMinMaxZTreeCilk = 0;
+
+    gMinMaxZTreeCilk = 
+        new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
+                            input->header.framebufferWidth, 
+                            input->header.framebufferHeight);
+}
+
+
+// Shades tile (tileX, tileY) of the given tree level with the lights listed
+// in lightIndices[0..numLights).  Either shades the tile directly with
+// ispc::ShadeTile, or splits the light list among the four child tiles one
+// level down with ispc::SplitTileMinMax and recurses into each of them,
+// spawning three of the four recursive calls with _Cilk_spawn.
+static void
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
+                        int *lightIndices, int numLights, 
+                        Framebuffer *framebuffer) {
+    const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
+    
+    // If we have few enough lights or this is the base case (last level),
+    // shade this full tile directly
+    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+        int startX = tileX * width;
+        int startY = tileY * height;
+        // Clamp the tile's far edges to the framebuffer
+        int endX = std::min(input->header.framebufferWidth, startX + width);
+        int endY = std::min(input->header.framebufferHeight, startY + height);
+        
+        // Skip entirely offscreen tiles
+        if (endX > startX && endY > startY) {
+            ispc::ShadeTile(
+                startX, endX, startY, endY,
+                input->header.framebufferWidth, input->header.framebufferHeight,
+                &input->arrays,
+                input->header.cameraProj[0][0], input->header.cameraProj[1][1], 
+                input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                lightIndices, numLights, VISUALIZE_LIGHT_COUNT, 
+                framebuffer->r, framebuffer->g, framebuffer->b);
+        }
+    } 
+    else {
+        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
+        // Move down a level in the tree
+        // (from here on level/tileX/tileY refer to the top-left child tile)
+        --level;
+        tileX <<= 1;
+        tileY <<= 1;
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+
+        // Work out splitting coords
+        // (pixel position of the boundary between the left/right and
+        // top/bottom children)
+        int midX = (tileX + 1) * width;
+        int midY = (tileY + 1) * height;
+
+        // Read subtile min/max data
+        // NOTE: We must be sure to handle out-of-bounds access here since
+        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
+        // framebuffer sizes.
+        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
+        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
+
+        // NOTE: Order is 00, 10, 01, 11
+        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
+        float minZ[4] = {input->header.cameraFar, input->header.cameraFar, 
+                         input->header.cameraFar, input->header.cameraFar};
+        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, 
+                         input->header.cameraNear, input->header.cameraNear};
+
+        // The top-left child is always read; the others only if they exist
+        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
+        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
+        if (rightTileExists) {
+            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
+            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
+            if (bottomTileExists) {
+                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
+                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
+            }
+        }
+        if (bottomTileExists) {
+            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
+            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
+        }
+
+        // Cull lights into subtile lists
+        // (one list of MAX_LIGHTS entries per child, kept on this frame's
+        // stack with ALIGNMENT_BYTES alignment)
+#ifdef ISPC_IS_WINDOWS
+        __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+            int subtileLightIndices[4][MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+            __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+        int subtileNumLights[4];
+        ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
+            input->header.framebufferWidth, input->header.framebufferHeight, 
+            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+            lightIndices, numLights, input->arrays.lightPositionView_x, 
+            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+            input->arrays.lightAttenuationEnd,
+            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
+        
+        // Recurse into subtiles
+        // The first three children are spawned and the fourth is run by the
+        // caller itself.  The children read subtileLightIndices, which lives
+        // in this stack frame.
+        // NOTE(review): there is no explicit _Cilk_sync; this presumably
+        // relies on Cilk's implicit sync when a spawning function returns to
+        // keep the arrays alive until all children finish -- confirm.
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY, 
+                                            subtileLightIndices[0], subtileNumLights[0],
+                                            framebuffer);
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
+                                            subtileLightIndices[1], subtileNumLights[1],
+                                            framebuffer);
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
+                                            subtileLightIndices[2], subtileNumLights[2],
+                                            framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
+                                subtileLightIndices[3], subtileNumLights[3],
+                                framebuffer);
+    }
+}
+
+
+// Shades one root tile of the min/max Z tree: culls MAX_LIGHTS lights
+// against the tile with ispc::IntersectLightsWithTileMinMax and passes the
+// resulting list to ShadeDynamicTileRecurse.
+static void
+ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
+                 Framebuffer *framebuffer) {
+    const MinMaxZTreeCilk *tree = gMinMaxZTreeCilk;
+
+    // Pixel rectangle covered by the tile, clamped to the framebuffer
+    int tileW = tree->TileWidth(level);
+    int tileH = tree->TileHeight(level);
+    int x0 = tileX * tileW;
+    int y0 = tileY * tileH;
+    int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
+    int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
+
+    // Depth range recorded for the tile
+    float tileMinZ = tree->MinZ(level, tileX, tileY);
+    float tileMaxZ = tree->MaxZ(level, tileX, tileY);
+
+    // Root tiles get the full 6-plane cull; the recursion below only has to
+    // split the surviving list.
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+        int lightIndices[MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+        __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+    int survivors = ispc::IntersectLightsWithTileMinMax(
+        x0, x1, y0, y1, tileMinZ, tileMaxZ,
+        input->header.framebufferWidth, input->header.framebufferHeight,
+        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+        MAX_LIGHTS,
+        input->arrays.lightPositionView_x,
+        input->arrays.lightPositionView_y,
+        input->arrays.lightPositionView_z,
+        input->arrays.lightAttenuationEnd,
+        lightIndices);
+
+    // Start the recursive subdivision from this tile
+    ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
+                            survivors, framebuffer);
+}
+
+
+// Shades a whole frame: refreshes the global min/max Z tree from the
+// current depth buffer, then shades every tile of the tree's coarsest
+// level, one _Cilk_for iteration per root tile.
+void
+DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
+{
+    MinMaxZTreeCilk *tree = gMinMaxZTreeCilk;
+
+    // Rebuild the tree for this frame's depth values
+    tree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
+                 input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                 input->header.cameraNear, input->header.cameraFar);
+
+    // The root tiles are what gets launched in parallel, so ideally there
+    // are at least enough of them to fill the machine.  The number of tree
+    // levels is fixed for now; deriving it from the width of the machine
+    // might make sense.
+    const int topLevel = tree->Levels() - 1;
+    const int tilesAcross = tree->NumTilesX(topLevel);
+    const int tilesDown = tree->NumTilesY(topLevel);
+    const int tileCount = tilesAcross * tilesDown;
+    _Cilk_for (int t = 0; t < tileCount; ++t) {
+        // Flat root-tile index -> (column, row), row-major
+        const int row = t / tilesAcross;
+        const int col = t % tilesAcross;
+        ShadeDynamicTile(input, topLevel, col, row, framebuffer);
+    }
+}
+
+#endif // __cilk
--- a/examples/deferred/kernels.ispc
+++ b/examples/deferred/kernels.ispc
@@ -0,0 +1,672 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "deferred.h"
+
+// Pointers to the G-buffer and light arrays consumed by the kernels in this
+// file.  The struct is received by reference from the caller of the exported
+// entry points (e.g. RenderStatic), so its field order and types are part of
+// the interface with the host code - do not reorder.
+//
+// The G-buffer arrays are indexed by (y * gBufferWidth + x); the light
+// arrays are indexed by light index.
+struct InputDataArrays
+{
+    float *zBuffer;                  // depth-buffer value per pixel (unprojected to view space by the kernels)
+    unsigned int16 *normalEncoded_x; // half float; first encoded normal component
+    unsigned int16 *normalEncoded_y; // half float; second encoded normal component
+    unsigned int16 *specularAmount; // half float
+    unsigned int16 *specularPower; // half float
+    unsigned int8 *albedo_x; // unorm8; multiplied with lightColor_x when shading
+    unsigned int8 *albedo_y; // unorm8; multiplied with lightColor_y when shading
+    unsigned int8 *albedo_z; // unorm8; multiplied with lightColor_z when shading
+    float *lightPositionView_x;      // light position (view space, per the name), x
+    float *lightPositionView_y;      // light position, y
+    float *lightPositionView_z;      // light position, z
+    float *lightAttenuationBegin;    // distance at which the linear falloff starts
+    float *lightColor_x;
+    float *lightColor_y;
+    float *lightColor_z;
+    float *lightAttenuationEnd;      // distance beyond which a light contributes nothing
+};
+
+// Per-scene header: camera parameters and framebuffer dimensions.  Received
+// by reference from the caller of RenderStatic, so the layout is part of the
+// interface with the host code - do not reorder.
+struct InputHeader
+{
+    // Projection matrix.  The kernels in this file read only elements
+    // [0][0], [1][1], [2][2] and [3][2].
+    float cameraProj[4][4];
+    float cameraNear;   // lower bound of valid view-space depth (inclusive in ComputeZBounds)
+    float cameraFar;    // upper bound of valid view-space depth (exclusive in ComputeZBounds)
+
+    int32 framebufferWidth;
+    int32 framebufferHeight;
+    // NOTE(review): the three fields below are not referenced by the kernels
+    // in this file; presumably they describe the input file layout and
+    // idaNum comes from deferred.h - confirm.
+    int32 numLights;
+    int32 inputDataChunkSize;
+    int32 inputDataArrayOffsets[idaNum];
+};
+
+
+///////////////////////////////////////////////////////////////////////////
+// Common utility routines
+
+// Returns the dot product of the 3-vectors (x, y, z) and (a, b, c).
+static inline float
+dot3(float x, float y, float z, float a, float b, float c) {
+    float xa = x*a;
+    float yb = y*b;
+    float zc = z*c;
+    return (xa + yb + zc);
+}
+
+
+// Scales the vector (x, y, z) by the reciprocal square root of its squared
+// length and writes the result to (ox, oy, oz).  The inputs are taken by
+// value, so the outputs may refer to the same variables as the inputs.
+// No special handling is done for a zero-length input.
+static inline void
+normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
+    float invLength = rsqrt(x*x + y*y + z*z);
+    ox = x * invLength;
+    oy = y * invLength;
+    oz = z * invLength;
+}
+
+
+// Converts an 8-bit unsigned normalized value to float by multiplying it
+// with 1/255, so 0 maps to 0.0 and 255 maps to (approximately) 1.0.
+static inline float
+Unorm8ToFloat32(unsigned int8 u) {
+    float asFloat = (float)u;
+    return asFloat * (1.0f / 255.0f);
+}
+
+
+// Converts a float to an 8-bit unsigned normalized value by scaling with 255
+// and casting.  No clamping or rounding is applied here; the caller in this
+// file clamps to [0, 1] before calling.
+static inline unsigned int8
+Float32ToUnorm8(float f) {
+    float scaled = f * 255.0f;
+    return (unsigned int8)scaled;
+}
+
+
+// Computes the range of view-space depths covered by the pixels of one tile.
+//
+//   tileStartX/tileEndX, tileStartY/tileEndY - half-open pixel ranges.
+//   zBuffer, gBufferWidth - depth buffer, indexed as y * gBufferWidth + x.
+//   cameraProj_33, cameraProj_43 - projection terms used to unproject depth.
+//   cameraNear, cameraFar - only depths in [cameraNear, cameraFar) count.
+//   minZ, maxZ - outputs.  If no pixel in the tile has a valid depth, the
+//                result is the inverted range minZ = cameraFar,
+//                maxZ = cameraNear.
+static void
+ComputeZBounds(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // G-buffer data
+    uniform float zBuffer[],
+    uniform int32 gBufferWidth,
+    // Camera data
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output
+    uniform float &minZ,
+    uniform float &maxZ
+    )
+{
+    // Per-lane running bounds, seeded with an inverted (empty) range.
+    float nearestZ = cameraFar;
+    float farthestZ = cameraNear;
+
+    for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
+        uniform int32 rowBase = row * gBufferWidth;
+        foreach (col = tileStartX ... tileEndX) {
+            // Turn the stored depth value back into a view-space Z.
+            float depth = zBuffer[rowBase + col];
+            float viewZ = cameraProj_43 / (depth - cameraProj_33);
+
+            // Skybox/background or otherwise invalid pixels fall outside
+            // [near, far) and must not widen the bounds.
+            bool usable = (viewZ >= cameraNear) && (viewZ < cameraFar);
+            if (usable) {
+                nearestZ = min(nearestZ, viewZ);
+                farthestZ = max(farthestZ, viewZ);
+            }
+        }
+    }
+
+    // Collapse the per-lane bounds into a single uniform range.
+    minZ = reduce_min(nearestZ);
+    maxZ = reduce_max(farthestZ);
+}
+
+
+// Culls lights against the frustum of one screen tile whose depth range is
+// already known, and writes the indices of the surviving lights, packed, to
+// tileLightIndices.
+//
+//   tileStartX/tileEndX, tileStartY/tileEndY - pixel bounds of the tile.
+//   minZ, maxZ          - view-space depth range of the tile.
+//   gBufferWidth/Height - G-buffer dimensions in pixels.
+//   cameraProj_11/22    - projection scale terms for x and y.
+//   numLights           - number of lights to test (indices 0..numLights-1).
+//   light_*_array       - per-light position components and attenuation end
+//                         distance, which acts as the light's radius.
+//   tileLightIndices    - output; at most one entry is written per tested
+//                         light.
+//
+// Returns the number of indices written.
+export uniform int32
+IntersectLightsWithTileMinMax(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // Tile data
+    uniform float minZ,
+    uniform float maxZ,
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    uniform int32 tileLightIndices[]
+    )
+{
+    uniform float halfWidth = 0.5f * (float)gBufferWidth;
+    uniform float halfHeight = 0.5f * (float)gBufferHeight;
+
+    // Four side planes of the tile frustum.  Each plane has one lateral
+    // coefficient (applied to x for planes 0/1, to y for planes 2/3) and
+    // one coefficient applied to z.
+    uniform float planeLateral[4] = {
+        -(cameraProj_11 * halfWidth),
+         (cameraProj_11 * halfWidth),
+         (cameraProj_22 * halfHeight),
+        -(cameraProj_22 * halfHeight) };
+    uniform float planeDepth[4] = {
+         tileEndX - halfWidth,
+        -tileStartX + halfWidth,
+         tileEndY - halfHeight,
+        -tileStartY + halfHeight };
+
+    // Normalize the planes so the dot products below are distances that can
+    // be compared against the light radius.
+    for (uniform int p = 0; p < 4; ++p) {
+        uniform float invLength = rsqrt(planeLateral[p] * planeLateral[p] +
+                                        planeDepth[p] * planeDepth[p]);
+        planeLateral[p] *= invLength;
+        planeDepth[p] *= invLength;
+    }
+
+    uniform int32 keptCount = 0;
+
+    foreach (lightIndex = 0 ... numLights) {
+        float lightZ = light_positionView_z_array[lightIndex];
+        float radius = light_attenuationEnd_array[lightIndex];
+        float negRadius = -radius;
+
+        // Depth test: the light's sphere must reach into [minZ, maxZ].
+        float dist = lightZ - minZ;
+        bool touches = (dist >= negRadius);
+
+        dist = maxZ - lightZ;
+        touches = touches && (dist >= negRadius);
+
+        // Greedy early-out: skip the side-plane tests only when no lane has
+        // a surviving light.  Nothing below needs masking beyond what
+        // 'touches' already tracks, so a plain if over any() is enough
+        // (and a bit easier to read than nested ifs).
+        if (any(touches)) {
+            float lightX = light_positionView_x_array[lightIndex];
+            float lightY = light_positionView_y_array[lightIndex];
+
+            // Planes 0 and 1 bound the tile in x.
+            dist = lightZ * planeDepth[0] + lightX * planeLateral[0];
+            touches = touches && (dist >= negRadius);
+
+            dist = lightZ * planeDepth[1] + lightX * planeLateral[1];
+            touches = touches && (dist >= negRadius);
+
+            // Planes 2 and 3 bound the tile in y.
+            dist = lightZ * planeDepth[2] + lightY * planeLateral[2];
+            touches = touches && (dist >= negRadius);
+
+            dist = lightZ * planeDepth[3] + lightY * planeLateral[3];
+            touches = touches && (dist >= negRadius);
+
+            // Append the indices of the surviving lanes contiguously.
+            cif (touches) {
+                keptCount += packed_store_active(&tileLightIndices[keptCount],
+                                                 lightIndex);
+            }
+        }
+    }
+
+    return keptCount;
+}
+
+
+// Determines which lights may affect one screen tile: first computes the
+// tile's view-space depth range from the depth buffer, then culls the lights
+// against the resulting tile frustum.
+//
+//   tileStartX/tileEndX, tileStartY/tileEndY - pixel bounds of the tile.
+//   gBufferWidth/Height - G-buffer dimensions in pixels.
+//   zBuffer             - depth buffer, indexed as y * gBufferWidth + x.
+//   cameraProj_*        - projection terms; 11/22 are used for the frustum
+//                         planes, 33/43 to unproject depth.
+//   cameraNear/Far      - valid view-space depth range.
+//   numLights           - number of lights to test.
+//   light_*_array       - per-light position and attenuation end distance.
+//   tileLightIndices    - output; at most numLights entries are written.
+//
+// Returns the number of light indices written to tileLightIndices.
+static uniform int32
+IntersectLightsWithTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // G-buffer data
+    uniform float zBuffer[],
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    uniform int32 tileLightIndices[]
+    )
+{
+    uniform float minZ, maxZ;
+    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
+        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+        minZ, maxZ);
+
+    // Forward the caller's light count.  This used to hard-code MAX_LIGHTS,
+    // silently ignoring the numLights parameter; callers that pass
+    // MAX_LIGHTS get exactly the same result as before.
+    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
+        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
+        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
+        numLights, light_positionView_x_array, light_positionView_y_array,
+        light_positionView_z_array, light_attenuationEnd_array,
+        tileLightIndices);
+
+    return tileNumLights;
+}
+
+
+// Shades every pixel of one tile using the lights listed in
+// tileLightIndices and writes 8-bit results to the three framebuffer planes.
+//
+//   tileStartX/tileEndX, tileStartY/tileEndY - half-open pixel ranges.
+//   gBufferWidth/Height - G-buffer dimensions; pixels are indexed as
+//                         y * gBufferWidth + x in both input and output.
+//   inputData           - G-buffer and light arrays.
+//   cameraProj_*        - projection terms used to rebuild view-space
+//                         positions from depth.
+//   tileLightIndices, tileNumLights - lights to evaluate for this tile.
+//   visualizeLightCount - when set, write a grey level derived from the
+//                         light count instead of shading.
+//   framebuffer_r/g/b   - outputs.
+export void
+ShadeTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    uniform InputDataArrays &inputData,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    // Light list
+    uniform int32 tileLightIndices[],
+    uniform int32 tileNumLights,
+    // UI
+    uniform bool visualizeLightCount,
+    // Output
+    uniform unsigned int8 framebuffer_r[],
+    uniform unsigned int8 framebuffer_g[],
+    uniform unsigned int8 framebuffer_b[]
+    )
+{
+    // Flat fill: either no light reaches the tile (result is black) or the
+    // caller asked for the light-count visualization (4 grey levels per
+    // light, capped at 255).
+    if (tileNumLights == 0 || visualizeLightCount) {
+        uniform unsigned int8 grey = (unsigned int8)(min(tileNumLights << 2, 255));
+        for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
+            foreach (col = tileStartX ... tileEndX) {
+                int32 pixel = (row * gBufferWidth + col);
+                framebuffer_r[pixel] = grey;
+                framebuffer_g[pixel] = grey;
+                framebuffer_b[pixel] = grey;
+            }
+        }
+        return;
+    }
+
+    uniform float pixelToClipX = 2.0f / gBufferWidth;
+    uniform float pixelToClipY = 2.0f / gBufferHeight;
+
+    for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
+        // Screen/clip-space y of the pixel centre (y axis flipped).
+        uniform float clipY = -(((0.5f + row) * pixelToClipY) - 1.f);
+
+        foreach (col = tileStartX ... tileEndX) {
+            int32 pixel = row * gBufferWidth + col;
+
+            float depth = inputData.zBuffer[pixel];
+
+            // Screen/clip-space x of the pixel centre.
+            // NOTE: Mind DX11 viewport transform and pixel center!
+            float clipX = (0.5f + (float)(col)) *
+                pixelToClipX - 1.0f;
+
+            // Unproject the depth value into a view-space position.
+            float posZ = cameraProj_43 / (depth - cameraProj_33);
+            float posX = clipX * posZ /
+                cameraProj_11;
+            float posY = clipY * posZ /
+                cameraProj_22;
+
+            // Normalizing the position yields a vector pointing *at* the
+            // surface, i.e. the negative view vector.
+            float towardSurfX, towardSurfY, towardSurfZ;
+            normalize3(posX, posY, posZ, towardSurfX, towardSurfY, towardSurfZ);
+
+            // Rebuild the surface normal from its two stored components.
+            float encX = half_to_float(inputData.normalEncoded_x[pixel]);
+            float encY = half_to_float(inputData.normalEncoded_y[pixel]);
+
+            float f = (encX - encX * encX) + (encY - encY * encY);
+            float m = sqrt(4.0f * f - 1.0f);
+
+            float nrmX = m * (4.0f * encX - 2.0f);
+            float nrmY = m * (4.0f * encY - 2.0f);
+            float nrmZ = 3.0f - 8.0f * f;
+
+            // Remaining material parameters.
+            float specAmount =
+                half_to_float(inputData.specularAmount[pixel]);
+            float specPower =
+                half_to_float(inputData.specularPower[pixel]);
+            float albedoX = Unorm8ToFloat32(inputData.albedo_x[pixel]);
+            float albedoY = Unorm8ToFloat32(inputData.albedo_y[pixel]);
+            float albedoZ = Unorm8ToFloat32(inputData.albedo_z[pixel]);
+
+            // Accumulated lighting for this pixel.
+            float sumX = 0.0f;
+            float sumY = 0.0f;
+            float sumZ = 0.0f;
+
+            for (uniform int32 slot = 0; slot < tileNumLights; ++slot) {
+                uniform int32 lightIndex = tileLightIndices[slot];
+
+                // Light data needed for the distance cull.
+                uniform float lightPosX =
+                    inputData.lightPositionView_x[lightIndex];
+                uniform float lightPosY =
+                    inputData.lightPositionView_y[lightIndex];
+                uniform float lightPosZ =
+                    inputData.lightPositionView_z[lightIndex];
+                uniform float lightEnd =
+                    inputData.lightAttenuationEnd[lightIndex];
+
+                // Vector from the surface to the light.
+                float Lx = lightPosX - posX;
+                float Ly = lightPosY - posY;
+                float Lz = lightPosZ - posZ;
+
+                float distSq = dot3(Lx, Ly, Lz, Lx, Ly, Lz);
+
+                // Pixels at or beyond the attenuation end receive nothing.
+                float lightEndSq = lightEnd * lightEnd;
+
+                cif (distSq < lightEndSq) {
+                    float dist = sqrt(distSq);
+
+                    // HLSL "rcp" is allowed to be fairly inaccurate
+                    float invDist = rcp(dist);
+                    Lx *= invDist;
+                    Ly *= invDist;
+                    Lz *= invDist;
+
+                    float NdotL = dot3(nrmX, nrmY,
+                                       nrmZ, Lx, Ly, Lz);
+
+                    // Surfaces facing away from the light receive nothing.
+                    cif (NdotL > 0.0f) {
+                        uniform float lightBegin =
+                            inputData.lightAttenuationBegin[lightIndex];
+
+                        // Linear distance falloff between attenuation begin
+                        // and end, capped at 1 (linstep).
+                        float falloffSpan = (lightEnd - lightBegin);
+                        float falloffLeft = (lightEnd - dist);
+                        float attenuation = min(falloffLeft / falloffSpan, 1.0f);
+
+                        // Half vector between the light and view directions.
+                        float Hx = (Lx - towardSurfX);
+                        float Hy = (Ly - towardSurfY);
+                        float Hz = (Lz - towardSurfZ);
+                        normalize3(Hx, Hy, Hz, Hx, Hy, Hz);
+
+                        float NdotH = dot3(nrmX, nrmY,
+                                           nrmZ, Hx, Hy, Hz);
+                        NdotH = max(NdotH, 0.0f);
+
+                        float specular = pow(NdotH, specPower);
+                        float specularNorm = (specPower + 2.0f) *
+                            (1.0f / 8.0f);
+                        float specularContrib = specAmount *
+                            specularNorm * specular;
+
+                        float k = attenuation * NdotL * (1.0f + specularContrib);
+
+                        uniform float lightColX = inputData.lightColor_x[lightIndex];
+                        uniform float lightColY = inputData.lightColor_y[lightIndex];
+                        uniform float lightColZ = inputData.lightColor_z[lightIndex];
+
+                        sumX += albedoX * lightColX * k;
+                        sumY += albedoY * lightColY * k;
+                        sumZ += albedoZ * lightColZ * k;
+                    }
+                }
+            }
+
+            // Gamma correct and store.  These pows are pretty slow right
+            // now, but something faster could be substituted if it is
+            // really necessary to squeeze out every last bit of
+            // performance.
+            float gamma = 1.0 / 2.2f;
+            sumX = pow(clamp(sumX, 0.0f, 1.0f), gamma);
+            sumY = pow(clamp(sumY, 0.0f, 1.0f), gamma);
+            sumZ = pow(clamp(sumZ, 0.0f, 1.0f), gamma);
+
+            framebuffer_r[pixel] = Float32ToUnorm8(sumX);
+            framebuffer_g[pixel] = Float32ToUnorm8(sumY);
+            framebuffer_b[pixel] = Float32ToUnorm8(sumZ);
+        }
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Static decomposition
+
+// Task body for the static decomposition: each task instance renders one
+// tile of at most MIN_TILE_WIDTH x MIN_TILE_HEIGHT pixels, selected by
+// taskIndex in row-major order.
+//
+//   num_groups_x, num_groups_y - tile grid dimensions (only num_groups_x is
+//                                needed to unpack taskIndex).
+//   inputHeader, inputData     - camera/framebuffer info and scene arrays.
+//   visualizeLightCount        - forwarded to ShadeTile.
+//   framebuffer_r/g/b          - output planes, forwarded to ShadeTile.
+task void
+RenderTile(uniform int num_groups_x, uniform int num_groups_y,
+           uniform InputHeader &inputHeader,
+           uniform InputDataArrays &inputData,
+           uniform int visualizeLightCount,
+           // Output
+           uniform unsigned int8 framebuffer_r[],
+           uniform unsigned int8 framebuffer_g[],
+           uniform unsigned int8 framebuffer_b[]) {
+    uniform int framebufferWidth = inputHeader.framebufferWidth;
+    uniform int framebufferHeight = inputHeader.framebufferHeight;
+
+    uniform int32 group_y = taskIndex / num_groups_x;
+    uniform int32 group_x = taskIndex % num_groups_x;
+    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
+    uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
+    // RenderStatic rounds the tile count up, so the last tile in each
+    // direction may be partial.  Clamp its end to the framebuffer so that we
+    // never read or write pixels outside the image.
+    uniform int32 tile_end_x = min(tile_start_x + MIN_TILE_WIDTH,
+                                   framebufferWidth);
+    uniform int32 tile_end_y = min(tile_start_y + MIN_TILE_HEIGHT,
+                                   framebufferHeight);
+
+    uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
+    uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
+    uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
+    uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
+
+    // Light intersection: figure out which lights illuminate this tile.
+    uniform int tileLightIndices[MAX_LIGHTS];  // Light list for the tile
+    uniform int numTileLights = 
+        IntersectLightsWithTile(tile_start_x, tile_end_x, 
+                                tile_start_y, tile_end_y,
+                                framebufferWidth, framebufferHeight,
+                                inputData.zBuffer,
+                                cameraProj_00, cameraProj_11,
+                                cameraProj_22, cameraProj_32,
+                                inputHeader.cameraNear, inputHeader.cameraFar,
+                                MAX_LIGHTS,
+                                inputData.lightPositionView_x, 
+                                inputData.lightPositionView_y, 
+                                inputData.lightPositionView_z, 
+                                inputData.lightAttenuationEnd,
+                                tileLightIndices);
+
+    // And now shade the tile, using the lights in tileLightIndices
+    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
+              framebufferWidth, framebufferHeight, inputData,
+              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
+              tileLightIndices, numTileLights, visualizeLightCount, 
+              framebuffer_r, framebuffer_g, framebuffer_b);
+}
+
+
+// Entry point for the static decomposition: splits the framebuffer into a
+// grid of MIN_TILE_WIDTH x MIN_TILE_HEIGHT tiles (rounding the grid size up)
+// and launches one RenderTile task per tile.
+//
+// NOTE(review): there is no explicit sync; this relies on ISPC waiting for
+// launched tasks before the launching function returns - confirm against
+// the ISPC User's Guide.
+export void
+RenderStatic(uniform InputHeader &inputHeader,
+             uniform InputDataArrays &inputData,
+             uniform int visualizeLightCount,
+             // Output
+             uniform unsigned int8 framebuffer_r[],
+             uniform unsigned int8 framebuffer_g[],
+             uniform unsigned int8 framebuffer_b[]) {
+    // Ceiling division: a partially covered tile still needs a task.
+    uniform int tilesAcross =
+        (inputHeader.framebufferWidth + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
+    uniform int tilesDown =
+        (inputHeader.framebufferHeight + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
+    uniform int tileCount = tilesAcross * tilesDown;
+
+    launch[tileCount] RenderTile(tilesAcross, tilesDown,
+                                 inputHeader, inputData, visualizeLightCount,
+                                 framebuffer_r, framebuffer_g, framebuffer_b);
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Routines for dynamic decomposition path
+
+// Computes the view-space depth range of every tile in one row of tiles.
+//
+//   tileY                 - index of the tile row to process.
+//   tileWidth, tileHeight - tile size in pixels.
+//   numTilesX             - number of tiles in the row; numTilesY is not
+//                           used by this routine.
+//   zBuffer, gBufferWidth - depth buffer and its row pitch in pixels.
+//   cameraProj_33/43, cameraNear/Far - forwarded to ComputeZBounds.
+//   minZArray, maxZArray  - outputs; entry tileX receives the bounds of
+//                           tile (tileX, tileY).
+export void
+ComputeZBoundsRow(
+    uniform int32 tileY,
+    uniform int32 tileWidth, uniform int32 tileHeight,
+    uniform int32 numTilesX, uniform int32 numTilesY,
+    // G-buffer data
+    uniform float zBuffer[],
+    uniform int32 gBufferWidth,
+    // Camera data
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output
+    uniform float minZArray[],
+    uniform float maxZArray[]
+    )
+{
+    // The vertical extent is the same for every tile in the row.
+    uniform int32 rowStartY = tileY * tileHeight;
+    uniform int32 rowEndY = tileY * tileHeight + tileHeight;
+
+    for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
+        uniform int32 tileStartX = tileX * tileWidth;
+        uniform int32 tileEndX = tileX * tileWidth + tileWidth;
+
+        uniform float tileMinZ, tileMaxZ;
+        ComputeZBounds(tileStartX, tileEndX, rowStartY, rowEndY,
+                       zBuffer, gBufferWidth,
+                       cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                       tileMinZ, tileMaxZ);
+
+        minZArray[tileX] = tileMinZ;
+        maxZArray[tileX] = tileMaxZ;
+    }
+}
+
+
+// Reclassifies a tile's lights with respect to its four sub-tiles when the
+// tile is refined.  numLights need not be a multiple of programCount, but
+// the input and output arrays should be able to handle programCount-sized
+// loads/stores.
+//
+//   tileMidX, tileMidY   - pixel coordinates where the tile is split.
+//   subtileMinZ/MaxZ     - depth range of each sub-tile, in the order
+//                          (00, 10, 01, 11).
+//   gBufferWidth/Height  - G-buffer dimensions in pixels.
+//   cameraProj_11/22     - projection scale terms for x and y.
+//   lightIndices, numLights - the parent tile's light list.
+//   light_*_array        - per-light data, indexed by light index.
+//   subtileIndices       - output; sub-tile s gets its packed light list
+//                          starting at s * subtileIndicesPitch.
+//   subtileIndicesPitch  - distance between the four output lists.
+//   subtileNumLights     - output; number of lights stored per sub-tile.
+export void
+SplitTileMinMax(
+    uniform int32 tileMidX, uniform int32 tileMidY,
+    // Subtile data (00, 10, 01, 11)
+    uniform float subtileMinZ[],
+    uniform float subtileMaxZ[],
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light Data
+    uniform int32 lightIndices[],
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Outputs
+    uniform int32 subtileIndices[],
+    uniform int32 subtileIndicesPitch,
+    uniform int32 subtileNumLights[]
+    )
+{
+    uniform float halfWidth = 0.5f * (float)gBufferWidth;
+    uniform float halfHeight = 0.5f * (float)gBufferHeight;
+
+    // The two splitting planes: index 0 splits in x, index 1 splits in y.
+    uniform float planeLateral[2] = { -(cameraProj_11 * halfWidth),
+                                       (cameraProj_22 * halfHeight) };
+    uniform float planeDepth[2] = { tileMidX - halfWidth,
+                                    tileMidY - halfHeight };
+
+    // Normalize both planes so that the dot products below are distances.
+    uniform float invLength[2] = { rsqrt(planeLateral[0] * planeLateral[0] +
+                                         planeDepth[0] * planeDepth[0]),
+                                   rsqrt(planeLateral[1] * planeLateral[1] +
+                                         planeDepth[1] * planeDepth[1]) };
+    planeLateral[0] *= invLength[0];
+    planeLateral[1] *= invLength[1];
+    planeDepth[0] *= invLength[0];
+    planeDepth[1] *= invLength[1];
+
+    // Write cursor for each sub-tile's output list.
+    uniform int32 cursor[4];
+    for (uniform int s = 0; s < 4; ++s)
+        cursor[s] = s * subtileIndicesPitch;
+
+    foreach (i = 0 ... numLights) {
+        int32 lightIndex = lightIndices[i];
+
+        float lightX = light_positionView_x_array[lightIndex];
+        float lightY = light_positionView_y_array[lightIndex];
+        float lightZ = light_positionView_z_array[lightIndex];
+        float radius = light_attenuationEnd_array[lightIndex];
+        float negRadius = -radius;
+
+        // Test the light against each sub-tile's depth bounds.
+        bool keep[4];
+        for (uniform int s = 0; s < 4; ++s) {
+            keep[s] = (lightZ - subtileMinZ[s] >= negRadius) &&
+                (subtileMaxZ[s] - lightZ >= negRadius);
+        }
+
+        // Distances of the light from the two splitting planes.
+        float distX = lightZ * planeDepth[0] +
+            lightX * planeLateral[0];
+        float distY = lightZ * planeDepth[1] +
+            lightY * planeLateral[1];
+
+        // A light farther than its radius from a splitting plane lies
+        // entirely on one side of it and is dropped from the two sub-tiles
+        // on the other side.
+        cif (abs(distX) > radius) {
+            bool onPositiveX = distX > 0.0f;
+            keep[0] = keep[0] &&  onPositiveX;    // 00 subtile
+            keep[1] = keep[1] && !onPositiveX;    // 10 subtile
+            keep[2] = keep[2] &&  onPositiveX;    // 01 subtile
+            keep[3] = keep[3] && !onPositiveX;    // 11 subtile
+        }
+        cif (abs(distY) > radius) {
+            bool onPositiveY = distY > 0.0f;
+            keep[0] = keep[0] &&  onPositiveY;    // 00 subtile
+            keep[1] = keep[1] &&  onPositiveY;    // 10 subtile
+            keep[2] = keep[2] && !onPositiveY;    // 01 subtile
+            keep[3] = keep[3] && !onPositiveY;    // 11 subtile
+        }
+
+        // Append the surviving lanes to each sub-tile's packed list.
+        for (uniform int s = 0; s < 4; ++s) {
+            cif (keep[s])
+                cursor[s] +=
+                    packed_store_active(&subtileIndices[cursor[s]],
+                                        lightIndex);
+        }
+    }
+
+    // Turn the final cursors back into per-sub-tile counts.
+    for (uniform int s = 0; s < 4; ++s)
+        subtileNumLights[s] = cursor[s] - s * subtileIndicesPitch;
+}
--- a/examples/deferred/main.cpp
+++ b/examples/deferred/main.cpp
@@ -0,0 +1,139 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#define NOMINMAX
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <fcntl.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#ifdef ISPC_IS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+#endif
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include "../timing.h"
+
+///////////////////////////////////////////////////////////////////////////
+
+// Benchmark driver for the deferred shading example.
+//
+// Loads the scene named on the command line, then times each available
+// renderer (ISPC static tasks, optionally ISPC + Cilk dynamic, and the
+// serial C++ dynamic version).  Each renderer is run for several trials of
+// nframes frames; the lowest per-frame cycle count is reported and the last
+// rendered frame is written out as a .ppm file.
+//
+// Returns 0 on success, 1 on a usage error or if the input cannot be loaded.
+int main(int argc, char** argv) {
+    if (argc != 2) {
+        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
+        return 1;
+    }
+
+    InputData *input = CreateInputDataFromFile(argv[1]);
+    if (!input) {
+        printf("Failed to load input file \"%s\"!\n", argv[1]);
+        return 1;
+    }
+
+    Framebuffer framebuffer(input->header.framebufferWidth,
+                            input->header.framebufferHeight);
+
+    InitDynamicC(input);
+#ifdef __cilk
+    InitDynamicCilk(input);
+#endif // __cilk
+
+    const int numTrials = 5;   // timing runs per renderer; the best one is kept
+    int nframes = 5;           // frames rendered per timing run
+
+    // --- ISPC, static tile decomposition with tasks ---
+    double ispcCycles = 1e30;
+    for (int trial = 0; trial < numTrials; ++trial) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int frame = 0; frame < nframes; ++frame) {
+            ispc::RenderStatic(input->header, input->arrays,
+                               VISUALIZE_LIGHT_COUNT,
+                               framebuffer.r, framebuffer.g, framebuffer.b);
+        }
+        double perFrame = get_elapsed_mcycles() / nframes;
+        ispcCycles = std::min(ispcCycles, perFrame);
+    }
+    printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
+           "%d x %d image\n", ispcCycles,
+           input->header.framebufferWidth, input->header.framebufferHeight);
+    WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
+
+#ifdef __cilk
+    // --- ISPC kernels driven by the Cilk dynamic decomposition ---
+    double dynamicCilkCycles = 1e30;
+    for (int trial = 0; trial < numTrials; ++trial) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int frame = 0; frame < nframes; ++frame) {
+            DispatchDynamicCilk(input, &framebuffer);
+        }
+        double perFrame = get_elapsed_mcycles() / nframes;
+        dynamicCilkCycles = std::min(dynamicCilkCycles, perFrame);
+    }
+    printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", 
+           dynamicCilkCycles);
+    WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
+#endif // __cilk
+
+    // --- Serial C++ dynamic decomposition (baseline) ---
+    double serialCycles = 1e30;
+    for (int trial = 0; trial < numTrials; ++trial) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int frame = 0; frame < nframes; ++frame) {
+            DispatchDynamicC(input, &framebuffer);
+        }
+        double perFrame = get_elapsed_mcycles() / nframes;
+        serialCycles = std::min(serialCycles, perFrame);
+    }
+    printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", 
+           serialCycles);
+    WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
+
+    // Speedups are reported relative to the serial baseline.
+#ifdef __cilk
+    printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", 
+           serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
+#else
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
+#endif // __cilk
+
+    DeleteInputData(input);
+
+    return 0;
+}
--- a/examples/examples.sln
+++ b/examples/examples.sln
@@ -0,0 +1,136 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simple", "simple\simple.vcxproj", "{947C5311-8B78-4D05-BEE4-BCF342D4B367}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rt", "rt\rt.vcxproj", "{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench", "aobench\aobench.vcxproj", "{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot", "mandelbrot\mandelbrot.vcxproj", "{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "options", "options\options.vcxproj", "{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot_tasks", "mandelbrot_tasks\mandelbrot_tasks.vcxproj", "{E80DA7D4-AB22-4648-A068-327307156BE6}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench_instrumented", "aobench_instrumented\aobench_instrumented.vcxproj", "{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "perfbench", "perfbench\perfbench.vcxproj", "{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Debug|x64 = Debug|x64
+		Release|Win32 = Release|Win32
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|Win32.ActiveCfg = Debug|Win32
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|Win32.Build.0 = Debug|Win32
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|x64.ActiveCfg = Debug|x64
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|x64.Build.0 = Debug|x64
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|Win32.ActiveCfg = Release|Win32
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|Win32.Build.0 = Release|Win32
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|x64.ActiveCfg = Release|x64
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|x64.Build.0 = Release|x64
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|Win32.Build.0 = Debug|Win32
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|x64.ActiveCfg = Debug|x64
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|x64.Build.0 = Debug|x64
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|Win32.ActiveCfg = Release|Win32
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|Win32.Build.0 = Release|Win32
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|x64.ActiveCfg = Release|x64
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|x64.Build.0 = Release|x64
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|Win32.ActiveCfg = Debug|Win32
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|Win32.Build.0 = Debug|Win32
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|x64.ActiveCfg = Debug|x64
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|x64.Build.0 = Debug|x64
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|Win32.ActiveCfg = Release|Win32
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|Win32.Build.0 = Release|Win32
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|x64.ActiveCfg = Release|x64
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|x64.Build.0 = Release|x64
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|Win32.ActiveCfg = Debug|Win32
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|Win32.Build.0 = Debug|Win32
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|x64.ActiveCfg = Debug|x64
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|x64.Build.0 = Debug|x64
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|Win32.ActiveCfg = Release|Win32
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|Win32.Build.0 = Release|Win32
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|x64.ActiveCfg = Release|x64
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|x64.Build.0 = Release|x64
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|Win32.ActiveCfg = Debug|Win32
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|Win32.Build.0 = Debug|Win32
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|x64.ActiveCfg = Debug|x64
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|x64.Build.0 = Debug|x64
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|Win32.ActiveCfg = Release|Win32
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|Win32.Build.0 = Release|Win32
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|x64.ActiveCfg = Release|x64
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|x64.Build.0 = Release|x64
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|Win32.Build.0 = Debug|Win32
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|x64.ActiveCfg = Debug|x64
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|x64.Build.0 = Debug|x64
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Release|Win32.ActiveCfg = Release|Win32
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Release|Win32.Build.0 = Release|Win32
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Release|x64.ActiveCfg = Release|x64
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Release|x64.Build.0 = Release|x64
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|Win32.ActiveCfg = Debug|Win32
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|Win32.Build.0 = Debug|Win32
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|x64.ActiveCfg = Debug|x64
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|x64.Build.0 = Debug|x64
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.ActiveCfg = Release|Win32
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.Build.0 = Release|Win32
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.ActiveCfg = Release|x64
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.Build.0 = Release|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.ActiveCfg = Debug|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.Build.0 = Debug|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.ActiveCfg = Debug|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.Build.0 = Debug|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.ActiveCfg = Release|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.Build.0 = Release|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.ActiveCfg = Release|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.Build.0 = Release|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.ActiveCfg = Debug|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.Build.0 = Debug|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.ActiveCfg = Debug|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.Build.0 = Debug|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.ActiveCfg = Release|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.Build.0 = Release|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.ActiveCfg = Release|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.Build.0 = Release|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.ActiveCfg = Debug|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.Build.0 = Debug|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.ActiveCfg = Debug|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.Build.0 = Debug|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.ActiveCfg = Release|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.ActiveCfg = Debug|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.Build.0 = Debug|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.ActiveCfg = Debug|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.Build.0 = Debug|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.ActiveCfg = Release|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.ActiveCfg = Debug|Win32
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.Build.0 = Debug|Win32
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.ActiveCfg = Debug|x64
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.Build.0 = Debug|x64
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.ActiveCfg = Release|Win32
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.Build.0 = Release|Win32
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.ActiveCfg = Release|x64
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/examples/gmres/Makefile
+++ b/examples/gmres/Makefile
@@ -0,0 +1,8 @@
+
+# Build configuration for the gmres example.  Only variables are set here;
+# the build rules are expected to come from ../common.mk (included below).
+EXAMPLE=gmres
+CPP_SRC=algorithm.cpp main.cpp matrix.cpp
+CC_SRC=mmio.c
+ISPC_SRC=matrix.ispc
+ISPC_TARGETS=sse2,sse4-x2,avx-x2
+
+include ../common.mk
--- a/examples/gmres/algorithm.cpp
+++ b/examples/gmres/algorithm.cpp
@@ -0,0 +1,231 @@
+/*
+  Copyright (c) 2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+/*===========================================================================*\
+|* Includes
+\*===========================================================================*/
+#include "algorithm.h"
+#include "stdio.h"
+#include "debug.h"
+
+
+/*===========================================================================*\
+|* GMRES
+\*===========================================================================*/
+/* upper_triangular_right_solve:
+ * ----------------------------
+ * Back-substitution: given upper triangular matrix R and rhs vector b, solve
+ * R x = b for x.  Only the leading x.size() rows and columns of R (and the
+ * first x.size() entries of b) take part; anything beyond the dimensions of
+ * x is ignored.  An empty x is a no-op.
+ * Every diagonal entry R(i, i) with i < x.size() is used as a divisor, so it
+ * has to be non-zero.
+ */
+void upper_triangular_right_solve (const DenseMatrix &R, const Vector &b, Vector &x)
+{
+    // Dimensionality check
+    ASSERT(R.rows() >= b.size());
+    ASSERT(R.cols() >= x.size());
+    ASSERT(b.size() >= x.size());
+
+    // Nothing to solve for; this also keeps max_row below from becoming -1.
+    if (x.size() == 0)
+        return;
+
+    int max_row = x.size() - 1;
+
+    // Walk from the last row upwards.  For the last row the inner loop is
+    // empty, which gives x[max_row] = b[max_row] / R(max_row, max_row).
+    for (int row = max_row; row >= 0; row--) {
+        double xi = b[row];
+        for (int col = max_row; col > row; col--)
+            xi -= x[col] * R(row, col);
+        x[row] = xi / R(row, row);
+    }
+}
+
+/* create_rotation (used in gmres):
+ * -------------------------------
+ * Computes the Givens rotation (cosine stored in Cn[col], sine in Sn[col])
+ * that zeroes the subdiagonal entry H(col+1, col) of a partially factored
+ * Hessenberg matrix.  The previous Givens rotations should already have been
+ * applied to this column before a new rotation is created.
+ */
+void create_rotation (const DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
+{
+    const double diag    = H(col,     col);
+    const double subdiag = H(col + 1, col);
+
+    // Subdiagonal is already zero: only the sign of the diagonal matters.
+    if (subdiag == 0) {
+        Cn[col] = copysign(1, diag);
+        Sn[col] = 0;
+        return;
+    }
+
+    // Diagonal is zero: the rotation just swaps the two entries.
+    if (diag == 0) {
+        Cn[col] = 0;
+        Sn[col] = copysign(1, subdiag);
+        return;
+    }
+
+    // General case: normalise by the length of the (diag, subdiag) pair.
+    const double length = sqrt(diag*diag + subdiag*subdiag);
+    Sn[col] = -subdiag / length;
+    Cn[col] =  diag / length;
+}
+
+/* Applies the 'col'th Givens rotation stored in vectors Sn and Cn to the
+ * 'col'th column of the DenseMatrix H, i.e. to the pair of entries
+ * H(col, col) and H(col+1, col).  (Previous columns don't need the rotation
+ * applied because, presumably, the first col-1 columns are already upper
+ * triangular, and so their entries in the col and col+1 rows are 0.)
+ */
+void apply_rotation (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
+{
+    const double cosine = Cn[col];
+    const double sine   = Sn[col];
+
+    // Read both entries up front so neither write sees a half-updated pair.
+    const double upper = H(col,     col);
+    const double lower = H(col + 1, col);
+
+    H(col,     col) = cosine * upper - sine * lower;
+    H(col + 1, col) = sine * upper + cosine * lower;
+}
+
+/* Applies the 'col'th Givens rotation (cosine Cn[col], sine Sn[col]) to the
+ * entries v[col] and v[col + 1] of the vector.
+ */
+void apply_rotation (Vector &v, size_t col, Vector &Cn, Vector &Sn)
+{
+    const double cosine = Cn[col];
+    const double sine   = Sn[col];
+
+    // Snapshot the pair before either element is overwritten.
+    const double upper = v[col];
+    const double lower = v[col + 1];
+
+    v[col + 1] = sine * upper + cosine * lower;
+    v[col]     = cosine * upper - sine * lower;
+}
+
+/* Applies the first 'col' Givens rotations (rotation i acts on rows i and
+ * i+1) to the newly-created column 'col' of H, in order.  (Leaves other
+ * columns alone.)
+ */
+void update_column (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
+{
+    // size_t index: 'col' is unsigned, so a signed counter would be compared
+    // against it with an implicit signed/unsigned conversion.
+    for (size_t i = 0; i < col; i++) {
+        double c    = Cn[i];
+        double s    = Sn[i];
+        // Compute the new upper entry first; H(i, col) is still needed for
+        // the lower entry, so it is only stored afterwards.
+        double t    = c * H(i,col) - s * H(i+1,col);
+        H(i+1, col) = s * H(i,col) + c * H(i+1,col);
+        H(i,   col) = t;
+    }
+}
+
+/* After a new column has been added to the Hessenberg matrix, factor it back into
+ * an upper-triangular matrix by:
+ * - applying the previous Givens rotations to the new column
+ * - computing the new Givens rotation to make the column upper triangular
+ * - applying the new Givens rotation to the column, and
+ * - applying the new Givens rotation to the solution vector
+ *
+ * The four steps depend on each other and must run in exactly this order.
+ */
+void update_qr_decomp (DenseMatrix &H, Vector &s, size_t col, Vector &Cn, Vector &Sn)
+{
+    update_column(  H, col, Cn, Sn);
+    create_rotation(H, col, Cn, Sn);
+    apply_rotation( H, col, Cn, Sn);
+    apply_rotation( s, col, Cn, Sn);
+}
+
+/* gmres:
+ * -----
+ * Estimates the solution of A x = b for a square matrix A.
+ *
+ *   A          square system matrix
+ *   b          right-hand side
+ *   x          output; zeroed first, then the estimate is accumulated into it
+ *   num_iters  maximum number of iterations to run
+ *   max_err    the iteration stops once |G[iter+1] / norm(b)| drops below
+ *              this value
+ *
+ * If the error bound is not reached within num_iters iterations an error is
+ * printed to stderr and the process exits with status -1.
+ */
+void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double max_err)
+{
+    DEBUG_PRINT("gmres starting!\n");
+    x.zero();
+
+    ASSERT(A.rows() == A.cols());
+
+    double beta = b.norm();
+
+    // A zero right-hand side is solved exactly by x = 0.  Returning here also
+    // avoids normalising b and dividing by beta == 0 further down.
+    if (beta == 0)
+        return;
+
+    DenseMatrix Qstar(num_iters + 1, A.rows());
+    DenseMatrix H(num_iters + 1, num_iters);
+
+    // arrays for storing parameters of givens rotations
+    Vector Sn(num_iters);
+    Vector Cn(num_iters);
+
+    // array for storing the rhs projected onto the Hessenberg's column space
+    Vector G(num_iters+1);
+    G.zero();
+    G[0] = beta;
+
+    // temp vector, stores Aqi
+    Vector w(A.rows());
+
+    w.copy(b);
+    w.normalize();
+    Qstar.set_row(0, w);
+
+    int iter = 0;
+    Vector temp(A.rows(), false);
+    // Relative error of the initial guess x = 0 is |G[0] / beta| = 1.  Having
+    // a defined value matters when the loop body never runs (num_iters <= 0).
+    double rel_err = 1.0;
+
+    while (iter < num_iters)
+    {
+        // w = Aqi
+        Qstar.row(iter, temp);
+        A.multiply(temp, w);
+
+        // construct ith column of H, i+1th row of Qstar:
+        for (int row = 0; row <= iter; row++) {
+            Qstar.row(row, temp);
+            H(row, iter) = temp.dot(w);
+            w.add_ax(-H(row, iter), temp);
+        }
+
+        H(iter+1, iter) = w.norm();
+        w.divide(H(iter+1, iter));
+        Qstar.set_row(iter+1, w);
+
+        update_qr_decomp (H, G, iter, Cn, Sn);
+
+        rel_err = fabs(G[iter+1] / beta);
+
+        if (rel_err < max_err)
+            break;
+
+        if (iter % 100 == 0)
+            DEBUG_PRINT("Iter %d: %f err\n", iter, rel_err);
+
+        iter++;
+    }
+
+    if (iter == num_iters) {
+        fprintf(stderr, "Error: gmres failed to converge in %d iterations (relative err: %f)\n", num_iters, rel_err);
+        exit(-1);
+    }
+
+    // We've reached an acceptable solution (?):
+
+    // 'iter' is the zero-based index of the iteration that met the bound, so
+    // iter + 1 iterations were actually performed.
+    DEBUG_PRINT("gmres completed in %d iterations (rel. resid. %f, max %f)\n", iter + 1, rel_err, max_err);
+    Vector y(iter+1);
+    upper_triangular_right_solve(H, G, y);
+    for (int i = 0; i < iter + 1; i++) {
+        Qstar.row(i, temp);
+        x.add_ax(y[i], temp);
+    }
+}
--- a/examples/gmres/algorithm.h
+++ b/examples/gmres/algorithm.h
@@ -0,0 +1,50 @@
+/*
+  Copyright (c) 2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+#ifndef __ALGORITHM_H__
+#define __ALGORITHM_H__
+
+#include "matrix.h"
+
+
+/* Generalized Minimal Residual Method:
+ * -----------------------------------
+ * Takes a square matrix A and an rhs b and uses GMRES to find an estimate for
+ * x, which is written into the x argument.  num_iters is the maximum number
+ * of iterations to run.
+ * The specified error is relative.
+ */
+void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double err);
+
+
+
+#endif
--- a/examples/gmres/data/c-18/c-18.mtx
+++ b/examples/gmres/data/c-18/c-18.mtx
--- a/examples/gmres/data/c-18/c-18_b.mtx
+++ b/examples/gmres/data/c-18/c-18_b.mtx
--- a/examples/gmres/data/c-21/c-21.mtx
+++ b/examples/gmres/data/c-21/c-21.mtx
--- a/examples/gmres/data/c-21/c-21_b.mtx
+++ b/examples/gmres/data/c-21/c-21_b.mtx
--- a/examples/gmres/data/c-22/c-22.mtx
+++ b/examples/gmres/data/c-22/c-22.mtx
--- a/examples/gmres/data/c-22/c-22_b.mtx
+++ b/examples/gmres/data/c-22/c-22_b.mtx
--- a/examples/gmres/data/c-25/c-25.mtx
+++ b/examples/gmres/data/c-25/c-25.mtx
--- a/examples/gmres/data/c-25/c-25_b.mtx
+++ b/examples/gmres/data/c-25/c-25_b.mtx
--- a/examples/gmres/debug.h
+++ b/examples/gmres/debug.h
@@ -0,0 +1,55 @@
+/*
+  Copyright (c) 2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+#ifndef __DEBUG_H__
+#define __DEBUG_H__
+
+#include <cassert>
+#include <cstdio>   // printf, used by DEBUG_PRINT
+
+
+/**************************************************************\
+| Macros
+\**************************************************************/
+// Debugging is switched on by default for this example.  The guard keeps a
+// DEBUG that was already defined (e.g. -DDEBUG) from being redefined.
+#ifndef DEBUG
+#define DEBUG
+#endif
+
+#ifdef DEBUG
+// ASSERT aborts via assert() when expr is false; DEBUG_PRINT forwards its
+// arguments to printf.
+#define ASSERT(expr) assert(expr)
+#define DEBUG_PRINT(...) printf(__VA_ARGS__)
+#else
+// Expand to a harmless expression so that "ASSERT(x);" and
+// "DEBUG_PRINT(...);" remain well-formed statements.  The arguments are not
+// evaluated.
+#define ASSERT(expr) ((void)0)
+#define DEBUG_PRINT(...) ((void)0)
+#endif
+
+
+#endif
--- a/examples/gmres/main.cpp
+++ b/examples/gmres/main.cpp
@@ -0,0 +1,79 @@
+/*
+  Copyright (c) 2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+#include "matrix.h"
+#include "algorithm.h"
+#include "util.h"
+#include <cmath>
+#include "../timing.h"
+
+
+/* Driver for the gmres example.
+ *
+ * Usage: <program> <input-matrix> <input-rhs> <output-file>
+ *
+ * Loads the matrix A and the rhs b from the given files, runs gmres with at
+ * most A->cols() / 2 iterations and an error bound of .01, and writes the
+ * resulting x to the last command-line argument.  Returns -1 on a usage
+ * error or when either input cannot be loaded.
+ */
+int main (int argc, char **argv)
+{
+    if (argc < 4) {
+        printf("usage: %s <input-matrix> <input-rhs> <output-file>\n", argv[0]);
+        return -1;
+    }
+
+    DEBUG_PRINT("Loading A...\n");
+    Matrix *A = CRSMatrix::matrix_from_mtf(argv[1]);
+    if (A == NULL)
+        return -1;
+    // The cast makes the argument match %lu whatever integer type cols() has.
+    DEBUG_PRINT("... size: %lu\n", (unsigned long)A->cols());
+
+    DEBUG_PRINT("Loading b...\n");
+    Vector *b = Vector::vector_from_mtf(argv[2]);
+    if (b == NULL)
+        return -1;
+
+    Vector x(A->cols());
+    DEBUG_PRINT("Beginning gmres...\n");
+
+    // Time only the solve itself, using the helpers from ../timing.h.
+    reset_and_start_timer();
+    gmres(*A, *b, x, A->cols() / 2, .01);
+    double gmres_cycles = get_elapsed_mcycles();
+
+    // Write result out to file
+    x.to_mtf(argv[argc-1]);
+
+    // Compute residual (double-check)
+#ifdef DEBUG
+    Vector bprime(b->size());
+    A->multiply(x, bprime);
+    Vector resid(bprime.size(), &(bprime[0]));
+    resid.subtract(*b);
+    DEBUG_PRINT("residual error check: %lg\n", resid.norm() / b->norm());
+#endif
+    // Print profiling results
+    DEBUG_PRINT("-- Total mcycles to solve : %.03f --\n", gmres_cycles);
+
+    return 0;
+}
--- a/examples/gmres/matrix.cpp
+++ b/examples/gmres/matrix.cpp
@@ -0,0 +1,246 @@
+/*
+  Copyright (c) 2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+/**************************************************************\
+| Includes
+\**************************************************************/
+#include "matrix.h"
+#include "matrix_ispc.h"
+
+extern "C" {
+#include "mmio.h"
+}
+
+/**************************************************************\
+| DenseMatrix methods
+\**************************************************************/
+// Dense matrix-vector product: r[i] becomes the dot product of v with the
+// i-th row of this matrix (rows are stored back to back, num_cols apart).
+void DenseMatrix::multiply (const Vector &v, Vector &r) const
+{
+    // Dimensionality check
+    ASSERT(v.size() == cols());
+    ASSERT(r.size() == rows());
+
+    // Walk a pointer over the row starts instead of recomputing the offset.
+    double *row_start = entries;
+    for (int i = 0; i < rows(); i++, row_start += num_cols)
+        r[i] = v.dot(row_start);
+}
+
+// Returns a newly allocated Vector built from the storage of the given row;
+// the caller is responsible for deleting it.
+// NOTE(review): the meaning of the trailing 'true' flag is defined by the
+// Vector constructor, which is not visible here -- confirm before relying on it.
+const Vector *DenseMatrix::row (size_t row) const {
+    double *row_start = entries + row * num_cols;
+    return new Vector(num_cols, row_start, true);
+}
+
+// Re-points r at the storage of the given row: afterwards r.entries aliases
+// this matrix's memory (nothing is copied) and r's size is cols().
+void DenseMatrix::row (size_t row, Vector &r) {
+    r._size   = cols();
+    r.entries = entries + cols() * row;
+}
+
+// Copies the num_cols entries of v into the given row of this matrix.
+void DenseMatrix::set_row(size_t row, const Vector &v)
+{
+    ASSERT(v.size() == num_cols);
+
+    double *destination = entries + row * num_cols;
+    memcpy(destination, v.entries, sizeof(double) * num_cols);
+}
+
+
+/**************************************************************\
+| CRSMatrix Methods
+\**************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+#include <algorithm>
+
+
+// One (row, col, value) triple of a sparse matrix.
+struct entry {
+    int row;
+    int col;
+    double val;
+};
+
+// Strict ordering for sorting entries row by row: compares rows first and
+// falls back to the column when both entries are in the same row.
+bool compare_entries(struct entry i, struct entry j) {
+    if (i.row != j.row)
+        return i.row < j.row;
+
+    return i.col < j.col;
+}
+
+#define ERR_OUT(...) { fprintf(stderr, __VA_ARGS__); return NULL; }
+
+/* Reads a sparse, non-complex, square matrix in Matrix Market coordinate
+ * format from 'path' and returns it as a newly allocated CRSMatrix (the
+ * caller owns it).  On any failure a message is printed to stderr and NULL
+ * is returned.  The file is closed on every return path.
+ */
+CRSMatrix *CRSMatrix::matrix_from_mtf (char *path) {
+    FILE *f;
+    MM_typecode matcode;
+
+    int m, n, nz;
+
+    if ((f = fopen(path, "r")) == NULL)
+        ERR_OUT("Error: %s does not name a valid/readable file.\n", path);
+
+    // ERR_OUT returns straight out of the function, so tie the fclose to
+    // scope exit instead of repeating it in front of every return.
+    struct FileCloser {
+        FILE *file;
+        ~FileCloser() { fclose(file); }
+    } closer = { f };
+
+    if (mm_read_banner(f, &matcode) != 0)
+        ERR_OUT("Error: Could not process Matrix Market banner.\n");
+
+    if (mm_is_complex(matcode))
+        ERR_OUT("Error: Application does not support complex numbers.\n");
+
+    if (mm_is_dense(matcode))
+        ERR_OUT("Error: supplied matrix is dense (should be sparse.)\n");
+
+    if (!mm_is_matrix(matcode))
+        ERR_OUT("Error: %s does not encode a matrix.\n", path);
+
+    if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0)
+        ERR_OUT("Error: could not read matrix size from file.\n");
+
+    if (m != n)
+        ERR_OUT("Error: Application does not support non-square matrices.\n");
+
+    if (m < 0 || nz < 0)
+        ERR_OUT("Error: invalid matrix size in file.\n");
+
+    std::vector<struct entry> entries;
+    entries.resize(nz);
+
+    for (int i = 0; i < nz; i++) {
+        if (fscanf(f, "%d %d %lg\n", &entries[i].row, &entries[i].col, &entries[i].val) != 3)
+            ERR_OUT("Error: could not read matrix entry %d from file.\n", i + 1);
+        // Adjust from 1-based to 0-based
+        entries[i].row--;
+        entries[i].col--;
+        // The indices are used unchecked as array subscripts below.
+        if (entries[i].row < 0 || entries[i].row >= m ||
+            entries[i].col < 0 || entries[i].col >= n)
+            ERR_OUT("Error: matrix entry %d lies outside the declared dimensions.\n", i + 1);
+    }
+
+    std::sort(entries.begin(), entries.end(), compare_entries);
+
+    CRSMatrix *M = new CRSMatrix(m, n, nz);
+    int cur_row = -1;
+    for (int i = 0; i < nz; i++) {
+        // Every row up to and including this entry's row starts at (or, for
+        // empty rows, collapses onto) position i.
+        while (entries[i].row > cur_row)
+            M->row_offsets[++cur_row] = i;
+        M->entries[i] = entries[i].val;
+        M->columns[i] = entries[i].col;
+    }
+    // Rows after the last stored entry are empty: give them an offset too, so
+    // that multiply() sees an empty range for them.  This assumes row_offsets
+    // holds one slot per row, which is how multiply() indexes it.
+    while (cur_row < m - 1)
+        M->row_offsets[++cur_row] = nz;
+
+    return M;
+}
+
+/* Reads a non-complex vector (a Matrix Market matrix with exactly one
+ * column, in either array or coordinate format) from 'path' and returns it
+ * as a newly allocated Vector (the caller owns it).  On any failure a message
+ * is printed to stderr and NULL is returned.  The file is closed on every
+ * return path.
+ */
+Vector *Vector::vector_from_mtf (char *path) {
+    FILE *f;
+    MM_typecode matcode;
+
+    // nz is only filled in for the coordinate format; keep it defined anyway.
+    int m, n, nz = 0;
+
+    if ((f = fopen(path, "r")) == NULL)
+        ERR_OUT("Error: %s does not name a valid/readable file.\n", path);
+
+    // ERR_OUT returns straight out of the function, so tie the fclose to
+    // scope exit instead of repeating it in front of every return.
+    struct FileCloser {
+        FILE *file;
+        ~FileCloser() { fclose(file); }
+    } closer = { f };
+
+    if (mm_read_banner(f, &matcode) != 0)
+        ERR_OUT("Error: Could not process Matrix Market banner.\n");
+
+    if (mm_is_complex(matcode))
+        ERR_OUT("Error: Application does not support complex numbers.\n");
+
+    if (mm_is_dense(matcode)) {
+        if (mm_read_mtx_array_size(f, &m, &n) != 0)
+            ERR_OUT("Error: could not read matrix size from file.\n");
+    } else {
+        if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0)
+            ERR_OUT("Error: could not read matrix size from file.\n");
+    }
+    if (n != 1)
+        ERR_OUT("Error: %s does not describe a vector.\n", path);
+
+    if (m < 0 || nz < 0)
+        ERR_OUT("Error: invalid vector size in file.\n");
+
+    Vector *x = new Vector(m);
+
+    if (mm_is_dense(matcode)) {
+        double val;
+        for (int i = 0; i < m; i++) {
+            if (fscanf(f, "%lg\n", &val) != 1) {
+                delete x;
+                ERR_OUT("Error: could not read vector entry %d from file.\n", i + 1);
+            }
+            (*x)[i] = val;
+        }
+    }
+    else {
+        x->zero();
+        double val;
+        int row;
+        int col;
+        for (int i = 0; i < nz; i++) {
+            if (fscanf(f, "%d %d %lg\n", &row, &col, &val) != 3) {
+                delete x;
+                ERR_OUT("Error: could not read vector entry %d from file.\n", i + 1);
+            }
+            // Indices in the file are 1-based; reject anything that would
+            // write outside the m entries of x.
+            if (row < 1 || row > m || col != 1) {
+                delete x;
+                ERR_OUT("Error: vector entry %d lies outside the declared dimensions.\n", i + 1);
+            }
+            (*x)[row-1] = val;
+        }
+    }
+    return x;
+}
+
+#define ERR(...) { fprintf(stderr, __VA_ARGS__); exit(-1); }
+
+// Writes this vector to 'path' as a dense, real, general Matrix Market
+// matrix with size() rows and one column, one entry per line.  If the file
+// cannot be opened, ERR prints a message and exits the process.
+void Vector::to_mtf (char *path) {
+    FILE *f = fopen(path, "w");
+    if (f == NULL)
+        ERR("Error: cannot open/write to %s\n", path);
+
+    // Describe the payload: matrix / real / dense / general.
+    MM_typecode matcode;
+    mm_initialize_typecode(&matcode);
+    mm_set_matrix(&matcode);
+    mm_set_real(&matcode);
+    mm_set_dense(&matcode);
+    mm_set_general(&matcode);
+
+    mm_write_banner(f, matcode);
+    mm_write_mtx_array_size(f, size(), 1);
+
+    int i = 0;
+    while (i < size()) {
+        fprintf(f, "%lg\n", entries[i]);
+        i++;
+    }
+
+    fclose(f);
+}
+
+// Sparse matrix-vector product r = M * v.  The stored entries of row 'row'
+// occupy positions [row_offsets[row], row_offsets[row + 1]); for the last
+// row the range ends at _nonzeroes.
+void CRSMatrix::multiply (const Vector &v, Vector &r) const
+{
+    ASSERT(v.size() == cols());
+    ASSERT(r.size() == rows());
+
+    for (int row = 0; row < rows(); row++)
+    {
+        const int first = row_offsets[row];
+
+        // The last row has no successor to take the end offset from.
+        int past_last;
+        if (row + 1 == rows())
+            past_last = _nonzeroes;
+        else
+            past_last = row_offsets[row + 1];
+
+        double accum = 0;
+        for (int k = first; k < past_last; k++)
+            accum += v[columns[k]] * entries[k];
+
+        r[row] = accum;
+    }
+}
+
+// Drops all stored entries: resets the nonzero count and clears the three
+// storage arrays (values, column indices, row offsets).
+void CRSMatrix::zero ( )
+{
+    _nonzeroes = 0;
+
+    columns.clear();
+    row_offsets.clear();
+    entries.clear();
+}
--- a/examples/gmres/matrix.h
+++ b/examples/gmres/matrix.h
@@ -0,0 +1,279 @@
+/*
+  Copyright (c) 2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+#ifndef __MATRIX_H__
+#define __MATRIX_H__
+
+/**************************************************************\
+| Includes
+\**************************************************************/
+#include <cstring> // size_t
+#include <cstdlib> // malloc, memcpy, etc.
+#include <cmath>   // sqrt
+#include <vector>
+
+#include "debug.h"
+#include "matrix_ispc.h"
+
+
+class DenseMatrix;
+/**************************************************************\
+| Vector class
+\**************************************************************/
+// Dense vector of doubles.
+//
+// The element storage is either owned by the object (malloc'ed in a
+// constructor and released by the destructor) or borrowed (shared_ptr ==
+// true, never freed here).  Element-wise arithmetic is forwarded to the
+// ispc:: kernels declared in matrix_ispc.h.
+//
+// NOTE(review): no copy constructor or assignment operator is declared, so
+// copying an owning Vector by value would make two objects free the same
+// buffer.  Pass Vectors by reference or pointer.
+class Vector {
+ public:
+    // Matrix Market file I/O; both are defined in matrix.cpp.
+    static Vector *vector_from_mtf(char *path);
+    void to_mtf (char *path);
+
+    // Create a vector with `size` entries.  With alloc_mem the storage is
+    // malloc'ed (contents uninitialized) and owned by this object; without
+    // it no storage is attached (entries == NULL) and nothing is freed on
+    // destruction.
+    Vector(size_t size, bool alloc_mem=true)
+        {
+            _size      = size;
+            shared_ptr = !alloc_mem;
+            entries    = alloc_mem ? (double *) malloc(sizeof(double) * _size)
+                                   : NULL;
+        }
+
+    // Create a vector from `size` doubles at `content`.  With share_ptr the
+    // vector aliases `content` and never frees it; otherwise the values are
+    // copied into storage owned by this object.
+    Vector(size_t size, double *content, bool share_ptr=false)
+        {
+            _size      = size;
+            shared_ptr = share_ptr;
+            if (share_ptr) {
+                entries = content;
+            }
+            else {
+                entries = (double *) malloc(sizeof(double) * _size);
+                memcpy(entries, content, sizeof(double) * _size);
+            }
+        }
+
+    ~Vector() { if (!shared_ptr) free(entries); }
+
+    // Element access; the index is checked with ASSERT.
+    const double & operator [] (size_t index) const
+    {
+        ASSERT(index < _size);
+        return *(entries + index);
+    }
+
+    double &operator [] (size_t index)
+    {
+        ASSERT(index < _size);
+        return *(entries + index);
+    }
+
+    // Exact comparison: same length and every pair of entries compares equal.
+    bool operator == (const Vector &v) const
+    {
+        if (v.size() != _size)
+            return false;
+
+        // size_t index so the comparison against _size is not mixed-sign.
+        for (size_t i = 0; i < _size; i++)
+            if (entries[i] != v[i])
+                return false;
+
+        return true;
+    }
+
+    size_t size() const {return _size; }
+
+    // Dot product with another vector of the same length.
+    double dot (const Vector &b) const
+    {
+        ASSERT(b.size() == this->size());
+        return ispc::vector_dot(entries, b.entries, size());
+    }
+
+    // Dot product with a raw array; size() elements of b are read.
+    double dot (const double * const b) const
+    {
+        return ispc::vector_dot(entries, b, size());
+    }
+
+    // Set every entry to 0.
+    void zero ()
+    {
+        ispc::zero(entries, size());
+    }
+
+    // Euclidean norm.  Uses the double-precision sqrt: the previous sqrtf
+    // rounded the dot product to float and threw away about half of the
+    // significant digits.
+    double norm () const { return sqrt(dot(entries)); }
+
+    // Scale to unit length by dividing by norm(); a zero vector divides by 0.
+    void normalize () { this->divide(this->norm()); }
+
+    // In-place element-wise operations: this += a, this -= s.
+    void add (const Vector &a)
+    {
+        ASSERT(size() == a.size());
+        ispc::vector_add(entries, a.entries, size());
+    }
+
+    void subtract (const Vector &s)
+    {
+        ASSERT(size() == s.size());
+        ispc::vector_sub(entries, s.entries, size());
+    }
+
+    // In-place scaling: this *= scalar, this /= scalar.
+    void multiply (double scalar)
+    {
+        ispc::vector_mult(entries, scalar, size());
+    }
+
+    void divide (double scalar)
+    {
+        ispc::vector_div(entries, scalar, size());
+    }
+
+    // this += a * x.  Note: x may be longer than *(this); only its first
+    // size() entries are used.
+    void add_ax (double a, const Vector &x) {
+        ASSERT(x.size() >= size());
+        ispc::vector_add_ax(entries, a, x.entries, size());
+    }
+
+    // Note that copy only copies the first size() elements of the
+    // supplied vector, i.e. the supplied vector can be longer than
+    // this one.  This is useful in least squares calculations.
+    void copy (const Vector &other) {
+        ASSERT(other.size() >= size());
+        memcpy(entries, other.entries, size() * sizeof(double));
+    }
+
+    friend class DenseMatrix;
+
+ private:
+    size_t  _size;       // number of entries
+    bool     shared_ptr; // true when `entries` must not be freed here
+    double  *entries;    // element storage (NULL when none is attached)
+};
+
+
+/**************************************************************\
+| Matrix base class
+\**************************************************************/
+// Abstract base class of the matrix types: it stores the dimensions and
+// declares the operations every concrete matrix must provide.
+class Matrix {
+    friend class Vector;
+
+ public:
+    Matrix(size_t size_r, size_t size_c)
+        : num_rows(size_r), num_cols(size_c)
+        {
+        }
+
+    // Virtual so that destroying a derived matrix through a Matrix pointer
+    // runs the derived destructor; with a non-virtual destructor that is
+    // undefined behaviour.
+    virtual ~Matrix(){}
+
+    size_t rows() const { return num_rows; }
+    size_t cols() const { return num_cols; }
+
+    // r = (*this) * v
+    virtual void multiply (const Vector &v, Vector &r) const = 0;
+    // Make this the zero matrix.
+    virtual void zero () = 0;
+
+ protected:
+    size_t num_rows;
+    size_t num_cols;
+};
+
+/**************************************************************\
+| DenseMatrix class
+\**************************************************************/
+// Dense row-major matrix of doubles: element (r, c) is stored at
+// entries[r * num_cols + c].
+//
+// NOTE(review): the class has no copy constructor or assignment operator.
+// Now that the destructor releases the storage, copying a DenseMatrix by
+// value would free the same buffer twice; pass it by reference.
+class DenseMatrix : public Matrix {
+    friend class Vector;
+
+ public:
+ // Allocate an uninitialized size_r x size_c matrix.
+ DenseMatrix(size_t size_r, size_t size_c) : Matrix(size_r, size_c)
+        {
+            shared_ptr = false;
+            entries = (double *) malloc(size_r * size_c * sizeof(double));
+        }
+
+ // Allocate a size_r x size_c matrix and copy size_r * size_c doubles from
+ // `content` into it.
+ DenseMatrix(size_t size_r, size_t size_c, const double *content) : Matrix (size_r, size_c)
+        {
+            shared_ptr = false;
+            entries = (double *) malloc(size_r * size_c * sizeof(double));
+            memcpy(entries, content, size_r * size_c * sizeof(double));
+        }
+
+    // Release the storage allocated by the constructors (it was previously
+    // never freed).  Both constructors set shared_ptr to false.
+    ~DenseMatrix() { if (!shared_ptr) free(entries); }
+
+    virtual void multiply (const Vector &v, Vector &r) const;
+
+    // Unchecked access to the element in row r, column c.
+    double &operator () (unsigned int r, unsigned int c)
+    {
+        return *(entries + r * num_cols + c);
+    }
+
+    const double &operator () (unsigned int r, unsigned int c) const
+    {
+        return *(entries + r * num_cols + c);
+    }
+
+    // Row accessors; defined in matrix.cpp.
+    const Vector *row(size_t row) const;
+    void          row(size_t row, Vector &r);
+    void      set_row(size_t row, const Vector &v);
+
+    virtual void zero() { ispc::zero(entries, rows() * cols()); }
+
+    // Copy every element of `other`, which must have the same dimensions.
+    void copy (const DenseMatrix &other)
+    {
+        ASSERT(rows() == other.rows());
+        ASSERT(cols() == other.cols());
+        memcpy(entries, other.entries, rows() * cols() * sizeof(double));
+    }
+
+ private:
+    double *entries;     // rows() * cols() doubles, row-major
+    bool shared_ptr;     // true when `entries` must not be freed here
+};
+
+/**************************************************************\
+| CRSMatrix (compressed row storage, a sparse matrix format)
+\**************************************************************/
+// Sparse matrix in compressed row form.  `entries` and `columns` hold one
+// value and one column index per stored element; `row_offsets` holds, for
+// each row, the position in those two arrays at which the row begins.
+class CRSMatrix : public Matrix {
+ public:
+    // Reserve room for a size_r x size_c matrix with `nonzeroes` stored
+    // elements.  All three arrays start out zero-filled.
+    CRSMatrix (size_t size_r, size_t size_c, size_t nonzeroes)
+        : Matrix(size_r, size_c),
+          _nonzeroes(nonzeroes),
+          entries(nonzeroes),
+          row_offsets(size_r),
+          columns(nonzeroes)
+        {
+        }
+
+    // r = (*this) * v
+    virtual void multiply(const Vector &v, Vector &r) const;
+
+    virtual void zero();
+
+    // Build a matrix from a Matrix Market file (defined in matrix.cpp).
+    static CRSMatrix *matrix_from_mtf (char *path);
+
+ private:
+    unsigned int        _nonzeroes;   // number of stored elements
+    std::vector<double>  entries;     // stored values
+    std::vector<int>     row_offsets; // start of each row in entries/columns
+    std::vector<int>     columns;     // column index of each stored value
+};
+
+#endif
--- a/examples/gmres/matrix.ispc
+++ b/examples/gmres/matrix.ispc
@@ -0,0 +1,122 @@
+/*
+  Copyright (c) 2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+/**************************************************************\
+| General
+\**************************************************************/
+// Store 0.0 into the first `size` elements of `data`.
+export void zero (uniform double data[],
+                  uniform int size)
+{
+    foreach (idx = 0 ... size) {
+        data[idx] = 0.0;
+    }
+}
+
+
+/**************************************************************\
+| Vector helpers
+\**************************************************************/
+// Element-wise a[k] = a[k] + b[k] for the first `size` elements.
+export void vector_add (uniform double a[],
+                        const uniform double b[],
+                        const uniform int size)
+{
+    foreach (idx = 0 ... size) {
+        a[idx] = a[idx] + b[idx];
+    }
+}
+
+// Element-wise a[k] = a[k] - b[k] for the first `size` elements.
+export void vector_sub (uniform double a[],
+                        const uniform double b[],
+                        const uniform int size)
+{
+    foreach (idx = 0 ... size) {
+        a[idx] = a[idx] - b[idx];
+    }
+}
+
+// Scale the first `size` elements of `a` by the scalar `b`.
+export void vector_mult (uniform double a[],
+                         const uniform double b,
+                         const uniform int size)
+{
+    foreach (idx = 0 ... size) {
+        a[idx] = a[idx] * b;
+    }
+}
+
+// Divide the first `size` elements of `a` by the scalar `b`.
+export void vector_div (uniform double a[],
+                        const uniform double b,
+                        const uniform int size)
+{
+    foreach (idx = 0 ... size) {
+        a[idx] = a[idx] / b;
+    }
+}
+
+// Scaled accumulate: r[k] = r[k] + a * x[k] for the first `size` elements.
+export void vector_add_ax (uniform double r[],
+                           const uniform double a,
+                           const uniform double x[],
+                           const uniform int    size)
+{
+    foreach (idx = 0 ... size) {
+        r[idx] = r[idx] + a * x[idx];
+    }
+}
+
+// Dot product of the first `size` elements of `a` and `b`.  Each program
+// instance accumulates its own partial sum, and the partial sums are
+// combined with reduce_add() at the end.
+export uniform double vector_dot (const uniform double a[],
+                                  const uniform double b[],
+                                  const uniform int size)
+{
+    varying double partial = 0.0;
+    foreach (idx = 0 ... size) {
+        partial += a[idx] * b[idx];
+    }
+    return reduce_add(partial);
+}
+
+/**************************************************************\
+| Matrix helpers
+\**************************************************************/
+// Sparse matrix-vector product: for every row, r[row] receives the sum of
+// v[columns[j]] * entries[j] over the positions j belonging to that row.
+//
+//   entries      stored values
+//   columns      column index of each stored value
+//   row_offsets  position in entries/columns where each row starts
+//   rows         number of rows (and of elements written to r)
+//   cols         not used by the body
+//   nonzeroes    total number of stored values; ends the last row
+//   v            input vector
+//   r            output vector
+//
+// NOTE(review): columns[] and row_offsets[] are declared as double arrays
+// although they are used as indices/offsets, while CRSMatrix in matrix.h
+// keeps the corresponding data in std::vector<int>.  Confirm the intended
+// element type before calling this from C++.
+export void sparse_multiply (const uniform double entries[],
+                             const uniform double columns[],
+                             const uniform double row_offsets[],
+                             const uniform int rows,
+                             const uniform int cols,
+                             const uniform int nonzeroes,
+                             const uniform double v[],
+                             uniform double r[]) 
+{
+    foreach (row = 0 ... rows) {
+        // [row_offset, next_offset) is the range of stored values in this row
+        int row_offset = row_offsets[row];
+        int next_offset = ((row + 1 == rows) ? nonzeroes : row_offsets[row+1]);
+
+        double sum = 0;
+        for (int j = row_offset; j < next_offset; j++)
+            sum += v[columns[j]] * entries[j];
+        r[row] = sum;
+    }
+}
+
--- a/examples/gmres/mmio.c
+++ b/examples/gmres/mmio.c
@@ -0,0 +1,511 @@
+/* 
+*   Matrix Market I/O library for ANSI C
+*
+*   See http://math.nist.gov/MatrixMarket for details.
+*
+*
+*/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+#include "mmio.h"
+
+/*
+*  Read a real, sparse (coordinate) Matrix Market file into freshly
+*  malloc'ed triplet arrays.
+*
+*  fname         path of the file to read
+*  M_, N_, nz_   receive the row count, column count and entry count
+*  val_, I_, J_  receive arrays of nz values, row indices and column
+*                indices; the indices are converted from 1-based to
+*                0-based.  The caller owns the arrays and must free() them.
+*
+*  Returns 0 on success and -1 on any failure (file cannot be opened, bad
+*  banner, unsupported type, unreadable size line, out of memory, or fewer
+*  than nz well-formed entries).  On failure the file is closed, nothing
+*  stays allocated and none of the output parameters is written.
+*/
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
+                double **val_, int **I_, int **J_)
+{
+    FILE *f;
+    MM_typecode matcode;
+    int M, N, nz;
+    int i;
+    int ret = -1;
+    size_t count;
+    double *val = NULL;
+    int *I = NULL, *J = NULL;
+ 
+    if ((f = fopen(fname, "r")) == NULL)
+            return -1;
+ 
+    if (mm_read_banner(f, &matcode) != 0)
+    {
+        printf("mm_read_unsymetric: Could not process Matrix Market banner ");
+        printf(" in file [%s]\n", fname);
+        goto done;
+    }
+ 
+    if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
+            mm_is_sparse(matcode)))
+    {
+        /* the description is malloc'ed (and may be NULL): release it */
+        char *type_str = mm_typecode_to_str(matcode);
+
+        fprintf(stderr, "Sorry, this application does not support ");
+        fprintf(stderr, "Market Market type: [%s]\n",
+                type_str != NULL ? type_str : "unknown");
+        free(type_str);
+        goto done;
+    }
+ 
+    /* find out size of sparse matrix: M, N, nz .... */
+ 
+    if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0 || nz < 0)
+    {
+        fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
+        goto done;
+    }
+ 
+    /* reserve memory for matrices; always ask for at least one element */
+    /* because malloc(0) is allowed to return NULL                      */
+ 
+    count = (nz > 0) ? (size_t) nz : 1;
+    I = (int *) malloc(count * sizeof(int));
+    J = (int *) malloc(count * sizeof(int));
+    val = (double *) malloc(count * sizeof(double));
+    if (I == NULL || J == NULL || val == NULL)
+    {
+        fprintf(stderr, "read_unsymmetric_sparse(): out of memory.\n");
+        goto done;
+    }
+ 
+    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
+    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
+    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
+ 
+    for (i=0; i<nz; i++)
+    {
+        if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]) != 3)
+        {
+            fprintf(stderr, "read_unsymmetric_sparse(): could not read entry %d.\n",
+                    i + 1);
+            goto done;
+        }
+        I[i]--;  /* adjust from 1-based to 0-based */
+        J[i]--;
+    }
+ 
+    *M_ = M;
+    *N_ = N;
+    *nz_ = nz;
+    *val_ = val;
+    *I_ = I;
+    *J_ = J;
+    ret = 0;
+ 
+done:
+    if (ret != 0)
+    {
+        free(val);
+        free(I);
+        free(J);
+    }
+    fclose(f);
+ 
+    return ret;
+}
+
+/*
+*  Report whether a typecode describes a combination this library accepts.
+*  Returns 0 for anything that is not a matrix, for pattern matrices that
+*  are dense, Hermitian or skew-symmetric, and for real Hermitian matrices;
+*  returns 1 otherwise.
+*/
+int mm_is_valid(MM_typecode matcode)
+{
+    int valid = 1;
+
+    if (!mm_is_matrix(matcode))
+        valid = 0;
+    else if (mm_is_pattern(matcode) &&
+             (mm_is_dense(matcode) || mm_is_hermitian(matcode) ||
+              mm_is_skew(matcode)))
+        valid = 0;
+    else if (mm_is_real(matcode) && mm_is_hermitian(matcode))
+        valid = 0;
+
+    return valid;
+}
+
+int mm_read_banner(FILE *f, MM_typecode *matcode)
+{
+    char line[MM_MAX_LINE_LENGTH];
+    char banner[MM_MAX_TOKEN_LENGTH];
+    char mtx[MM_MAX_TOKEN_LENGTH]; 
+    char crd[MM_MAX_TOKEN_LENGTH];
+    char data_type[MM_MAX_TOKEN_LENGTH];
+    char storage_scheme[MM_MAX_TOKEN_LENGTH];
+    char *p;
+
+
+    mm_clear_typecode(matcode);  
+
+    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL) 
+        return MM_PREMATURE_EOF;
+
+    if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type, 
+        storage_scheme) != 5)
+        return MM_PREMATURE_EOF;
+
+    for (p=mtx; *p!='\0'; *p=tolower(*p),p++);  /* convert to lower case */
+    for (p=crd; *p!='\0'; *p=tolower(*p),p++);  
+    for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
+    for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);
+
+    /* check for banner */
+    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
+        return MM_NO_HEADER;
+
+    /* first field should be "mtx" */
+    if (strcmp(mtx, MM_MTX_STR) != 0)
+        return  MM_UNSUPPORTED_TYPE;
+    mm_set_matrix(matcode);
+
+
+    /* second field describes whether this is a sparse matrix (in coordinate
+            storgae) or a dense array */
+
+
+    if (strcmp(crd, MM_SPARSE_STR) == 0)
+        mm_set_sparse(matcode);
+    else
+    if (strcmp(crd, MM_DENSE_STR) == 0)
+            mm_set_dense(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+    
+
+    /* third field */
+
+    if (strcmp(data_type, MM_REAL_STR) == 0)
+        mm_set_real(matcode);
+    else
+    if (strcmp(data_type, MM_COMPLEX_STR) == 0)
+        mm_set_complex(matcode);
+    else
+    if (strcmp(data_type, MM_PATTERN_STR) == 0)
+        mm_set_pattern(matcode);
+    else
+    if (strcmp(data_type, MM_INT_STR) == 0)
+        mm_set_integer(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+    
+
+    /* fourth field */
+
+    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
+        mm_set_general(matcode);
+    else
+    if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
+        mm_set_symmetric(matcode);
+    else
+    if (strcmp(storage_scheme, MM_HERM_STR) == 0)
+        mm_set_hermitian(matcode);
+    else
+    if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
+        mm_set_skew(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+        
+
+    return 0;
+}
+
+/*
+*  Write the size line "M N nz" of a coordinate-format file.
+*  Returns 0 on success and MM_COULD_NOT_WRITE_FILE if the write fails.
+*/
+int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
+{
+    /* fprintf() returns the number of characters written, or a negative */
+    /* value on error -- not the number of converted arguments.  The old  */
+    /* "!= 3" test therefore reported a failure on every call.            */
+    if (fprintf(f, "%d %d %d\n", M, N, nz) < 0)
+        return MM_COULD_NOT_WRITE_FILE;
+
+    return 0;
+}
+
+/*
+*  Read the size line "M N nz" of a coordinate-format file, skipping the
+*  '%' comment lines (and any blank lines) that precede it.
+*
+*  Returns 0 on success.  Returns MM_PREMATURE_EOF, with *M, *N and *nz set
+*  to 0, when the input ends or does not contain three integers.
+*/
+int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
+{
+    char line[MM_MAX_LINE_LENGTH];
+
+    /* set return null parameter values, in case we exit with errors */
+    *M = *N = *nz = 0;
+
+    /* now continue scanning until you reach the end-of-comments */
+    do 
+    {
+        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
+            return MM_PREMATURE_EOF;
+    }while (line[0] == '%');
+
+    /* line[] is either blank or has M, N, nz */
+    if (sscanf(line, "%d %d %d", M, N, nz) == 3)
+        return 0;
+
+    /* Blank line: "%d" skips leading white space, newlines included, so  */
+    /* one call reaches the size line.  A short count means the input is  */
+    /* malformed; retrying, as the old loop did, would stop at the same   */
+    /* unread character forever.                                          */
+    if (fscanf(f, "%d %d %d", M, N, nz) != 3)
+    {
+        *M = *N = *nz = 0;
+        return MM_PREMATURE_EOF;
+    }
+
+    return 0;
+}
+
+
+/*
+*  Read the size line "M N" of an array-format file, skipping the '%'
+*  comment lines (and any blank lines) that precede it.
+*
+*  Returns 0 on success.  Returns MM_PREMATURE_EOF, with *M and *N set to
+*  0, when the input ends or does not contain two integers.
+*/
+int mm_read_mtx_array_size(FILE *f, int *M, int *N)
+{
+    char line[MM_MAX_LINE_LENGTH];
+
+    /* set return null parameter values, in case we exit with errors */
+    *M = *N = 0;
+
+    /* now continue scanning until you reach the end-of-comments */
+    do 
+    {
+        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
+            return MM_PREMATURE_EOF;
+    }while (line[0] == '%');
+
+    /* line[] is either blank or has M, N */
+    if (sscanf(line, "%d %d", M, N) == 2)
+        return 0;
+
+    /* Blank line: "%d" skips leading white space, newlines included, so  */
+    /* one call reaches the size line.  A short count means the input is  */
+    /* malformed; retrying, as the old loop did, would stop at the same   */
+    /* unread character forever.                                          */
+    if (fscanf(f, "%d %d", M, N) != 2)
+    {
+        *M = *N = 0;
+        return MM_PREMATURE_EOF;
+    }
+
+    return 0;
+}
+
+/*
+*  Write the size line "M N" of an array-format file.
+*  Returns 0 on success and MM_COULD_NOT_WRITE_FILE if the write fails.
+*/
+int mm_write_mtx_array_size(FILE *f, int M, int N)
+{
+    /* fprintf() returns the number of characters written, or a negative */
+    /* value on error -- not the number of converted arguments.  The old  */
+    /* "!= 2" test reported a failure for every size line longer than two */
+    /* characters, i.e. always.                                           */
+    if (fprintf(f, "%d %d\n", M, N) < 0)
+        return MM_COULD_NOT_WRITE_FILE;
+
+    return 0;
+}
+
+
+
+/*-------------------------------------------------------------------------*/
+
+/******************************************************************/
+/* use when I[], J[], and val[] are already allocated             */
+/******************************************************************/
+
+/*
+*  Read nz coordinate entries from f into caller-allocated arrays.
+*
+*  Complex data stores two doubles per entry (val[2*i], val[2*i+1]), real
+*  data one, and pattern data none (val is not touched).  M and N are not
+*  used.  Indices are stored exactly as read.
+*
+*  Returns 0 on success, MM_PREMATURE_EOF as soon as an entry cannot be
+*  read in full, and MM_UNSUPPORTED_TYPE for any other data type.
+*/
+int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
+        double val[], MM_typecode matcode)
+{
+    int k = 0;
+
+    if (mm_is_complex(matcode))
+    {
+        while (k < nz)
+        {
+            double *pair = &val[2*k];
+
+            if (fscanf(f, "%d %d %lg %lg", &I[k], &J[k], pair, pair + 1) != 4)
+                return MM_PREMATURE_EOF;
+            k++;
+        }
+        return 0;
+    }
+
+    if (mm_is_real(matcode))
+    {
+        while (k < nz)
+        {
+            if (fscanf(f, "%d %d %lg\n", &I[k], &J[k], &val[k]) != 3)
+                return MM_PREMATURE_EOF;
+            k++;
+        }
+        return 0;
+    }
+
+    if (mm_is_pattern(matcode))
+    {
+        while (k < nz)
+        {
+            if (fscanf(f, "%d %d", &I[k], &J[k]) != 2)
+                return MM_PREMATURE_EOF;
+            k++;
+        }
+        return 0;
+    }
+
+    return MM_UNSUPPORTED_TYPE;
+}
+
+/*
+*  Read a single coordinate entry from f.
+*
+*  Complex data fills *I, *J, *real and *imag; real data fills *I, *J and
+*  *real; pattern data fills only *I and *J.
+*
+*  Returns 0 on success, MM_PREMATURE_EOF when the entry cannot be read in
+*  full, and MM_UNSUPPORTED_TYPE for any other data type.
+*/
+int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
+        double *real, double *imag, MM_typecode matcode)
+{
+    int complete;
+
+    if (mm_is_complex(matcode))
+        complete = (fscanf(f, "%d %d %lg %lg", I, J, real, imag) == 4);
+    else if (mm_is_real(matcode))
+        complete = (fscanf(f, "%d %d %lg\n", I, J, real) == 3);
+    else if (mm_is_pattern(matcode))
+        complete = (fscanf(f, "%d %d", I, J) == 2);
+    else
+        return MM_UNSUPPORTED_TYPE;
+
+    return complete ? 0 : MM_PREMATURE_EOF;
+}
+
+
+/************************************************************************
+    mm_read_mtx_crd()  reads a whole coordinate-format file: it fills M, N,
+                        nz, the index arrays I and J, the array of values,
+                        and the type code, e.g. 'MCRS'
+
+                        fname "stdin" reads from standard input, which is
+                        left open; any other file is closed before return.
+
+                        *I and *J are malloc'ed with nz elements each.
+                        *val holds nz doubles for real data, 2*nz for
+                        complex data (nz pairs of real/imaginary values)
+                        and is NULL for pattern data.  The caller frees
+                        all three.
+
+                        Returns 0 on success, otherwise an MM_* error
+                        code.  MM_COULD_NOT_READ_FILE is also used for a
+                        negative nz and for running out of memory.  When
+                        an error is detected after the size line has been
+                        read, nothing stays allocated and *I, *J and *val
+                        are NULL.  Integer matrices, for which there is no
+                        reader, give MM_UNSUPPORTED_TYPE (the old code
+                        returned 0 with I and J unfilled).
+************************************************************************/
+
+int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, 
+        double **val, MM_typecode *matcode)
+{
+    int ret_code;
+    FILE *f;
+    size_t count;
+    size_t vals_per_entry;
+
+    if (strcmp(fname, "stdin") == 0) f=stdin;
+    else
+    if ((f = fopen(fname, "r")) == NULL)
+        return MM_COULD_NOT_READ_FILE;
+
+    if ((ret_code = mm_read_banner(f, matcode)) != 0)
+        goto done;
+
+    if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) && 
+            mm_is_matrix(*matcode)))
+    {
+        ret_code = MM_UNSUPPORTED_TYPE;
+        goto done;
+    }
+
+    if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
+        goto done;
+
+    if (*nz < 0)
+    {
+        ret_code = MM_COULD_NOT_READ_FILE;
+        goto done;
+    }
+
+    if (mm_is_complex(*matcode))
+        vals_per_entry = 2;
+    else if (mm_is_real(*matcode))
+        vals_per_entry = 1;
+    else if (mm_is_pattern(*matcode))
+        vals_per_entry = 0;
+    else
+    {
+        ret_code = MM_UNSUPPORTED_TYPE;
+        goto done;
+    }
+
+    /* always ask for at least one element: malloc(0) may return NULL */
+    count = (*nz > 0) ? (size_t) *nz : 1;
+
+    *I = (int *)  malloc(count * sizeof(int));
+    *J = (int *)  malloc(count * sizeof(int));
+    *val = NULL;
+    if (vals_per_entry > 0)
+        *val = (double *) malloc(count * vals_per_entry * sizeof(double));
+
+    if (*I == NULL || *J == NULL || (vals_per_entry > 0 && *val == NULL))
+        ret_code = MM_COULD_NOT_READ_FILE;
+    else
+        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
+                *matcode);
+
+    if (ret_code != 0)
+    {
+        free(*I);
+        free(*J);
+        free(*val);
+        *I = NULL;
+        *J = NULL;
+        *val = NULL;
+    }
+
+done:
+    if (f != stdin) fclose(f);
+    return ret_code;
+}
+
+/*
+*  Write the header line: the Matrix Market banner followed by the textual
+*  form of matcode.
+*
+*  Returns 0 on success, MM_UNSUPPORTED_TYPE when matcode has no textual
+*  form, and MM_COULD_NOT_WRITE_FILE if the write fails.
+*/
+int mm_write_banner(FILE *f, MM_typecode matcode)
+{
+    char *str = mm_typecode_to_str(matcode);
+    int ret_code;
+
+    /* mm_typecode_to_str() returns NULL for codes it cannot describe; */
+    /* passing NULL to "%s" is undefined behaviour                      */
+    if (str == NULL)
+        return MM_UNSUPPORTED_TYPE;
+
+    ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
+    free(str);
+
+    /* fprintf() returns a character count, or a negative value on error; */
+    /* the old "!= 2" test reported a failure on every call               */
+    if (ret_code < 0)
+        return MM_COULD_NOT_WRITE_FILE;
+
+    return 0;
+}
+
+/*
+*  Write a matrix in coordinate format to fname ("stdout" selects standard
+*  output, which is left open).
+*
+*  The banner, type and size lines are followed by nz entries: "I J" for
+*  pattern data, "I J value" for real data and "I J real imag" for complex
+*  data (val[2*i], val[2*i+1]).  Indices are written exactly as given.
+*
+*  Returns 0 on success, MM_COULD_NOT_WRITE_FILE when the file cannot be
+*  opened, and MM_UNSUPPORTED_TYPE when matcode has no textual form or its
+*  data type is none of the three above.
+*/
+int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
+        double val[], MM_typecode matcode)
+{
+    FILE *f;
+    int i;
+    char *type_str;
+
+    /* Build the type description first: it can be NULL, which must not */
+    /* reach "%s", and it is malloc'ed, so it has to be released.       */
+    type_str = mm_typecode_to_str(matcode);
+    if (type_str == NULL)
+        return MM_UNSUPPORTED_TYPE;
+
+    if (strcmp(fname, "stdout") == 0) 
+        f = stdout;
+    else
+    if ((f = fopen(fname, "w")) == NULL)
+    {
+        free(type_str);
+        return MM_COULD_NOT_WRITE_FILE;
+    }
+    
+    /* print banner followed by typecode */
+    fprintf(f, "%s ", MatrixMarketBanner);
+    fprintf(f, "%s\n", type_str);
+    free(type_str);
+
+    /* print matrix sizes and nonzeros */
+    fprintf(f, "%d %d %d\n", M, N, nz);
+
+    /* print values */
+    if (mm_is_pattern(matcode))
+        for (i=0; i<nz; i++)
+            fprintf(f, "%d %d\n", I[i], J[i]);
+    else
+    if (mm_is_real(matcode))
+        for (i=0; i<nz; i++)
+            fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
+    else
+    if (mm_is_complex(matcode))
+        for (i=0; i<nz; i++)
+            fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i], 
+                        val[2*i+1]);
+    else
+    {
+        if (f != stdout) fclose(f);
+        return MM_UNSUPPORTED_TYPE;
+    }
+
+    if (f !=stdout) fclose(f);
+
+    return 0;
+}
+  
+
+/**
+*  Create a new copy of a string s.  mm_strdup() is a common routine, but
+*  not part of ANSI C, so it is included here.  Used by mm_typecode_to_str().
+*
+*  Returns a malloc'ed copy that the caller must free(), or NULL when the
+*  allocation fails.
+*/
+char *mm_strdup(const char *s)
+{
+	size_t len = strlen(s);
+	char *s2 = (char *) malloc(len + 1);
+
+	/* never hand a NULL destination to strcpy() */
+	if (s2 == NULL)
+		return NULL;
+	return strcpy(s2, s);
+}
+
+/*
+*  Build the textual form of a typecode: the four words of the header line
+*  that follow the banner, separated by single spaces (for example
+*  "matrix coordinate real general").
+*
+*  Returns a malloc'ed string that the caller must free(), or NULL when any
+*  of the four fields holds an unknown code or the allocation fails.
+*/
+char  *mm_typecode_to_str(MM_typecode matcode)
+{
+    char buffer[MM_MAX_LINE_LENGTH];
+    const char *types[4];
+
+    /* check for MTX type; the old code only set a flag here and then */
+    /* printed the uninitialised types[0]                             */
+    if (!mm_is_matrix(matcode))
+        return NULL;
+    types[0] = MM_MTX_STR;
+
+    /* check for CRD or ARR matrix */
+    if (mm_is_sparse(matcode))
+        types[1] = MM_SPARSE_STR;
+    else if (mm_is_dense(matcode))
+        types[1] = MM_DENSE_STR;
+    else
+        return NULL;
+
+    /* check for element data type */
+    if (mm_is_real(matcode))
+        types[2] = MM_REAL_STR;
+    else if (mm_is_complex(matcode))
+        types[2] = MM_COMPLEX_STR;
+    else if (mm_is_pattern(matcode))
+        types[2] = MM_PATTERN_STR;
+    else if (mm_is_integer(matcode))
+        types[2] = MM_INT_STR;
+    else
+        return NULL;
+
+    /* check for symmetry type */
+    if (mm_is_general(matcode))
+        types[3] = MM_GENERAL_STR;
+    else if (mm_is_symmetric(matcode))
+        types[3] = MM_SYMM_STR;
+    else if (mm_is_hermitian(matcode))
+        types[3] = MM_HERM_STR;
+    else if (mm_is_skew(matcode))
+        types[3] = MM_SKEW_STR;
+    else
+        return NULL;
+
+    /* the four words are short fixed strings, so they always fit */
+    sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
+    return mm_strdup(buffer);
+
+}
--- a/examples/gmres/mmio.h
+++ b/examples/gmres/mmio.h
@@ -0,0 +1,135 @@
+/* 
+*   Matrix Market I/O library for ANSI C
+*
+*   See http://math.nist.gov/MatrixMarket for details.
+*
+*
+*/
+
+#ifndef MM_IO_H
+#define MM_IO_H
+
+/* size of the buffers into which lines of a file are read */
+#define MM_MAX_LINE_LENGTH 1025
+/* Text that opens the header line.  Both '%' characters are part of the
+   text: mmio.c only passes this macro as an argument (to strncmp and to a
+   "%s" conversion), never as a format string. */
+#define MatrixMarketBanner "%%MatrixMarket"
+/* size of the buffers that hold one word of the header line */
+#define MM_MAX_TOKEN_LENGTH 64
+
+/* Four characters describing a matrix: object, format, data type and
+   symmetry, in that order.  Use the mm_is_... and mm_set_... macros below
+   rather than touching the characters directly. */
+typedef char MM_typecode[4];
+
+#include <stdio.h>
+
+/* Textual form of a typecode as a malloc'ed string (the caller frees it),
+   or NULL. */
+char *mm_typecode_to_str(MM_typecode matcode);
+
+/* Read the header line into *matcode; returns 0 or an MM_* error code. */
+int mm_read_banner(FILE *f, MM_typecode *matcode);
+/* Skip '%' comment lines, then read the size line of a coordinate file
+   (M N nz) or of an array file (M N); return 0 or MM_PREMATURE_EOF. */
+int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
+int mm_read_mtx_array_size(FILE *f, int *M, int *N);
+
+/* Write the header line; returns 0 or an MM_* error code. */
+int mm_write_banner(FILE *f, MM_typecode matcode);
+/* Write the size line of a coordinate file (M N nz) or of an array file
+   (M N); return 0 or MM_COULD_NOT_WRITE_FILE. */
+int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
+int mm_write_mtx_array_size(FILE *f, int M, int N);
+
+
+/********************* MM_typecode query functions ***************************/
+
+/* Each macro takes an MM_typecode (not a pointer to one) and yields
+   non-zero when the corresponding character holds the given code. */
+
+/* [0]: kind of object */
+#define mm_is_matrix(typecode)	((typecode)[0]=='M')
+
+/* [1]: format; "sparse" and "coordinate" test the same code, as do
+   "dense" and "array" */
+#define mm_is_sparse(typecode)	((typecode)[1]=='C')
+#define mm_is_coordinate(typecode)((typecode)[1]=='C')
+#define mm_is_dense(typecode)	((typecode)[1]=='A')
+#define mm_is_array(typecode)	((typecode)[1]=='A')
+
+/* [2]: data type of the entries */
+#define mm_is_complex(typecode)	((typecode)[2]=='C')
+#define mm_is_real(typecode)		((typecode)[2]=='R')
+#define mm_is_pattern(typecode)	((typecode)[2]=='P')
+#define mm_is_integer(typecode) ((typecode)[2]=='I')
+
+/* [3]: symmetry */
+#define mm_is_symmetric(typecode)((typecode)[3]=='S')
+#define mm_is_general(typecode)	((typecode)[3]=='G')
+#define mm_is_skew(typecode)	((typecode)[3]=='K')
+#define mm_is_hermitian(typecode)((typecode)[3]=='H')
+
+/* returns 1 for combinations the library accepts, 0 otherwise */
+int mm_is_valid(MM_typecode matcode);		/* too complex for a macro */
+
+
+/********************* MM_typecode modify functions ***************************/
+
+/* Each macro takes a POINTER to an MM_typecode and stores one code
+   character in it.  The argument is parenthesised before it is
+   dereferenced, so any pointer expression (for example "codes + 1") is
+   handled correctly. */
+
+#define mm_set_matrix(typecode)	((*(typecode))[0]='M')
+#define mm_set_coordinate(typecode)	((*(typecode))[1]='C')
+#define mm_set_array(typecode)	((*(typecode))[1]='A')
+#define mm_set_dense(typecode)	mm_set_array(typecode)
+#define mm_set_sparse(typecode)	mm_set_coordinate(typecode)
+
+#define mm_set_complex(typecode)((*(typecode))[2]='C')
+#define mm_set_real(typecode)	((*(typecode))[2]='R')
+#define mm_set_pattern(typecode)((*(typecode))[2]='P')
+#define mm_set_integer(typecode)((*(typecode))[2]='I')
+
+
+#define mm_set_symmetric(typecode)((*(typecode))[3]='S')
+#define mm_set_general(typecode)((*(typecode))[3]='G')
+#define mm_set_skew(typecode)	((*(typecode))[3]='K')
+#define mm_set_hermitian(typecode)((*(typecode))[3]='H')
+
+/* reset to "nothing chosen": the first three fields become blanks and the
+   symmetry field becomes 'G' (general) */
+#define mm_clear_typecode(typecode) ((*(typecode))[0]=(*(typecode))[1]= \
+									(*(typecode))[2]=' ',(*(typecode))[3]='G')
+
+#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
+
+
+/********************* Matrix Market error codes ***************************/
+
+
+/* Non-zero values returned by the mm_... functions; 0 means success. */
+
+/* the input file could not be opened */
+#define MM_COULD_NOT_READ_FILE	11
+/* a line, or the expected number of fields, could not be read */
+#define MM_PREMATURE_EOF		12
+/* not referenced by mmio.c */
+#define MM_NOT_MTX				13
+/* the first line does not start with the Matrix Market banner */
+#define MM_NO_HEADER			14
+/* a header word or typecode is not one the library handles */
+#define MM_UNSUPPORTED_TYPE		15
+/* not referenced by mmio.c */
+#define MM_LINE_TOO_LONG		16
+/* the output file could not be opened or written */
+#define MM_COULD_NOT_WRITE_FILE	17
+
+
+/******************** Matrix Market internal definitions ********************
+
+   MM_matrix_typecode: 4-character sequence
+
+                     object     sparse/     data        storage
+                                dense       type        scheme
+
+   string position:  [0]        [1]         [2]         [3]
+
+   Matrix typecode:  M(atrix)   C(oord)     R(eal)      G(eneral)
+                                A(rray)     C(omplex)   H(ermitian)
+                                            P(attern)   S(ymmetric)
+                                            I(nteger)   K (skew)
+
+ ***********************************************************************/
+
+/* Words of the header line, in lower case; mm_read_banner() lower-cases
+   the words it reads before comparing them with these. */
+
+/* object */
+#define MM_MTX_STR		"matrix"
+/* format: ARRAY/DENSE are two names for one word, as are
+   COORDINATE/SPARSE */
+#define MM_ARRAY_STR	"array"
+#define MM_DENSE_STR	"array"
+#define MM_COORDINATE_STR "coordinate" 
+#define MM_SPARSE_STR	"coordinate"
+/* data type */
+#define MM_COMPLEX_STR	"complex"
+#define MM_REAL_STR		"real"
+#define MM_INT_STR		"integer"
+/* symmetry */
+#define MM_GENERAL_STR  "general"
+#define MM_SYMM_STR		"symmetric"
+#define MM_HERM_STR		"hermitian"
+#define MM_SKEW_STR		"skew-symmetric"
+/* data type (continued) */
+#define MM_PATTERN_STR  "pattern"
+
+
+/*  high level routines */
+
+/* Write a complete coordinate-format file; fname "stdout" selects standard
+   output.  Returns 0 or an MM_* error code. */
+int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
+		 double val[], MM_typecode matcode);
+/* Read nz entries into arrays the caller has already allocated; complex
+   data uses two doubles of val[] per entry.  Returns 0 or an MM_* error
+   code. */
+int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
+		double val[], MM_typecode matcode);
+/* Read a single entry.  Returns 0 or an MM_* error code. */
+int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
+			MM_typecode matcode);
+
+/* Read a real sparse matrix into newly malloc'ed arrays with 0-based
+   indices.  Returns 0 on success and -1 on failure. */
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
+                double **val_, int **I_, int **J_);
+
+
+
+#endif
--- a/examples/gmres/util.h
+++ b/examples/gmres/util.h
@@ -0,0 +1,53 @@
+/*
+  Copyright (c) 2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+#ifndef __UTIL_H__
+#define __UTIL_H__
+
+#include <stdio.h>
+#include "matrix.h"
+
+/* Dump M to stdout: a "Matrix <name>:" header, one line per row (labels are 1-based), then a blank line. */
+inline void printMatrix (DenseMatrix &M, const char *name) {
+    printf("Matrix %s:\n", name);
+    for (int r = 0; r < M.rows(); ++r) {
+        printf("row %2d: ", r + 1);
+        for (int c = 0; c < M.cols(); ++c)
+            printf("%6f ", M(r, c));
+        printf("\n");
+    }
+    printf("\n");
+}
+
+#endif
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
--- a/Show More
+++ b/Show More