488 Commits

Author SHA1 Message Date
Matt Pharr
d0e583b29c Release notes and doxygen version number bump for 1.2.1 2012-04-06 16:02:19 -07:00
Matt Pharr
c8feee238b Bump release number to 1.2.1 2012-04-06 15:30:54 -07:00
Matt Pharr
6712ecd928 Merge pull request #233 from nipunn1313/master
Ability to point build to custom version of llvm and clang
2012-04-06 15:24:12 -07:00
Nipunn Koorapati
d0c7b5d35c Merge remote-tracking branch 'upstream/master' 2012-04-06 17:58:21 -04:00
Nipunn Koorapati
802add1f97 Added to the Makefile the ability to point to a
custom installation of llvm and clang.
2012-04-06 17:54:55 -04:00
Matt Pharr
95556811fa Fix linux build 2012-04-05 20:39:39 -07:00
Matt Pharr
581472564d Print "friendly" ispc message when abort/seg fault signal is thrown.
Make crashes that happen in LLVM less inscrutable.

Issue #222.
2012-04-05 15:51:44 -07:00
Matt Pharr
c7dc8862a5 Add FAQs about various language details.
One of these finishes off issue #225.
2012-04-05 15:24:26 -07:00
Matt Pharr
4f8cf019ca Add pass to verify module before starting optimizations. 2012-04-05 08:49:39 -07:00
Matt Pharr
4c9ac7fcf1 Fix build with LLVM 2.9. 2012-04-05 08:22:40 -07:00
Matt Pharr
1dac05960a Fix build with LLVM 3.1 ToT 2012-04-05 08:17:56 -07:00
Matt Pharr
c27418da77 Add checks about references to non-lvalues.
Both ReturnStmt and DeclStmt now check the values being associated
with references to make sure that they are legal (e.g. it's illegal
to bind a varying lvalue or a compile-time constant to a reference
type).  Previously we didn't catch this and would end up hitting
assertions in LLVM when code did this stuff.

Mostly fixes issue #225 (except for adding a FAQ about what this
error message means.)
2012-04-04 05:56:22 -07:00
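(Illustrative sketch of code this check now rejects; the names and the exact
diagnostic wording are made up, not from the commit:)

    void f() {
        // A reference must be bound to an addressable lvalue; binding it to a
        // compile-time constant (or to a varying lvalue) is now a reported
        // error instead of an LLVM assertion failure.
        uniform float &r = 21;
    }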
Matt Pharr
637d076e99 Remove half/float conversion functions from AVX2 output.
(We were leaving around unused/unnecessary __half_to_float_uniform 
and the like, which in turn called out to the corresponding instruction.)
2012-04-03 12:18:38 -07:00
Matt Pharr
391678a5b3 Update function overload resolution logic.
Closer compatibility with C++: given a non-reference type, treat matching
to a non-const reference of that type as a better match than a const
reference of that type (rather than both being equal cost).

Issue #224.
2012-04-03 10:40:41 -07:00
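(A sketch of the new preference, assuming C++-style reference parameters;
hypothetical overloads, not from the repository:)

    void f(uniform float &x);        // (1) non-const reference
    void f(const uniform float &x);  // (2) const reference

    void g() {
        uniform float v = 0;
        f(v);   // now resolves to (1), as in C++; previously (1) and (2) cost the same
    }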
Matt Pharr
4cd0cf1650 Revamp handling of function types, conversion to function ptr types.
Implicit conversion to function types is now a more standard part of
the type conversion infrastructure, rather than special cases of things
like FunctionSymbolExpr immediately returning a pointer type, etc.

Improved AddressOfExpr::TypeCheck() to actually issue errors in cases
where it's illegal to take the address of an expression.

Added AddressOfExpr::GetConstant() implementation that handles taking
the address of functions.

Issue #223.
2012-04-03 10:09:07 -07:00
Matt Pharr
b813452d33 Don't issue a slew of warnings if a bogus cpu type is specified.
Issue #221.
2012-04-03 06:13:28 -07:00
Matt Pharr
eb85da81e1 Further improvements to error reporting with function types.
Issue #219.
2012-04-03 05:55:50 -07:00
Matt Pharr
920cf63201 Improve error message about incompatible function types.
When reporting that a function has illegally been overloaded only
by return type, include "task", "export", and "extern "C"", as appropriate
in the error message to make clear what the issue is.

Finishes issue #216.
2012-04-03 05:43:23 -07:00
Matt Pharr
dc09d46bf4 Don't emit type declarations for extern'ed globals in generated header files.
This actually wasn't a good idea, since we'd like ispc programs to be able to
have varying globals that they use internally among ispc code, without having
errors about varying globals when generating headers.

Issue #214.
2012-04-03 05:36:21 -07:00
Matt Pharr
05d1b06eeb Fixes to get the C++ backend closer to working again. 2012-03-30 16:56:30 -07:00
Matt Pharr
c1661eb06b Allow calling GetAs{Non}ConstType() for FunctionTypes.
It's just a no-op, though, rather than an assertion failure as before.
2012-03-30 16:56:30 -07:00
Jean-Luc Duprat
e9626a1d10 Added macro PRId64 to opt.cpp for compilation on Windows 2012-03-30 16:56:30 -07:00
Matt Pharr
560bf5ca09 Updated logic for selecting target ISA when not specified.
Now, if the user specified a CPU then we base the ISA choice on that--only
if no CPU and no target is specified do we use the CPUID-based check to
pick a vector ISA.

Improvement to fix to #205.
2012-03-30 16:36:12 -07:00
Matt Pharr
87c8a89349 Make 'export' a type qualifier, not a storage class.
In particular, this makes it legal to do "extern export foo()", among
other things.

Partially addresses issue #216.
2012-03-29 13:16:55 -07:00
Matt Pharr
255791f18e Fix to get correct variable names for extern globals that are later defined. 2012-03-29 11:50:15 -07:00
Matt Pharr
d5e3416e8e Fix bug in default argument handling introduced in 540fc6c2f3 2012-03-28 14:29:58 -07:00
Matt Pharr
5b2d43f665 Fix global variable code to correctly handle extern declarations.
When we have an "extern" global, now we no longer inadvertently define
storage for it.  Further, we now successfully do define storage when we
encounter a definition following one or more extern declarations.

Issues #215 and #217.
2012-03-28 14:15:49 -07:00
Matt Pharr
540fc6c2f3 Fix bugs with default parameter values for pointer-typed function parameters.
In particular "void foo(int * ptr = NULL)" and the like work now.

Issue #197.
2012-03-28 11:51:56 -07:00
Matt Pharr
b3c5043dcc Don't enable llvm's UnsafeFPMath option when --opt=fast-math is supplied.
This was causing functions like round() to fail on SSE2, since it has code
that does:

    x += 0x1.0p23f;
    x -= 0x1.0p23f;

which was in turn being undesirably optimized away.

Fixes issue #211.
2012-03-28 10:26:39 -07:00
Matt Pharr
d0d9aae968 Fix parser so that spaces aren't needed around "..." in foreach statements.
Issue #207.
2012-03-28 10:10:51 -07:00
Matt Pharr
3270e2bf5a Call CPUID to more reliably detect level of SSE/AVX that the host supports.
Fixes, I hope, issue #205.
2012-03-28 09:20:06 -07:00
Matt Pharr
013a3e7567 Support concatenation of adjacent string literals in the parser.
Fixes issue #208.
2012-03-28 08:52:09 -07:00
Matt Pharr
8368ba8539 Add missing checks for NULL current basic block in stmt code.
Fixes crashes if, for example, these statement types appeared after early
returns in the middle of functions.
2012-03-28 08:48:33 -07:00
Matt Pharr
ca0310e335 Merge pull request #213 from nipunn1313/master
Fixed compiler warning in expression type caster
2012-03-28 06:41:00 -07:00
Nipunn Koorapati
4690a678c1 Added parentheses around a || b && c statement in TypeCastExpr
to placate the compiler warning and make the code easier to understand.
2012-03-28 02:44:28 -04:00
Matt Pharr
f8a39402a2 Implement new, simpler function overload resolution algorithm.
We now give each conversion a cost and then find the minimum sum
of costs for all of the possible overloads.

Fixes issue #194.
2012-03-27 13:25:11 -07:00
Matt Pharr
247775d1ec Fix type conversion to allow array -> void * conversions.
Fixes issue #193.
2012-03-27 10:07:54 -07:00
Matt Pharr
6e9fea377d Type convert NULL to other pointer types for function call arguments.
Fixes issue #198.
2012-03-27 09:50:21 -07:00
Matt Pharr
ca5c65d032 Fix bugs where typecasting an expression to void would cause it to disappear.
This was obviously problematic in cases where the expression was a function
call or the like, with side effects.

Fixes issue #199.
2012-03-27 09:33:43 -07:00
Matt Pharr
f9dc621ebe Fix bug when doing pointer math with varying integer offsets.
We were incorrectly trying to type convert the varying offset to a
uniform value, which in turn led to an incorrect compile-time error.

Fixes issue #201.
2012-03-27 09:17:40 -07:00
Matt Pharr
ffe484c31e Implement simpler approach for header file struct emission.
Rather than explicitly building a DAG and doing a topological sort,
just traverse structs recursively and emit declarations for all of
their dependent structs before emitting the original struct declaration.

Not only is this simpler than the previous implementation, but it
fixes a bug where we'd hit an assert if we had a struct with multiple
contained members of another struct type.
2012-03-27 09:06:10 -07:00
Matt Pharr
62cd3418ca Add test for the bug in issue #204. 2012-03-27 09:04:45 -07:00
Matt Pharr
d8a8f3a996 For symbols that are references, return uniform ptr type as lvalue type.
Fixes issue #204.
2012-03-27 08:52:14 -07:00
Matt Pharr
0ad8dbbfc9 Fix documentation bug: atan2 arguments were reversed.
Issue #203.
2012-03-27 08:03:02 -07:00
Matt Pharr
e15a1946c6 Documentation: add ISPC_TARGET_AVX2 as a possible target #define 2012-03-27 08:02:39 -07:00
Matt Pharr
8878826661 Add non-short-circuiting and(), or(), select() to stdlib. 2012-03-26 09:37:59 -07:00
Matt Pharr
95a8b6e5e8 Fix & vs. && in logical test.
Issue #196.
2012-03-25 17:38:34 -07:00
Matt Pharr
388d0d2cfd Add #include <string.h>
Fixes build on linux and windows.  (Strangely, this didn't break the
OSX build.)

Issue #195.
2012-03-25 17:38:15 -07:00
Matt Pharr
d3a374e71c Fix malformed program crasher. 2012-03-25 13:10:23 -07:00
Matt Pharr
1da2834b1e Allow the last member of a struct to be an unsized/zero-length array.
This enables the C trick of allocating a dynamic amount of storage for
the struct in order to extend out the array to the desired length.
2012-03-25 13:10:12 -07:00
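(A sketch of the idiom this enables; hypothetical struct, and the
over-allocation would typically happen in the C/C++ host code:)

    struct Buffer {
        uniform int count;
        uniform float data[];   // unsized array allowed as the last member
    };
    // Host code can allocate sizeof(Buffer) + n * sizeof(float) bytes so that
    // data[] effectively has n elements.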
Matt Pharr
ca3100874f Add FAQ about why varying values can't be passed to exported functions. 2012-03-25 11:35:28 -07:00
Matt Pharr
117f48a331 Don't include foreach stmts in cost estimates from EstimateCost().
Because they reestablish an 'all on' mask inside their body, it doesn't
make sense to include their cost when evaluating whether it's worth
re-establishing an 'all on' mask dynamically.  (This does mean that
EstimateCost()'s return value isn't the most obvious thing, but currently
in all the cases where we need it, this is the more appropriate value to
return.)
2012-03-25 10:32:44 -07:00
Matt Pharr
89bbceefee Make sure that foreach() statements never execute with an "all off" mask. 2012-03-25 10:07:12 -07:00
Matt Pharr
7e18f0e247 Small improvement to float->half function in stdlib.
Rewrite things to be able to do a float MINPS, for slightly
better code on SSE2 (which has that but not a signed int
min).  SSE2 code now 23 instructions (vs 21 intrinsics).
2012-03-23 16:09:32 -07:00
Matt Pharr
3bb2dee275 Update float_to_half() with more efficient version from @rygorous 2012-03-22 13:36:26 -07:00
Matt Pharr
88cd5584e8 Add Debug() statement to report on if stmt cost/safety test results. 2012-03-22 13:36:26 -07:00
Matt Pharr
20044f5749 Distinguish between dereferencing pointers and references.
We now have separate Expr implementations for dereferencing pointers
and automatically dereferencing references.  This is in particular
necessary so that we can detect attempts to dereference references
with the '*' operator in programs and issue an error in that case.

Fixes issue #192.
2012-03-22 06:48:02 -07:00
Matt Pharr
10c5ba140c Much more efficient half_to_float() code, via @rygorous.
Also, switch deferred shading example to use it. (Rather than
the "fast" half to float that doesn't handle deforms, etc.)
2012-03-21 16:13:04 -07:00
Matt Pharr
316de0b880 Make various Expr::EstimateCost() implementations return 0 if operand(s) are constants.
(Assume that constant folding will make these be free.)
2012-03-21 16:12:35 -07:00
Matt Pharr
989966f81b Annotate std lib functions with __declspec safe, cost, as appropriate. 2012-03-21 16:12:32 -07:00
Matt Pharr
ccd550dc52 __declspec support for function declarations.
safe: indicates that the function can safely be called with an "all off"
execution mask.

costN: (N an integer) overrides the cost estimate for the function with
the given value.
2012-03-21 16:11:50 -07:00
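(A hedged sketch of what such an annotation might look like; the exact
qualifier spelling is inferred from the description above, not quoted from
the commit:)

    // Safe to call with an "all off" execution mask; cost estimate overridden to 1.
    __declspec(safe,cost1)
    inline uniform int twice(uniform int x) {
        return 2 * x;
    }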
Matt Pharr
ddf350839a Add ability to parse __declspec lists to parser. 2012-03-21 16:11:50 -07:00
Matt Pharr
6a7dd2787a Fix bug in check for varying parameters in exported functions.
In particular, we weren't checking to see if the pointed-to type of
pointer parameters was varying.

Fixes issue #191.
2012-03-21 10:06:53 -07:00
Matt Pharr
349ab0b9c5 Bump version number to 1.2.1dev 2012-03-20 12:46:23 -07:00
Matt Pharr
b5e6c6a2f3 update news to include paper 2012-03-20 12:05:23 -07:00
Matt Pharr
2832ea641f Release notes, bump doxygen version for 1.2.0 release 2012-03-20 11:58:39 -07:00
Matt Pharr
cb7edf2725 Set version to 1.2.0 for release builds 2012-03-20 11:13:50 -07:00
Matt Pharr
f1f1be2822 Remove twine op that caused crash on Windows, fix warning 2012-03-20 11:13:02 -07:00
Matt Pharr
7dffd65609 Add __foreach_active statement to loop over active prog. instances.
For now this has the __ prefix, as an experimental feature currently only
used in the standard library implementation.  It's probably worth making
something along these lines an official feature, but I'm not sure if this
in its current form is quite the right thing.
2012-03-20 08:46:00 -07:00
Matt Pharr
2c8a44e28b Merge pull request #189 from guanqun/fix-extern-c-error
calls to C/C++ functions should not be mangled.
2012-03-20 05:55:09 -07:00
Matt Pharr
39bb95a6ee Merge pull request #190 from guanqun/fix-output-option
fix --outfile option error
2012-03-20 05:54:29 -07:00
Lu Guanqun
da9dba80a0 fix --outfile option error 2012-03-20 09:44:49 +08:00
Lu Guanqun
12f3285f9b calls to C/C++ functions should not be mangled.
Otherwise, the linker will never find the correct function.
2012-03-20 09:27:57 +08:00
Matt Pharr
7e954e4248 Don't issue gather/scatter warnings in the 'extra' bits of foreach loops.
With AOS data, we can often coalesce the accesses into gathers for the main
part of foreach loops but only fail on the last bits where the mask is not
all on (since the coalescing code doesn't handle mixed masks, yet.) Before,
we'd report success with coalescing and then also report that gathers were needed
for the same accesses that were coalesced, which was a) confusing, and b)
didn't accurately represent what was going on for the majority of the loop
iterations.
2012-03-19 15:08:35 -07:00
Matt Pharr
d74cc6397b Fix significant bug in mask management in code generated for 'foreach'.
In particular, we 1. weren't setting the function mask to 'all on', such that
any mixed function mask would in turn apply inside the foreach loop, and 2.
weren't always setting the internal mask to 'all on' before doing any additional
masking based on the iteration variables.
2012-03-19 15:06:35 -07:00
Matt Pharr
777343331e Print numeric version number with --version. 2012-03-19 14:41:25 -07:00
Matt Pharr
a062653743 Add patterns to better-match code generated when accessing SOA data.
In particular, LLVMVectorIsLinear() and LLVMVectorValuesAllEqual() are able
to reason a bit about the effects of the shifts and the ANDs that are
generated from SOA indexing calculations, so that they can detect more cases
where a linear sequence of locations are in fact being accessed in
the presence of SOA data.
2012-03-19 12:04:39 -07:00
Matt Pharr
57af0eb64f Still do the gather/scatter -> load store pass even if leaving 'pseudo' mem opts unchanged. 2012-03-19 12:04:38 -07:00
Matt Pharr
60aae16752 Move check for linear vector to LLVMVectorIsLinear() function. 2012-03-19 11:57:04 -07:00
Matt Pharr
e264d95019 LLVMVectorValuesAllEqual() improvements.
Clean up the API, so the caller doesn't have to pass in a vector so
the function can track PHI nodes (do that internally instead.)

Handle casts in lValuesAreEqual().
2012-03-19 11:54:18 -07:00
Matt Pharr
0664f5a724 Add LLVMExtractVectorInts() function, use it in the opt code. 2012-03-19 11:48:38 -07:00
Matt Pharr
17c6a19527 Add LLVMExtractFirstVectorElement() function (and use it).
For cases where it turns out that we just need the first element of
a vector (e.g. because we've determined that all of the values are
equal), it's often more efficient to only compute that one value
with scalar operations than to compute the whole vector's worth and
then just use one value.  This function tries to rewrite a vector
computation to the scalar equivalent, if possible.

(Partial work-around to http://llvm.org/bugs/show_bug.cgi?id=11775.)

Note that sometimes this is the wrong thing to do--if we need the entire
vector value for other purposes, for example.
2012-03-19 11:48:33 -07:00
Matt Pharr
cbc8b8259b Use LLVMIntAsType() in opt code instead of locally-defined equivalent. 2012-03-19 11:36:00 -07:00
Matt Pharr
1067a2e4be Add LLVMShuffleVectors() and LLVMConcatVectors() functions.
These were local functions in opt.cpp that are now public via the
llvmutil.* files.
2012-03-19 11:34:52 -07:00
Matt Pharr
74a031a759 Small improvements to debug info printing in opt.cpp 2012-03-19 11:32:08 -07:00
Matt Pharr
ee437193fb Add LLVMDumpValue() utility routine 2012-03-19 11:31:27 -07:00
Matt Pharr
436c53037e Fix assertion in FunctionEmitContext::storeUniformToSOA() 2012-03-19 11:29:14 -07:00
Matt Pharr
f55ba9d3cb Remove (highly verbose) Debug() call for type conversions. 2012-03-19 11:28:55 -07:00
Matt Pharr
8adb99b768 Improve source locations reported with warnings. 2012-03-19 11:28:34 -07:00
Matt Pharr
13c42412d2 Issue perf. warning if SOA width narrower than gang size is used. 2012-03-19 11:28:16 -07:00
Matt Pharr
75507d8b35 Remove error message if old 'reference' keyword is used. 2012-03-19 11:27:53 -07:00
Matt Pharr
ddfe4932ac Fix parsing of 'launch' so that angle brackets can be removed.
Issue #6.
2012-03-19 11:27:32 -07:00
Matt Pharr
28ac016928 Fix bugs in checks for varying parameters in exported functions.
In short, we inadvertently weren't checking whether pointers themselves
were varying, which in turn led to an assertion later if an exported
function did have a varying parameter.

Issue #187.
2012-03-15 07:20:36 -05:00
Matt Pharr
9ec8e5a275 Fix compile warnings on Linux 2012-03-12 13:12:23 -07:00
Matt Pharr
a473046058 Once again fix for LLVM 3.1 TOT API changes 2012-03-11 15:04:26 -07:00
Matt Pharr
a69b7a5a01 Fix build with LLVM 3.1 TOT 2012-03-10 13:06:53 -08:00
Matt Pharr
640918bcc0 Call fclose() in deferred example. (Andy Zhang). 2012-03-07 08:50:10 -08:00
Matt Pharr
f39fbdb3fc Add various new functions to "internal" functions list.
Building with multiple compilation targets in a single binary was
broken due to multiple symbol definitions.
2012-03-05 16:41:20 -08:00
Matt Pharr
50d4d81062 Add file in docs/ for news page on website 2012-03-05 16:10:20 -08:00
Matt Pharr
3b95452481 Add memcpy(), memmove() and memset() to the standard library.
Issue #183.
2012-03-05 16:09:00 -08:00
Matt Pharr
c152ae3c32 Add single-precision asin() and acos() to stdlib.
Issue #184.
2012-03-05 13:32:13 -08:00
Matt Pharr
f6cbaa78e8 Update stdlib documentation to match recent pointed-to default variability changes 2012-03-05 13:32:12 -08:00
Matt Pharr
7adb250b59 Added tests and documentation for soa<> rate qualifier. 2012-03-05 09:58:10 -08:00
Matt Pharr
db5db5aefd Add native support for (AO)SOA data layout.
There's now a SOA variability class (in addition to uniform,
varying, and unbound variability); the SOA factor must be a
positive power of 2.

When applied to a type, the leaf elements of the type (i.e.
atomic types, pointer types, and enum types) are widened out
into arrays of the given SOA factor.  For example, given

struct Point { float x, y, z; };

Then "soa<8> Point" has a memory layout of "float x[8], y[8],
z[8]".

Furthermore, array indexing syntax has been augmented so that
when indexing into arrays of SOA-variability data, the two-stage
indexing (first into the array of soa<> elements and then into
the leaf arrays of SOA data) is performed automatically.
2012-03-05 09:58:10 -08:00
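(A sketch of the automatic two-stage indexing described above; hypothetical
names, and the global-array declaration form is an assumption:)

    struct Point { float x, y, z; };

    soa<8> Point pts[64];   // stored as repeated groups of float x[8], y[8], z[8]

    float getX(int i) {
        // Indexing into the soa<8> groups and then into the leaf arrays is
        // generated automatically from the ordinary subscript syntax.
        return pts[i].x;
    }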
Matt Pharr
8fdf84de04 Disable debugging printing code. 2012-03-05 09:58:09 -08:00
Matt Pharr
ff5cbe80d1 Add more files to .gitignore 2012-03-05 09:58:09 -08:00
Matt Pharr
e013e0a374 Handle extract instructions in the lGetBasePtrAndOffsets() pattern matching code. 2012-03-05 09:58:09 -08:00
Matt Pharr
b7df312ca7 Small improvements to error location reporting, assertions in expr.cpp 2012-03-05 09:58:09 -08:00
Matt Pharr
ce82c3c0ae Return from function after storing initializer value. 2012-03-05 09:58:09 -08:00
Matt Pharr
2f958cfbda Fix cases where malformed program could cause crash. 2012-03-05 09:58:09 -08:00
Matt Pharr
8ef41dfd97 Represent variability with small helper class rather than an enum.
This provides part of the basis for representing SOA width in terms
of variability, but there should be no functional changes in this
checkin.
2012-03-05 09:58:09 -08:00
Matt Pharr
3082ea4765 Require Type::Equal() for all type equality comparisons.
Previously, we uniqued AtomicTypes, so that they could be compared
by pointer equality, but with forthcoming SOA variability changes,
this would become too unwieldy (lacking a more general / ubiquitous
type uniquing implementation.)
2012-03-05 09:58:09 -08:00
Matt Pharr
e482d29951 Add LLVM{U}IntAsType() utility routine 2012-03-05 09:58:09 -08:00
Matt Pharr
ff48dd7bfb Remove unused SOAArrayType class and Type::GetSOAType() methods. 2012-03-05 09:58:09 -08:00
Matt Pharr
7bf9c11822 Add uniform variants of RNG functions to stdlib 2012-03-05 09:56:30 -08:00
Matt Pharr
f7937f1e4b Fix build with LLVM2.9/3.0 2012-03-03 10:30:56 -08:00
Matt Pharr
0115eeabfe Update deferred example to take advantage of new pointer variability rules. 2012-02-29 14:27:53 -08:00
Matt Pharr
4b9c3ec0da Fix bug in StructType::GetElementType().
We were only resolving unbound variability for the top-level type,
which isn't enough if we have e.g. an unbound-variability pointer
pointing to some type with unbound variability.
2012-02-29 14:27:53 -08:00
Matt Pharr
55b81e35a7 Modify rules for default variability of pointed-to types.
Now, the pointed-to type is always uniform by default (if an explicit
rate qualifier isn't provided).  This rule is easier to remember and
seems to work well in more cases than the previous rule from 6d7ff7eba2.
2012-02-29 14:27:53 -08:00
Matt Pharr
2a1c7f2d47 Fix bug with indexing into varying pointer w/uniform index.
Issue #182.
2012-02-25 10:19:21 -08:00
Matt Pharr
8603f9838f Issue an error if "uniform" or "varying" qualifiers are applied to void types.
Issue #179.
2012-02-21 12:26:42 -08:00
Matt Pharr
95224f3f11 Improve detection of cases where 32-bit gather/scatter can be used.
Previously, we weren't noticing that an <n x i64> zero vector could
be represented as an <n x i32> without error.
2012-02-21 12:13:25 -08:00
Matt Pharr
f81acbfe80 Implement unbound variability for struct types.
Now, if a struct member has an explicit 'uniform' or 'varying'
qualifier, then that member has that variability, regardless of
the struct's variability.  Members without
'uniform' or 'varying' have unbound variability, and in turn
inherit the variability of the struct.

As a result of this, now structs can properly be 'varying' by default,
just like all the other types, while still having sensible semantics.
2012-02-21 10:28:31 -08:00
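(A small sketch of the inheritance rule; hypothetical struct chosen for
illustration:)

    struct Sample {
        float value;      // no qualifier: unbound, inherits the struct's variability
        uniform int id;   // explicitly uniform, whatever the struct's variability is
    };

    varying Sample s;     // s.value is varying; s.id stays uniform
    uniform Sample u;     // u.value and u.id are both uniform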
Matt Pharr
6d7ff7eba2 Update defaults for variability of pointed-to types.
Now, if rate qualifiers aren't used to specify otherwise, varying
pointers point to uniform types by default.  As before, uniform
pointers point to varying types by default.

   float *foo;  // varying pointer to uniform float
   float * uniform foo;  // uniform pointer to varying float

These defaults seem to require the least amount of explicit
uniform/varying qualifiers for most common cases, though TBD if it
would be easier to have a single rule that e.g. the pointed-to type
is always uniform by default.
2012-02-21 06:27:34 -08:00
Matt Pharr
ad429db7e8 Generate more efficient code for variable initializers.
If the initializer is a compile-time constant (or at least a part of it
is), then store the constant value in a module-local constant global
value and then memcpy the value into the variable.  This, in turn,
turns into much better assembly in the end.

Issue #176.
2012-02-14 13:51:23 -08:00
Matt Pharr
4c07abbaf4 Support returning NULL pointer values from ConstExpr::GetConstant() 2012-02-14 13:49:18 -08:00
Matt Pharr
e3c0551129 Handle uniform short-vector types in ExprList::GetConstant() 2012-02-14 13:48:43 -08:00
Matt Pharr
8971baa42b Fix silly bug in ConstExpr::GetConstant() with enum types.
(They would be incorrectly matched as int8 types.)
2012-02-14 13:48:10 -08:00
Matt Pharr
317a1f51f7 Allow fewer initializer values in initializer expr lists than expected.
We now match C's behavior, where if we have an initializer list with
too-few values for the underlying type, any additional elements are
initialized to zero.

Fixes issue #123.
2012-02-14 13:47:11 -08:00
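(For example, matching the C behavior described above:)

    uniform int a[4] = { 1, 2 };   // a[2] and a[3] are zero-initialized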
Matt Pharr
c63d139482 Add FunctionEmitContext::MemcpyInst() 2012-02-14 13:43:59 -08:00
Matt Pharr
9e682362e9 Fix bug in ArrayType::SizeUnsizedArrays().
If given an initializer list with too many elements for the actual array
size, in some cases we would incorrectly resize the explicitly sized array
to be the size implied by the initializer list.
2012-02-14 13:43:38 -08:00
Matt Pharr
56ec939692 Add perfbench to examples.sln for Windows 2012-02-14 10:07:08 -08:00
Matt Pharr
a86b942730 Fix cases in coalesce opt where offsets would be truncated to 32 bits 2012-02-14 10:05:07 -08:00
Matt Pharr
52eb4c6014 Fix warnings with Windows build 2012-02-14 10:01:45 -08:00
Matt Pharr
f4adbbf90c Merge a number of cbackend changes from the LLVM dev tree.
This fixes a number of failing tests with LLVM 3.1svn when
using the generic targets.

Issue #175.
2012-02-13 16:52:38 -08:00
Matt Pharr
cc86e4a7d2 Disable coalescing optimizations when using generic target.
The main issue is that they end up generating a number of smaller
vector ops (e.g. 4-wide and 8-wide on the 16-wide generic target),
which the examples/intrinsics implementations don't currently
support.

This fixes a number of failing tests for now; it may be worth
generalizing the stuff in examples/intrinsics at some point,
since as a general principle, e.g. if generating LLVM IR output,
the coalescing optimizations are still desirable.

Issue #175.
2012-02-13 16:52:01 -08:00
Matt Pharr
e864447e4a Fix silly bug in vector scale extraction optimization.
(Introduced in f20a2d2ee.  How did this ever pass tests?)
2012-02-13 12:06:45 -08:00
Matt Pharr
73bf552cd6 Add support for coalescing memory accesses from gathers.
There are two related optimizations that happen now.  (These
currently only apply for gathers where the mask is known to be
all on, and to gathers that are accessing 32-bit sized elements,
but both of these may be generalized in the future.)

First, for any single gather, we are now more flexible in mapping it
to individual memory operations.  Previously, we would only either map
it to a general gather (one scalar load per SIMD lane), or an 
unaligned vector load (if the program instances could be determined
to be accessing a sequential set of locations in memory.)

Now, we are able to break gathers into scalar, 2-wide (i.e. 64-bit),
4-wide, or 8-wide loads.  Further, we now generate code that shuffles
these loads around.  Doing fewer, larger loads in this manner, when
possible, can be more efficient.

Second, we can coalesce memory accesses across multiple gathers. If 
we have a series of gathers without any memory writes in the middle,
then we try to analyze their reads collectively and choose an efficient
set of loads for them.  Not only does this help if different gathers
reuse values from the same location in memory, but it's specifically
helpful when data with AOS layout is being accessed; in this case,
we're often able to generate wide vector loads and appropriate shuffles
automatically.
2012-02-10 13:10:39 -08:00
Matt Pharr
f20a2d2ee9 Generalize code to extract scales by 2/4/8 from addressing calculations.
Now, if we have a scale by 16, say, we extract out the scalar scale
of 8 and leave an explicit scale by 2.
2012-02-10 12:35:44 -08:00
Matt Pharr
0c25bc063c Add lGEPInst() utility routine to opt.cpp.
Deal with the messiness of LLVM API changes when creating
these in a single place.
2012-02-10 12:32:15 -08:00
Matt Pharr
db72781d2a Fix C++ backend to not assert with LLVM 3.1 svn builds. 2012-02-10 12:30:31 -08:00
Matt Pharr
0c8ad09040 Fix placement of ParserInit() call
This makes it possible to use fuzz testing even without --nostdlib!
2012-02-10 12:29:57 -08:00
Matt Pharr
49880ab761 Constant fold more cases in SelectExpr::Optimize()
Specifically, if both of the expressions are compile-time constants
and the condition is a varying compile-time constant (even if not 
all true or all false), then we can assemble a compile-time constant
result.
2012-02-10 12:28:54 -08:00
Matt Pharr
fe2d9aa600 Add perfbench to examples: a few small microbenchmarks. 2012-02-10 12:27:13 -08:00
Matt Pharr
1dead425e4 Don't indent *too* much on continued lines with warnings/errors. 2012-02-10 12:26:35 -08:00
Matt Pharr
adb1e47a59 Add FAQ about how to cross-inline ispc and C/C++ code. 2012-02-10 12:26:19 -08:00
Matt Pharr
ffba8580c1 Make sure that non-zero exit code is returned when input file not found.
Fixes issue #174.
2012-02-08 19:53:05 -08:00
Alex Reece
ea18427d29 Remove UnwindInst
Code no longer builds against head of LLVM branch after revision 149906
removed the unwind instruction.
2012-02-07 15:46:22 -08:00
Matt Pharr
f3089df086 Improve error handling and reporting in the parser.
Add a number of additional error cases in the grammar.

Enable bison's extended error reporting, to get better messages about the
context of errors and the expected (but not found) tokens at errors.

Improve the printing of these by providing an implementation of yytnamerr
that rewrites things like "TOKEN_MUL_ASSIGN" to "*=" in error messages.

Print the source location (using Error()) when yyerror() is called; wiring
this up seems to require no longer building a 'pure parser' but having
yylloc as a global, which in turn led to having to update all of the uses of
it (which previously accessed it as a pointer).

Updated a number of tests_errors for the resulting changes in error text.
2012-02-07 11:13:32 -08:00
Matt Pharr
157e7c97ae Fix a variety of cases in the parser that could crash with malformed programs. 2012-02-07 11:08:00 -08:00
Matt Pharr
bb8e13e3c9 Add support for -I command-line argument to specify #include search directories. 2012-02-07 08:39:01 -08:00
Matt Pharr
5b4673e8eb Fix build with LLVM 2.9. 2012-02-07 08:37:13 -08:00
Matt Pharr
5b9de8cc07 Fix test to account for updated error message. 2012-02-07 08:36:56 -08:00
Matt Pharr
33ea934c8f Fix over-aggressive check in DereferenceExpr::TypeCheck()
(Reference types are allowed as well.)
2012-02-07 08:18:33 -08:00
Matt Pharr
6b3e14b0a4 Add command-line option to enable debugging output from parser. 2012-02-06 15:35:43 -08:00
Matt Pharr
098ceb5567 Issue error on attempted type convert from/to function type. 2012-02-06 15:35:43 -08:00
Matt Pharr
8e2b0632e8 Issue an error if an array of references is declared.
(More malformed program fixes.)
2012-02-06 15:35:43 -08:00
Matt Pharr
420d373d89 Move assert so that an error is issued for "break" outside of loops. 2012-02-06 15:35:43 -08:00
Matt Pharr
a59fd7eeb3 Fix a missing return value in the parser. 2012-02-06 15:35:43 -08:00
Matt Pharr
ee91fa1228 Make sure the program doesn't have a dereference of a non-pointer type. 2012-02-06 15:35:43 -08:00
Matt Pharr
a2b5ce0172 Add --help-dev option, only print developer options when it is used. 2012-02-06 15:35:43 -08:00
Matt Pharr
3efbc71a01 Add fuzz testing of input programs.
When the --fuzz-test command-line option is given, the input program
will be randomly perturbed by the lexer in an effort to trigger
assertions or crashes in the compiler (neither of which should ever
happen, even for malformed programs.)
2012-02-06 15:34:47 -08:00
Matt Pharr
b7c5af7e64 Prohibit returning functions from functions.
(Fix malformed program crasher)
2012-02-06 14:46:03 -08:00
Matt Pharr
f939015b97 Default to int32 for declarations without specified types.
(e.g. "uniform foo" == "uniform int32 foo")
2012-02-06 14:46:03 -08:00
Matt Pharr
a9ed71f553 Bug fixes to avoid NULL pointer derefs with malformed programs. 2012-02-06 14:45:58 -08:00
Matt Pharr
96a429694f 80 column fixes 2012-02-06 14:44:55 -08:00
Matt Pharr
fddc5e022e Fix typo in IfStmt::EstimateCost() 2012-02-06 14:44:54 -08:00
Matt Pharr
2236d53def Issue error if &=, |=, ^=, <<=, or >>= used with floats. 2012-02-06 14:44:54 -08:00
Matt Pharr
4e018d0a20 Improve tracking of source position in the presence of /* */ comments.
Don't let the preprocessor remove comments anymore, so that the rules
in lex.ll can handle them.  Fix lCComment() to update the source
position as it eats characters in comments.
2012-02-06 14:44:54 -08:00
Matt Pharr
977b983771 Issue error on "void" typed variable, function parameter, or struct member. 2012-02-06 14:44:48 -08:00
Matt Pharr
fa7a7fe23e Fix error handling in type code. 2012-02-06 12:39:14 -08:00
Matt Pharr
724a843bbd Add --quiet option to suppress all diagnostic output 2012-02-06 12:39:09 -08:00
Matt Pharr
a9ec745275 Release notes, bump doxygen release number for 1.1.4 2012-02-04 15:38:17 -08:00
Matt Pharr
c2ecc15b93 Add missing "varying/varying" atomic_compare_exchange_global() functions. 2012-02-03 13:19:15 -08:00
Matt Pharr
83c8650b36 Add support for "local" atomics.
Also updated aobench example to use them, which in turn allows using
foreach() and thence a much cleaner implementation.

Issue #58.
2012-02-03 13:15:21 -08:00
Matt Pharr
89cb809922 Short-circuit evaluation of ? : operator for varying tests.
? : now short-circuits evaluation of the expressions following
the boolean test for varying test types.  (It already did this
for uniform tests).

Issue #169.
2012-02-01 11:03:58 -08:00
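(A sketch of the pattern this enables for varying tests; hypothetical
function and parameters:)

    float safeLoad(uniform float * uniform array, uniform int count, int i) {
        // Lanes where (i < count) is false no longer evaluate array[i], so no
        // out-of-range access happens for those program instances.
        return (i < count) ? array[i] : 0.;
    }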
Matt Pharr
fdb4eaf437 Fix bug in &&/|| short-circuiting.
Use full mask, not internal mask when checking "any lanes running"
before evaluating expressions.

Added some more tests to try to cover this case.
2012-02-01 08:17:25 -08:00
Matt Pharr
0432f97555 Fix build with LLVM 3.1 TOT 2012-01-31 14:10:07 -08:00
Matt Pharr
8d1631b714 Constant fold in SelectExpr::Optimize().
Resolves issue #170.
2012-01-31 12:22:11 -08:00
Matt Pharr
dac091552d Fix errors in tests for scalar target.
Issue #167.
2012-01-31 11:57:12 -08:00
Matt Pharr
ea027a95a8 Fix various places in deferred shading example that assumed programCount >= 4.
This gets deferred closer to working with the scalar target, but there are still
some issues.  (Partially in gamma correction / final clamping, it seems.)

This fix causes a ~0.5% performance degradation with e.g. the AVX target, 
though it's not clear that it's worth having a separate code path in order to
not lose this small amount of perf.

(Partially addresses issue #167)
2012-01-31 11:46:33 -08:00
Matt Pharr
f73abb05a7 Fix bug in handling scatters where all instances go to the same location.
Previously, we'd pick one lane and generate a regular store for its value.
This was the wrong thing to do, since we also should have been checking
that the mask was on (for the lane that was chosen).  This bug didn't
become evident until the scalar target was added, since many stores fall
into this case with that target.

Now, we just leave those as regular scatters.

Fixes most of the failing tests for the scalar target listed in issue #167.
2012-01-31 11:06:14 -08:00
Matt Pharr
d71c49494f Missed pass that should be skipped when pseudo memory ops are supposed to be left unchanged. 2012-01-31 11:02:23 -08:00
Matt Pharr
25665f0841 Implement NullPointerExpr::GetConstant()
Also reworked TypeCastExpr::GetConstant() to just forward the request along
and moved the code that was previously there to handle uniform->varying
smears of function pointers to FunctionSymbolExpr::GetConstant().

Fixes issue #168.
2012-01-31 09:37:39 -08:00
Matt Pharr
1eec27f890 Scalar target fixes.
Don't issue warnings about all instances writing to the same location if
there is only one program instance in the gang.

Be sure to report that all values are equal in one-element vectors in
LLVMVectorValuesAllEqual().

Issue #166.
2012-01-31 08:52:11 -08:00
Matt Pharr
950f86200b Fix examples/tasksys.cpp to compile with 32-bit targets.
(Change a cmpxchgd to cmpxchl.)  Note that a number of the examples
still don't work with 32-bit compilation, why still TBD.
2012-01-30 15:03:54 -08:00
Matt Pharr
e19f4931d1 Short-circuit evaluation of && and || operators.
We now follow C's approach of evaluating these: we don't evaluate
the second expression in the operator if the value of the first one
determines the overall result.  Thus, these can now be used 
idiomatically like (index < limit && array[index] > 0) and such.

For varying expressions, the mask is set appropriately when evaluating
the second expression.

(For expressions that can be determined to be both simple and safe to
evaluate with the mask all off, we still evaluate both sides and compute
the logical op result directly, which saves a number of branches and tests.
However, the effect of this should never be visible to the programmer.)

Issue #4.
2012-01-30 05:58:41 -08:00
Matt Pharr
0575b1f38d Update run_tests and examples makefile for scalar target.
Fixed a number of tests that didn't handle the programCount == 1
case correctly.
2012-01-29 16:22:25 -08:00
Matt Pharr
f6cd01f7cf Windows build support for scalar target. 2012-01-29 13:48:01 -08:00
Matt Pharr
f2fbc168af Scalar target builtins bugfixes.
Typo in __max_varying_double.
Add declarations for half functions.
Use the gen_scatter macro to get the scatter functions.
2012-01-29 13:47:44 -08:00
Matt Pharr
b50f6f1730 Fix RNG seed code in stdlib for scalar target. 2012-01-29 13:46:57 -08:00
Matt Pharr
f8a7120d9c Detect division by 0 during constant folding and issue a sensible error. 2012-01-29 13:46:38 -08:00
Matt Pharr
20dbf59420 Don't lose source position when returning values of constant symbols. 2012-01-29 13:46:17 -08:00
Gabe Weisz
c67a286aa6 Add support for 1-wide scalar target.
Issue #40.
2012-01-29 06:36:07 -08:00
Matt Pharr
c96fef6bc8 Fix silly error in generic-16.h example C++ bindings. 2012-01-27 17:04:57 -08:00
Matt Pharr
bba02f87ea Improve implementations of unsigned <=, >= in sse4 intrinsics file. 2012-01-27 16:49:41 -08:00
Matt Pharr
12dc3f5c28 Fixes to c++ backend for new and delete
Don't include declarations of malloc/free in the generated code (get
the standard ones from system headers instead).

Add a cast to (uint8_t *) before calls to malloc, which C++ requires,
since proper malloc returns a void *.
2012-01-27 16:49:09 -08:00
Matt Pharr
0f01a5dcbe Handle undef values in LLVMVectorValuesAllEqual() 2012-01-27 16:48:14 -08:00
Matt Pharr
664dc3bdda Add support for "new" and "delete" to the language.
Issue #139.
2012-01-27 14:47:06 -08:00
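(A rough sketch of the new syntax; the qualifier placement follows later ispc
documentation and may not match this initial commit exactly:)

    void scratch(uniform int count) {
        uniform float * uniform buf = uniform new uniform float[count];
        // ... use buf ...
        delete buf;
    }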
Matt Pharr
bdba3cd97d Bugfix: add per-lane offsets when accessing varying data through a pointer! 2012-01-27 14:44:52 -08:00
Matt Pharr
d9c0f9315a Fix generic targets: half conversion functions weren't declared.
(Broken by 1867b5b31).
2012-01-27 14:44:43 -08:00
Matt Pharr
b7f17d435f Fix crash in gather/scatter optimization pass. 2012-01-27 14:44:35 -08:00
Matt Pharr
37cdc18639 Issue error instead of crashing given attempted function call through non-function.
Fixes issue #163.
2012-01-27 10:01:06 -08:00
Matt Pharr
5893a9c49d Remove incorrect assert 2012-01-27 09:14:45 -08:00
Matt Pharr
24f58fa16a Update per_lane macro to not use ID for lane number in macro expansion
This was leading to unintended consequences if WIDTH was used in macro code,
which was undesirable.
2012-01-27 09:12:13 -08:00
Matt Pharr
56ffc78fa4 Require semicolons after sync, assert, and print statements.
(Silly parser oversight.)
2012-01-27 09:12:13 -08:00
Matt Pharr
061e68bc77 Fix compiler crash from malformed program. 2012-01-27 09:12:13 -08:00
Matt Pharr
177e6312b4 Fix build with LLVM ToT (ConstantVector::getVectorElements() is gone now). 2012-01-27 09:07:58 -08:00
Matt Pharr
1acf4032c2 Merge branch 'master' of https://github.com/jduprat/ispc 2012-01-26 14:18:25 -08:00
Jean-Luc Duprat
9c5444698e run_tests.py fixes:
- Python 3 fixes (can't use print)
 - Fixed for running tests on Windows
2012-01-26 13:39:54 -08:00
Matt Pharr
65f3252760 Various fixes to test running script for Windows.
Also, removed the --valgrind option and replaced it with a more
general --wrap-exe option, which can be used both for running
Valgrind and SDE.
2012-01-26 10:56:29 -08:00
Matt Pharr
e612abe4ba Fix parsing of 64-bit integer constants on Windows.
(i.e., use the 64-bit unsigned integer parsing function,
not the 64-bit signed one.)

Fixes bug #68.
2012-01-26 10:56:28 -08:00
Jean-Luc Duprat
34352e4e0e beefed up stdint.h on Windows so that ispc 1.1.3 compiles 2012-01-25 15:04:19 -08:00
Matt Pharr
1867b5b317 Use native float/half conversion instructions with the AVX2 target. 2012-01-24 15:33:38 -08:00
Matt Pharr
a5b7fca7e0 Extract constant offsets from gather/scatter base+offsets offset vectors.
When we're able to turn a general gather/scatter into the "base + offsets"
form, we now try to extract out any constant components of the offsets and
then pass them as a separate parameter to the gather/scatter function
implementation.

We then in turn carefully emit code for the addressing calculation so that
these constant offsets match LLVM's patterns to detect this case, such that
we get the constant offsets directly encoded in the instruction's addressing
calculation in many cases, saving arithmetic instructions to do these
calculations.

Improves performance of stencil by ~15%.  Other workloads unchanged.
2012-01-24 14:41:15 -08:00
Matt Pharr
7be2c399b1 Rename various optimization passes to have more descriptive names.
No functionality change.
2012-01-23 14:49:48 -08:00
Matt Pharr
d6337b3b22 Code cleanups in opt.cpp; no functional change 2012-01-23 14:36:32 -08:00
Matt Pharr
d2f8b0ace5 Add __clock to list of symbols to make internal from builtins. 2012-01-23 06:19:16 -08:00
Matt Pharr
d805e8b183 Add clock() function to standard library.
Also corrected the declaration of num_cores() to return a
uniform value.
2012-01-22 13:05:27 -08:00
Matt Pharr
1f0f2ec05f Include AVX2 in supported ISAs 2012-01-22 07:05:47 -08:00
Matt Pharr
91ac3b9d7c Back out WIP changes to opt.cpp that were inadvertently checked in. 2012-01-21 07:34:53 -08:00
Matt Pharr
d65bf2eb2f Doxygen number bump and release notes for 1.1.3 2012-01-20 17:04:16 -08:00
Matt Pharr
1bba9d4307 Improve atomic_swap_global() to take advantage of associativity.
We now do a single atomic hardware swap and then effectively do 
swaps between the running program instances such that the result
is the same as if they had happened to run a particular ordering
of hardware swaps themselves.

Also cleaned up __atomic_swap_uniform_* built-in implementations
to not take the mask, which they weren't using anyway.

Finishes Issue #56.
2012-01-20 10:37:33 -08:00
Matt Pharr
4388338dad Fix performance regression introduced in be0c77d556
Effectively, the patterns that detected when a gather or scatter in
base+offsets form had offsets that were actually a multiple of 2/4/8
were no longer working.

This change not only fixes this, but also expands the set of
patterns that are matched by this.  For example, given offsets of
the form 4*v1 + 16*v2, it identifies a scale of 4 and new offsets
of v1 + 4*v2.

This fix makes the volume renderer run 1.19x faster, and noise 1.54x
faster.
2012-01-19 17:57:59 -08:00
Matt Pharr
2fb59c90cf Fix C++ backend bug introduced in d14a2de168.
(This was causing a number of tests to fail with the generic
targets.)
2012-01-19 11:35:02 -07:00
Matt Pharr
68f6ea8def For << and >> with the C++ backend, detect when all instances are shifting by the same amount.
In this case, we now emit calls to potentially-specialized functions for the
left/right shifts that take a single integer value for the shift amount.  These
in turn can be matched to the corresponding intrinsics for the SSE target.

Issue #145.
2012-01-19 10:04:32 -07:00
Matt Pharr
3f89295d10 Update RNG code in stdlib to use -> operator where appropriate. 2012-01-19 10:02:47 -07:00
Matt Pharr
748b292e77 Improve code for uniform switches with a 'break' under varying control flow.
Previously, when we had a switch statement with a uniform switch condition
but a 'break' statement that was under varying control flow inside the
switch, we'd promote the switch condition to be varying so that the
break would work correctly.

Now, we leave the condition as uniform and are thus able to use the
more-efficient LLVM switch instruction in this case.

Issue #156.
2012-01-19 08:41:19 -07:00
Matt Pharr
6451c3d99d Fix bug with code for initializers for static arrays in generated C++ code.
(This was preventing aobench from compiling successfully with the generic
target.)
2012-01-18 16:55:09 -07:00
Matt Pharr
d14a2de168 Fix generic code emission when building with LLVM3.0/2.9.
Specifically, don't use vector select for masked store blend there,
but emit calls to undefined __masked_store_blend_*() functions.

Added implementations of these functions to the sse4.h and generic-16.h
in examples/instrinsics.  (Calls to these will never be generated with
LLVM 3.1).
2012-01-17 23:42:22 -07:00
Matt Pharr
642150095d Include LLVM version used to build in version info printed out. 2012-01-17 23:42:22 -07:00
Matt Pharr
3bf3ac7922 Be more conservative about using blending in place of masked store.
More specifically, we do a proper masked store (rather than a load-
blend-store) unless we can determine that we're accessing a stack-allocated
"varying" variable.  This fixes a number of nefarious bugs where given
code like:

    uniform float a[21];
    foreach (i = 0 ... 21)
        a[i] = 0;

We'd use a blend and in turn read past the end of a[] in the last
iteration.

Also made slight changes to inlining in aobench; this keeps compiles
to ~5s, versus ~45s without them (with this change).

Fixes issue #160.
2012-01-17 23:42:22 -07:00
Matt Pharr
c6d1cebad4 Update masked_load/store implementations for generic targets to take void *s
(Fixes compile errors when we try to actually use these!)
2012-01-17 23:42:22 -07:00
Matt Pharr
08189ce08c Update "inline" qualifiers in a few examples. 2012-01-17 23:42:22 -07:00
Matt Pharr
7013d7d52f Small documentation updates and cleanups 2012-01-17 23:42:21 -07:00
Matt Pharr
7045b76f84 Improvements to code generation for "foreach"
Specialize the code for the innermost loop to not do any masking
computations for the innermost dimension for the iterations where
we are certainly working on a full vector's worth of data.

This fix improves performance/code quality of "foreach" such that
it's essentially the same as the equivalent "for" loop.

Fixes issue #151.
2012-01-17 11:34:00 -08:00
Matt Pharr
58a0b4a20d Add separate set of builtins for AVX2.
(i.e., stop just reusing the ones for AVX1).

For now the only difference is that the int/uint min/max
functions call the new intrinsic for that.  Once gather is
available from LLVM, that will go here as well.
2012-01-13 14:40:01 -08:00
Matt Pharr
0f8eee9809 Fix cases in optimization code to not inadvertently match calls to func ptrs.
If we call a function pointer, CallInst::getCalledFunction() returns NULL; we
need to be careful about this case when we're matching various function calls
in optimization passes.

(Fixes a crash.)
2012-01-12 10:33:06 -08:00
Matt Pharr
0740299860 Fix switch test 2012-01-12 09:45:31 -08:00
Matt Pharr
652215861e Update dynamic target dispatch code to support AVX2. 2012-01-12 08:37:18 -08:00
Matt Pharr
602209e5a8 Tiny updates to documentation, comment for switch stuff. 2012-01-12 05:55:42 -08:00
Matt Pharr
b60f8b4f70 Fix merge conflicts 2012-01-11 17:13:51 -08:00
Matt Pharr
b67446d998 Add support for "switch" statements.
Switches with both uniform and varying "switch" expressions are
supported.  Switch statements with varying expressions and very
large numbers of labels may not perform well; some issues to be
filed shortly will track opportunities for improving these.
2012-01-11 09:16:31 -08:00
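(A minimal example; per the description above, the switch expression may be
either uniform or varying:)

    int classify(int x) {
        int result;
        switch (x) {
        case 0:
            result = -1;
            break;
        case 1:
            result = 1;
            break;
        default:
            result = 0;
        }
        return result;
    }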
Matt Pharr
9670ab0887 Add missing cases to watch out for in lCheckAllOffSafety()
Previously, we weren't checking for member expressions that dereferenced
a pointer or pointer dereference expressions--only array indexing!
2012-01-11 09:16:31 -08:00
Matt Pharr
0223bb85ee Fix bug in StmtList::EmitCode()
Previously, we would return immediately if the current basic block
was NULL; however, this is the wrong thing to do in that goto labels
and case/default labels in switch statements will establish a new
current basic block even if the current one is NULL.
2012-01-11 09:14:39 -08:00
Jean-Luc Duprat
fd81255db1 Removed mutex support for OSX 10.5
Allow running from the build directory even if it is not on the path
Properly decode subprocess stdout/stderr as UTF-8
Added newlines that were mistakenly left out of the print->sys.stdout.write() conversion in the previous CL
Python 3:
 - fixed error message comparison
 - explicit list creation
Windows:
 - forward/back slash annoyances
 - added stdint.h with definitions for int32_t, int64_t
 - compile_error_files and run_error_files were being appended to improperly
2012-01-10 16:55:00 -08:00
Matt Pharr
8a8e1a7f73 Fix bug with multiple EmitCode() calls due to missing braces.
In short, we were inadvertently trying to emit each function's
code a second time if the function had a mask check at the start
of it.  StmtList::EmitCode() was covering this error up by
not emitting code if the current basic block is NULL.
2012-01-10 16:50:13 -08:00
Jean-Luc Duprat
ef05fbf424 run_tests.py more compatible with python 3.x
except for the mutex class...
2012-01-10 13:12:38 -08:00
Jean-Luc Duprat
fa01b63fa5 Remove assumption that . is in the PATH in run_tests.py 2012-01-10 11:41:08 -08:00
Jean-Luc Duprat
63d3d25030 Fixed off by one error in array size generated by bitcode2cpp.py 2012-01-10 11:22:13 -08:00
Jean-Luc Duprat
a8db866228 Python build compatible on both python 2 and 3 2012-01-10 10:42:15 -08:00
Jean-Luc Duprat
0519eea951 Makefile does not hardcode link paths on Linux
Link statically for both x86 and x86-64
2012-01-10 10:34:57 -08:00
Matt Pharr
f4653ecd11 Release notes for 1.1.2 and doxygen version number bump 2012-01-09 16:05:40 -08:00
Jean-Luc Duprat
5d67252ed0 Python scripts now compatible with both 2.x and 3.x releases of python 2012-01-09 13:56:05 -08:00
Matt Pharr
5134de71c0 Fix Windows build (inttypes.h not available) 2012-01-09 09:05:20 -08:00
Matt Pharr
2be1251c70 Fix Makefile on OSX (uname -o not supported) 2012-01-09 07:40:47 -08:00
Matt Pharr
c0161aa17f Merge pull request #154 from palacaze/mingw
Mingw support
2012-01-09 07:37:02 -08:00
Pierre-Antoine Lacaze
b683aa11b1 Fix linking under mingw, libdl is Linux only. 2012-01-09 10:52:46 +01:00
Pierre-Antoine Lacaze
2654bb0112 Handle python installations in non-standards locations. 2012-01-09 10:29:54 +01:00
Pierre-Antoine Lacaze
d8728104b4 Handle the case whereby BUILD_DATE is already defined. 2012-01-09 10:29:16 +01:00
Pierre-Antoine Lacaze
0be1b70fba Mingw has strtoull, make use of it. 2012-01-09 10:28:52 +01:00
Pierre-Antoine Lacaze
a0e9793de3 Shut up warning wrt CONSOLE_SCREEN_BUFFER_INFO initialization 2012-01-09 10:19:46 +01:00
Pierre-Antoine Lacaze
da9200fcee Fix alloca use on mingw. 2012-01-09 10:19:09 +01:00
Pierre-Antoine Lacaze
54e8e8022b suppress warnings about long long arguments 2012-01-09 10:18:39 +01:00
Pierre-Antoine Lacaze
d84cf781da Mingw does not have sysconf, use the msc way of finding processors. 2012-01-09 09:45:40 +01:00
Pierre-Antoine Lacaze
002f27a30f Implement vasprintf and asprintf for platforms lacking them. 2012-01-09 09:44:58 +01:00
Matt Pharr
86d88e9773 run_tests.py: fix to use multiple cores on windows, ignore non ispc inputs 2012-01-08 15:29:20 -08:00
Matt Pharr
fda00afe6e Rename .txt files in docs to .rst (which is what they actually are). 2012-01-08 14:11:04 -08:00
Matt Pharr
be0c77d556 Detect more gather/scatter cases that are actually base+offsets.
We now recognize patterns like (ptr + offset1 + offset2) as being
cases we can handle with the base_offsets variants of the gather/scatter
functions.  (This can come up with multidimensional array indexing,
for example.)

Issue #150.
2012-01-08 14:06:44 -08:00
Matt Pharr
0ed11a7832 Compute SizeOf() and OffsetOf() at compile time in more cases with the generic target.
Really, we only have to be careful about the case where there is a vector of bools 
(i.e. a mask) involved, since the size of that isn't known at compile-time.
(Currently, at least.)
2012-01-08 14:06:44 -08:00
Matt Pharr
ff6971fb15 Use Assert() rather than assert() 2012-01-08 14:06:44 -08:00
Matt Pharr
5b4dbc8167 Fix build of aobench_instrumented example on OSX/Linux 2012-01-08 10:02:43 -08:00
Jean-Luc Duprat
59f4c9985e Python files compatible with python 3 2012-01-06 16:56:09 -08:00
Matt Pharr
8da9be1a09 Add support for 'k', 'M', and 'G' suffixes to integer constants.
(Denoting units of 1024, 1024*1024, and 1024*1024*1024, respectively.)

Issue #128.
2012-01-06 14:47:47 -08:00
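(For example, with the suffix values given above:)

    uniform int chunk  = 4k;   // 4 * 1024
    uniform int window = 2M;   // 2 * 1024 * 1024
    uniform int big    = 1G;   // 1024 * 1024 * 1024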
Matt Pharr
11033e108e Fix bug that prohibited assignments with pointer expressions on the LHS
Previously, code like "*(ptr+1) = foo" would claim that the LHS was invalid
for an assignment expression.

Issue #138.
2012-01-06 14:21:03 -08:00
Matt Pharr
4f97262cf2 Support function declarations in the definitions of other functions.
As part of this, function declarations are no longer scoped (this is permitted
by the C standard, as it turns out.)  So code like:

   void foo() { void bar(); }
   void bat() { bar(); }

Compiles correctly; the declaration of bar() in foo() is still available in the
definition of bat().

Fixes issue #129.
2012-01-06 13:50:10 -08:00
Matt Pharr
9b68b9087a Fix crash with anonymous function parameters in function definitions.
Issue #135.
2012-01-06 13:28:06 -08:00
Matt Pharr
15cc812e37 Add notion of "unbound" variability to the type system.
Now, when a type is declared without an explicit "uniform" or "varying"
qualifier, its variability is unbound; depending on the context of the
declaration, the variability is later finalized.

Currently, in almost all cases, types with unbound variability are
resolved to varying types; the one exception is typecasts like:
"(int)1"; in this case, the fact that (int) has unbound variability
carries through to the TypeCastExpr, which in turn notices that the
expression being type cast has uniform type and in turn will resolve
(int) to (uniform int).

Fixes issue #127.
2012-01-06 11:52:58 -08:00
Matt Pharr
71317e6aa6 Fix bug in gather/scatter optimization passes.
When flattening chains of insertelement instructions, we didn't
handle the case where the initial insertelement was to a constant
vector (with one value set and the other values undef).

Also generalized the "do all of the instances access the same location"
check to handle the case where some of them are accessing undef
locations; these are ignored in this check, as they should correspond to the
mask being off for that lane anyway.

Fixes issue #149.
2012-01-06 09:19:18 -08:00
Matt Pharr
1abaaee73e Fix bug where we'd sometimes inadvertently lose cv-qualifiers on pointers.
Fixes issue #142.
2012-01-06 08:41:01 -08:00
Matt Pharr
78c6d3c02f Add initial support for 'goto' statements.
ispc now supports goto, but only under uniform control flow--i.e.
it must be possible for the compiler to statically determine that
all program instances will follow the goto.  An error is issued at
compile time if a goto is used when this is not the case.
2012-01-05 12:22:36 -08:00
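(A sketch of a legal use under uniform control flow; hypothetical function,
and a goto under varying control flow would be rejected at compile time:)

    void countUp(uniform int n) {
        uniform int i = 0;
    again:
        // ... loop body ...
        ++i;
        if (i < n)       // uniform condition: all program instances branch together
            goto again;
    }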
Matt Pharr
48e9d4af39 Emit code for #includes in emitted C++ code all at the start of the file. 2012-01-05 12:22:35 -08:00
Matt Pharr
cb7ad371c6 Run tests using -O2.
Disconcertingly, this seems to fix some gcc-only crashes with the
generic-16 target (specifically, for half.ispc and for goto-[23].ispc--
those tests run fine with other compilers with generic-16.)
2012-01-05 12:22:35 -08:00
Matt Pharr
2951589825 Redo readme in better-looking rst form 2012-01-04 15:32:29 -08:00
Matt Pharr
f23dc5366a Updates to run_tests.py script.
Add support for the generic targets (using the headers in examples/intrinsics
if none is provided.)

Provide option to run valgrind on the compiled code.

Print a list of all failing tests at the end.
2012-01-04 12:59:03 -08:00
Matt Pharr
e3341176c5 Redo makefiles for the examples.
They're all based off a common examples/common.mk file, so that individual
makefiles are quite simple now.

The common.mk file also provides targets to build the examples using C++
output with the generic-16.h or sse4.h files.  These targets don't run by
default, but do run if 'make all' is run.
2012-01-04 12:59:03 -08:00
Matt Pharr
8938e14442 Add support for emitting ~generic vectorized C++ code.
The compiler now supports an --emit-c++ option, which generates generic
vector C++ code.  To actually compile this code, the user must provide
C++ code that implements a variety of types and operations (e.g. adding
two floating-point vector values together, comparing them, etc).

There are two examples of this required code in examples/intrinsics:
generic-16.h is a "generic" 16-wide implementation that does all required
with scalar math; it's useful for demonstrating the requirements of the
implementation.  Then, sse4.h shows a simple implementation of a SSE4
target that maps the emitted function calls to SSE intrinsics.

When using these example implementations with the ispc test suite,
all but one or two tests pass with gcc and clang on Linux and OSX.
There are currently ~10 failures with icc on Linux, and ~50 failures with
MSVC 2010.  (To be fixed in coming days.)

Performance varies: when running the examples through the sse4.h
target, some (e.g. the options example) have the same performance as when compiled
with --target=sse4 from ispc directly, while noise is 12% slower, rt is 26%
slower, and aobench is 2.2x slower.  The details of this haven't yet been
carefully investigated, but will be in coming days as well.

Issue #92.
2012-01-04 12:59:03 -08:00
Matt Pharr
4151778f5e Modify SizeOf() and StructOffset() to not compute value based on target for generic targets.
Specifically, we want to be able to late-bind on whether the mask is i32s or i1s, so if there's
any chance of ambiguity, we emit code that does the "GEP from a NULL base pointer" trick to
compute the value later in compilation.
2012-01-04 12:59:03 -08:00
Matt Pharr
23b85cd88d Remove broken debugging code. 2012-01-04 12:59:03 -08:00
Matt Pharr
234e5cd3e1 Use vector select for masked store blend if building with LLVM3.1 2012-01-04 12:59:03 -08:00
Matt Pharr
f75c94a8f1 Have aos/soa and broadcast/shuffle/rotate functions provided by the target.
The SSE/AVX targets use the old versions from util.m4, but these functions are
now passed through to the generic targets.
2012-01-04 12:59:03 -08:00
Matt Pharr
848a432640 Fix various small things that were broken with single-bit-per-lane masks.
Also small cleanups to declarations, "no captures" added, etc.
2012-01-04 12:59:03 -08:00
Matt Pharr
dea13979e0 Fix bug in lIs248Splat() in opt.cpp 2012-01-04 11:55:02 -08:00
Matt Pharr
052d34bf5b Various cleanups to optimization code.
Stop using the PassManagerBuilder but add all of the passes directly in code here.
This currently leads to no different behavior, but was useful when experimenting
with disabling the SROA pass when compiling to generic targets.
2012-01-04 11:54:44 -08:00
Matt Pharr
d4c5e82896 Add VSelMovMsk optimization pass.
Various peephole improvements to vector select instructions.
2012-01-04 11:52:27 -08:00
Matt Pharr
562d61caff Added masked load optimization pass.
This pass handles the "all on" and "all off" mask cases appropriately.

Also renamed load_masked stuff in built-ins to masked_load for consistency with
masked_store.
2012-01-04 11:51:26 -08:00
Matt Pharr
75f18c7c66 Add buildispc.bat script for just building the compiler on windows. 2012-01-04 11:44:19 -08:00
Matt Pharr
5d35349dc9 We were (unintentionally) only using structural equivalence to compare struct types.
Now we require that the struct name match for two struct types to be the same.
Added a test to check this.
(Also removed a stale test, movmsk-opt.ispc)
2012-01-04 11:44:00 -08:00
Matt Pharr
1a81173c93 Fix examples/options Makefile to use -O3 for serial builds.
Amazingly, it has been using just -g since the initial commit. :-(
2012-01-03 19:53:45 -08:00
Matt Pharr
1d9201fe3d Add "generic" 4, 8, and 16-wide targets.
When used, these targets end up with calls to undefined functions for all
of the various special vector stuff ispc needs to compile ispc programs
(masked store, gather, min/max, sqrt, etc.).

These targets are not yet useful for anything, but are a step toward
having an option to emit C++ code with calls out to intrinsics.

Reorganized the directory structure a bit and put the LLVM bitcode used
to define target-specific stuff (as well as some generic built-ins stuff)
into a builtins/ directory.

Note that for building on Windows, it's now necessary to set a LLVM_VERSION
environment variable (with values like LLVM_2_9, LLVM_3_0, LLVM_3_1svn, etc.)
2011-12-19 13:46:50 -08:00
Matt Pharr
6dbb15027a Take advantage of x86's free "scale by 2, 4, or 8" in addressing calculations
When loading from an address that's computed by adding two registers
together, x86 can scale one of them by 2, 4, or 8, for free as part
of the addressing calculation.  This change makes the code generated
for gather and scatter use this.

For the cases where gather/scatter is based on a base pointer and
an integer offset vector, the GSImprovementsPass looks to see if the
integer offsets are being computed as 2/4/8 times some other value.
If so, it extracts the 2x/4x/8x part and leaves the rest there as
the offsets.  The {gather,scatter}_base_offsets_* functions take
an i32 scale factor, which is passed to them, and then they carefully
generate IR so that it hits LLVM's pattern matching for these scales.

This is a particular win on AVX, since it saves us two 4-wide integer
multiplies.

Noise runs 14% faster with this.
Issue #132.
2011-12-16 15:55:44 -08:00
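A sketch of the kind of code that benefits (the function is hypothetical):

    float load4th(uniform float a[], int i) {
        // the *4 in the index can now be folded into the x86 addressing-mode
        // scale in the generated gather, instead of costing a per-lane multiply
        return a[4 * i];
    }
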
Matt Pharr
f23d030e43 Transition EstimateCost() AST traversal to WalkAST() as well. 2011-12-16 12:24:51 -08:00
Matt Pharr
701334ccf2 Transition type checking to use WalkAST() infrastructure. 2011-12-16 12:24:51 -08:00
Matt Pharr
f48a662ed3 Rewrite AST optimization infrastructure to be built on top of WalkAST().
Specifically, stmts and exprs are no longer responsible for first recursively
optimizing their children before doing their own optimization (this turned
out to be error-prone, with children sometimes being forgotten.)  They now
are just responsible for their own optimization, when appropriate.
2011-12-16 12:24:51 -08:00
Matt Pharr
ced3f1f5fc Have WalkAST postorder callback function return an ASTNode *
In general, it should just return the original node pointer, but for type checking
and optimization passes, it can return a new value for the node (that will be
assigned where the old one was in the tree.)

Along the way, fixed some bugs in WalkAST() where the postorder callback wouldn't
end up being called for a few expr types (sizeof, dereference, address of, 
reference).
2011-12-16 12:24:51 -08:00
Matt Pharr
018aa96c8b Remove old code for checking for break/continue under varying control flow. 2011-12-16 12:24:51 -08:00
Matt Pharr
34eda04d9b Rewrite check for loops for break/continue under varying CF to use WalkAST() 2011-12-16 12:24:51 -08:00
Matt Pharr
45767ad197 Remove no longer needed lSafeToRunWithAllLanesOff utility functions. 2011-12-16 12:24:51 -08:00
Matt Pharr
f9463af75b Add WalkAST() function for generic AST walking.
For starters, use it for the check to see if code is safe to run with the
mask all off.

This also fixes a bug where we would sometimes incorrectly say that
a whole block of code was unsafe to run with an all off mask because we came
to a NULL AST node during traversal.
2011-12-16 12:24:51 -08:00
Matt Pharr
6f6e28077f Release notes and doxygen bump for 1.1.1 2011-12-15 13:17:08 -08:00
Matt Pharr
0a9a7c939a Fix test runner script to not crash if one of the tests_errors didn't return the expected result. 2011-12-15 12:38:41 -08:00
Matt Pharr
f30a5dea79 Linux build fixes 2011-12-15 12:23:26 -08:00
Matt Pharr
018b547c40 Fix language builtin assert() (which was broken by 8d1b77b). 2011-12-15 12:10:27 -08:00
Matt Pharr
e82a720223 Fix various warnings / build issues on Windows 2011-12-15 12:06:38 -08:00
Matt Pharr
8d1b77b235 Have assertion macro and FATAL() text ask user to file a bug, provide URL to do so.
Switch to Assert() from assert() to make it clear it's not the C stdlib one we're
using any more.
2011-12-15 11:11:16 -08:00
Matt Pharr
b8987faeee Do assignment lvalue error checking in type checking
Added some tests related to this.
Also improved source file position reporting in error reporting.
2011-12-15 11:09:23 -08:00
Matt Pharr
17fdab2793 Issue errors if array dimensions are negative or too large to fit in 32 bits. 2011-12-15 06:00:42 -08:00
Matt Pharr
1fa6520cb6 Improvements to constant int parsing.
Accept 'u' and 'l' suffixes to force the constants to be the corresponding types.
Just carry around a single 64-bit int value in yylval rather than having both
32- and 64-bit variants.
2011-12-15 06:00:42 -08:00
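A short sketch of the accepted suffixes:

    uniform int a = 10;               // 32-bit signed by default
    uniform unsigned int b = 10u;     // 'u' forces an unsigned constant
    uniform int64 c = 10l;            // 'l' forces a 64-bit constant
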
Matt Pharr
b6af5c16c6 Remove old / unused warnings. 2011-12-15 06:00:41 -08:00
Matt Pharr
10ebe88abf Directly emit code for the mask checks at the start of complex functions.
Previously, we used an IfStmt to wrap complex functions with the equivalent
of a "cif" to check to see if the mask was all on, all off, or mixed at the
start of executing non-trivial functions.  This had the unintended side
effect of suggesting to other parts of the compiler that the entire function
was under varying control flow (which in turn led to some small code
quality issues.)

Now, we emit the equivalent code directly.
2011-12-15 06:00:41 -08:00
Matt Pharr
c0b41ad6f5 Fix bug in mask selection for references.
We should always use the full mask when storing to a reference, since we
don't in general know what it refers to (and thence the appropriate mask
to use for its target).
2011-12-15 06:00:41 -08:00
Matt Pharr
9920b30318 Fix bug that led to incorrect code with return statements.
The conceptual error was the assumption that not being under varying
control flow implied that the mask was all on; this is not the case
if some of the instances have executed a return earlier in the function's
execution.  The error in practice would be that the mask would be
assumed to be all-on for things like memory writes, so there would
be unintended side-effects for the instances that had returned.
2011-12-15 06:00:31 -08:00
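A sketch of the situation this fixes (the function is illustrative): the store after the 'if' is not under varying control flow, but the mask still may not be all on, because some instances returned earlier.

    void f(uniform float out[], float x) {
        if (x < 0)
            return;                // some program instances may return here
        out[programIndex] = x;     // this store must still be masked so the
                                   // returned instances don't write
    }
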
Matt Pharr
07f218137a Actually typecheck the arguments to functions called through function pointers.
(Somehow this wasn't being done before.)
Errors are now issued if too few arguments are used when calling through
a function pointer, too many arguments are used, or if any of them can't be
type converted to the parameter type.
2011-12-14 12:22:49 -08:00
Matt Pharr
89a5248f4f Print better error messages when function overload resolution fails. 2011-12-14 11:41:34 -08:00
Matt Pharr
891919074e Partial fix of a malformed program crasher.
Starts to address issue #135, but then a later assertion hits.
2011-12-14 11:41:02 -08:00
Matt Pharr
4adf527a4d Fix numerous typos in documentation (goodness) 2011-12-14 10:26:35 -08:00
Matt Pharr
533b539780 Add additional examples to better explain execution model to documentation. 2011-12-14 10:23:19 -08:00
Matt Pharr
6f26ae9801 Fix bugs with offsetting for varying values with gathers/scatters.
Fixes issue #134.
2011-12-12 14:13:46 -08:00
Matt Pharr
ddcdfff3ae Fix run_tests.py to print all output from tests (if any) 2011-12-12 14:13:01 -08:00
Matt Pharr
5b48354d9a Fix crashes from malformed programs. 2011-12-12 13:47:46 -08:00
Matt Pharr
46bfef3fce Add option to turn off codegen improvements when mask 'all on' is statically known. 2011-12-11 16:16:36 -08:00
Matt Pharr
20536bb339 Fix mandelbrot_tasks example 2011-12-11 15:21:11 -08:00
Matt Pharr
f6605ee465 Small cleanup: allocate storage for the full mask in the FunctionEmitContext constructor 2011-12-10 13:33:28 -08:00
Matt Pharr
034507a35b Update examples: bulk task launch in stencil/mandelbrot, use foreach more. 2011-12-10 11:11:30 -08:00
Matt Pharr
0b2febcec0 Update volume rendering workload: use AVX, remove reduce_equal() path.
Both of these changes gave a performance benefit!
2011-12-09 17:40:50 -08:00
Matt Pharr
d2fa735ef1 Provide ISPC_POINTER_SIZE predefined macro (32 or 64) 2011-12-09 16:37:42 -08:00
Matt Pharr
20f34b67da Fix typo in documentation 2011-12-09 16:31:03 -08:00
Matt Pharr
03f3db1e89 Fix bugs in ForeachStmt::TypeCheck() and Optimize() methods.
Specifically, we weren't storing the results passed back when we called
those methods on the start and end exprs.  This manifested itself as overloaded
functions there not resolving properly.
2011-12-08 15:29:20 -08:00
Matt Pharr
9805b0742d Switch to avx-x2 for the stencil workload 2011-12-08 14:36:09 -08:00
Matt Pharr
6000c696b2 Small fixes to optimization disabling code. 2011-12-08 14:35:57 -08:00
Matt Pharr
5a2edf723b Update with latest performance numbers. 2011-12-08 14:35:22 -08:00
Matt Pharr
aec7da740a Fix malformed program crashes. 2011-12-08 14:35:12 -08:00
Matt Pharr
a79bc75b72 Add a number of symbol names to list to make internal after loading builtins.
Fixes issue #131; because they weren't being marked as internal before, when
compiling to multiple targets these would lead to multiply-defined symbols.
2011-12-07 08:30:38 -08:00
Matt Pharr
eaaebf7928 Small documentation cleanups 2011-12-06 16:52:02 -08:00
Matt Pharr
198aa9620e Fix bug with mask used for gather/scatter code generation.
We should always use the full mask for this, never the internal mask.
Added tests for this.
2011-12-06 15:51:56 -08:00
Matt Pharr
27c53a3c25 Try 3 on warning about no output file specified 2011-12-06 14:44:41 -08:00
Matt Pharr
bd70182369 Add some additional tests 2011-12-06 14:26:52 -08:00
Matt Pharr
04df63d955 Update run_tests.py to work on Windows. Removed JIT-based testing path entirely. 2011-12-06 13:46:20 -08:00
Matt Pharr
d59131d670 Fix warning to not print "Warning" twice 2011-12-06 09:03:44 -08:00
Matt Pharr
9475e13d81 Issue a warning if no output file is specified. 2011-12-06 08:21:34 -08:00
Matt Pharr
765d86076f Basic support for AVX2 when building with LLVM3.1svn
For now this target just uses the same builtins-*.ll files as the
regular AVX1 target.  Once the gather intrinsic is available from
LLVM, we'll want to have custom target files that call out to that
for gathers. (The integer min/max intrinsics should be wired up to
the __{min,max}_varying_{int,uint}*() builtins at that point as
well.)
2011-12-06 08:20:53 -08:00
Matt Pharr
e2b6ed3db8 Fix build for LLVM 2.9 and 3.1svn 2011-12-06 08:08:41 -08:00
Matt Pharr
b22943b4a4 Update release notes and doxygen for 1.1.0 release 2011-12-05 14:44:53 -08:00
Matt Pharr
13df5f1cb9 Performance results page 2011-12-05 14:24:42 -08:00
Matt Pharr
f19c2aba40 Windows build fixes for examples, update options task granularity 2011-12-05 14:23:50 -08:00
Matt Pharr
ffc1d97df7 Fix aobench_instrumented build on Windows 2011-12-05 13:33:29 -08:00
Matt Pharr
9dd498718b Updated options pricing example to have a tasking-based path as well. 2011-12-05 13:24:34 -08:00
Matt Pharr
6181ce59ae FunctionCallExpr bug: launch count wasn't being type checked, optimized.
This manifested itself as a call to an overloaded function in a launch count
expression not being resolved.
2011-12-05 13:23:20 -08:00
Matt Pharr
48a6c2a35b Fix test for 16-wide case 2011-12-05 11:45:06 -08:00
Matt Pharr
0388f46a3b Remove test that was failing (now recorded as issue #130). 2011-12-05 09:39:50 -08:00
Matt Pharr
e3cae098fe Update test 2011-12-05 09:27:53 -08:00
Matt Pharr
455d963962 Don't ignore return value from getcwd() 2011-12-05 09:26:33 -08:00
Matt Pharr
d748c501c9 Fully automate building final HTML files from docs 2011-12-05 09:08:51 -08:00
Matt Pharr
5b8596102a Add entry about using valgrind with ispc to FAQ 2011-12-05 05:57:18 -08:00
Matt Pharr
dc525f281d Fix bug where declarations of arrays of func. ptrs would be lost.
Issue #126.
2011-12-05 05:35:47 -08:00
Matt Pharr
f95504fb5e Symbol table now properly handles scopes for function declarations.
Previously, they all went into one big pile that was never cleaned up;
this was the wrong thing to do in a world where one might have a 
function declaration inside another function, say.
2011-12-04 17:37:13 -08:00
Matt Pharr
32904dfa11 Fix uninitialized memory error introduced in d65c02f3 2011-12-04 16:39:56 -08:00
Matt Pharr
186d0223d2 Fix AoS/SoA stdlib functions to match documentation
(i.e. remove the old offset parameter stuff now that
we can actually pass pointers.)
2011-12-03 22:44:16 -08:00
Matt Pharr
3efbfc30b7 Issue an error if a varying lvalue is passed to a reference function parameter.
(Previously, we crashed.)
2011-12-03 15:35:50 -08:00
Matt Pharr
0fd7811344 Small documentation edits and updates. 2011-12-03 15:35:50 -08:00
Matt Pharr
d492ba08e6 Fix bugs that broke typedefs in function definitions.
Issue #118.
2011-12-03 15:35:44 -08:00
Matt Pharr
a1c0b4f95a Allow 'continue' statements in 'foreach' loops. 2011-12-03 09:31:02 -08:00
Matt Pharr
c3b55de1ad Fix volume rendering example for command-line args change 2011-12-03 09:30:10 -08:00
Matt Pharr
e07ef6d46a 1.1 Users guide final (for now) 2011-12-02 17:04:39 -08:00
Matt Pharr
3e4d69cbd3 Checkpoint work on specifying execution model 2011-12-02 16:01:05 -08:00
Matt Pharr
511a3ab15a Checkpoint documentation work 2011-12-01 17:00:41 -08:00
Matt Pharr
24ef9dac8f Use foreach in the deferred shading example 2011-12-01 17:00:30 -08:00
Matt Pharr
3bb6bff15d Add tests of things the docs claim will cause an error to be issued 2011-12-01 17:00:13 -08:00
Matt Pharr
1390aed99c Make 32-bit addressing the default.
Also renamed the command-line flag to --addressing={32,64}.
2011-12-01 13:38:40 -08:00
Matt Pharr
82aa6efd12 Checkpoint user's guide edits 2011-12-01 13:38:17 -08:00
Matt Pharr
f90aa172a6 Documentation work; first pass perf guide complete 2011-12-01 09:42:56 -08:00
Matt Pharr
a2f118a14e FAQ and perf guide updates 2011-11-30 19:38:37 -08:00
Matt Pharr
c5aecd51e9 Fix indentation in usage message. 2011-11-30 17:11:12 -08:00
Matt Pharr
4d6bcdf41c Documentation refactoring, initial pass at FAQ 2011-11-30 17:11:03 -08:00
Matt Pharr
8bc7367109 Add foreach and foreach_tiled looping constructs
These make it easier to iterate over arbitrary numbers of data
elements; specifically, they automatically handle the "ragged
extra bits" that come up when the number of elements to be
processed isn't evenly divisible by programCount.

TODO: documentation
2011-11-30 13:17:31 -08:00
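A minimal sketch of foreach (the function is illustrative):

    void scale(uniform float a[], uniform int count) {
        foreach (i = 0 ... count)    // handles the ragged remainder when count
            a[i] *= 2;               // isn't a multiple of programCount
    }
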
Matt Pharr
b48775a549 Handle global arrays better in varying pointer analysis.
Specifically, indexing into global arrays sometimes comes in as a big 
llvm::ConstantVector, so we need to handle traversing those as well when
we do the corresponding checks in GatherScatterFlattenOpt so that we
still detect cases where we can convert them into the base pointer +
offsets form that's used in later analysis.
2011-11-30 12:29:49 -08:00
Matt Pharr
d4d6bc5d7f Fix crasher from malformed program 2011-11-30 10:32:08 -08:00
Matt Pharr
7a2561c429 Add count_{leading,trailing}_zeros() functions to stdlib.
(Documentation is still yet to be written.)
2011-11-30 10:12:16 -08:00
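A small usage sketch (assuming the uniform int32 overloads):

    uniform int32 x = 8;                            // binary 1000
    uniform int32 lz = count_leading_zeros(x);      // 28 for a 32-bit value
    uniform int32 tz = count_trailing_zeros(x);     // 3
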
Matt Pharr
1703f2717c Add some new tests
One tricky pointer one currently hits an assertion (fix forthcoming).
2011-11-30 09:43:25 -08:00
Matt Pharr
92c46a2fc7 Add ISPC_{MAJOR,MINOR}_VERSION macros
(Currently set to 1.1)
2011-11-30 08:32:17 -08:00
Matt Pharr
c995902796 Add --werror flag to treat warnings as errors.
The specific need for it was so that tests in tests_errors
can test to see if a desired diagnostic warning is issued
(like ptrcast-lose-info does.)
2011-11-30 05:51:53 -08:00
Matt Pharr
6b9b7437ed Parse and then mostly ignore "signed" qualifier.
Just issue errors if both "signed" and "unsigned" are specified,
or if "signed" is applied to a non-int type.
2011-11-29 21:41:04 -08:00
Matt Pharr
a3641d7691 Convert arrays to pointers in expressions like (a+5)
This was one instance of the C-style array/pointer duality that
was missed the first time around.
2011-11-29 17:41:00 -08:00
Matt Pharr
e780662a3f Issue error if unsupported version of LLVM is used. 2011-11-29 17:25:35 -08:00
Matt Pharr
d65c02f323 Allow '0' to convert to a NULL pointer value. 2011-11-29 17:22:22 -08:00
Matt Pharr
11547cb950 stdlib updates to take advantage of pointers
The packed_{load,store}_active functions now take a pointer to a
location at which to start loading/storing, rather than an array
base and a uniform index.

Variants of the prefetch functions that take varying pointers 
are now available.

There are now variants of the various atomic functions that take
varying pointers (issue #112).
2011-11-29 15:41:38 -08:00
Matt Pharr
bbb32c0c5d Fix a bug where uniform/varying pointers aren't mangled differently 2011-11-29 15:41:01 -08:00
Matt Pharr
b1ae307163 Fix bug in FunctionEmitContext::SyncInst()
The launch group handle is now reset to NULL after sync is called;
this ensures that if tasks are launched in the same function after
a sync, the ISPCAlloc() call for the next launch will be
passed a NULL handle (as it should be).
2011-11-29 13:26:48 -08:00
Matt Pharr
e52104ff55 Pointer fixes/improvements.
Allow <, <=, >, >= comparisons of pointers
Allow explicit type-casting of pointers to and from integers
Fix bug in handling expressions of the form "int + ptr" ("ptr + int"
  was fine).
Fix a bug in TypeCastExpr where varying -> uniform typecasts
  would be allowed (leading to a crash later)
2011-11-29 13:22:36 -08:00
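A sketch of what is now accepted (the function is illustrative):

    void clear(uniform float * uniform begin, uniform float * uniform end) {
        for (uniform float * uniform p = begin; p < end; ++p)   // ordered pointer comparison
            *p = 0;
        uniform int64 bits = (uniform int64)begin;    // explicit pointer-to-integer cast
    }
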
Matt Pharr
4ca90272ba Fixes to build with LLVM 3.1 top of tree 2011-11-28 20:25:33 -08:00
Matt Pharr
2a6e3e5fea Fix bug in ptr+offset decomposition in GatherScatterFlattenOpt
Given IR that encoded computation like "vec(4) + ptr2int(some pointer)",
we'd report that "int2ptr(4)" was the base pointer and the ptr2int 
value was the offset.  This in turn could lead to incorrect code
from LLVM, since we'd end up with GEP instructions where the first
operand was int2ptr(4) and the offset was the original pointer value.
This in turn was sometimes leading to incorrect code and thence a 
failure on the tests/gs-double-improve-multidim.ispc test since LLVM's
memory read/write analysis assumes that nothing after the first operand
of a GEP is actually a pointer.
2011-11-28 15:00:41 -08:00
Matt Pharr
867efc2bce Multiple small fixes for better C conformance.
Allow atomic types to be initialized with single-element expression lists:
  int x = { 5 };
Issue an error if a storage class is provided with a function parameter.
Issue an error if two members of a struct have the same name.
Issue an error on trying to assign to a struct with a const member, even if
  the struct itself isn't const.
Issue an error if a function is redefined.
Issue an error if a function overload is declared that differs only in return
  type from a previously-declared function.
Issue an error if "inline" or "task" qualifiers are used outside of function
  declarations.
Allow trailing ',' at the end of enumerator lists.
Multiple tests for all of the above.
2011-11-27 13:09:59 -08:00
Matt Pharr
975db80ef6 Add support for pointers to the language.
Pointers can be either uniform or varying, and behave correspondingly.
e.g.: "uniform float * varying" is a varying pointer to uniform float
data in memory, and "float * uniform" is a uniform pointer to varying
data in memory.  Like other types, pointers are varying by default.

Pointer-based expressions, & and *, sizeof, ->, pointer arithmetic,
and the array/pointer duality all behave as in C.  Array arguments
to functions are converted to pointers, also like C.

There is a built-in NULL for a null pointer value; conversion from
compile-time constant 0 values to NULL still needs to be implemented.

Other changes:
- Syntax for references has been updated to be C++ style; a useful
  warning is now issued if the "reference" keyword is used.
- It is now illegal to pass a varying lvalue as a reference parameter
  to a function; references are essentially uniform pointers.
  This case had previously been handled via special-case call-by-value-return
  code.  That path has been removed, now that varying pointers
  are available to handle this use case (and much more).
- Some stdlib routines have been updated to take pointers as
  arguments where appropriate (e.g. prefetch and the atomics).
  A number of others still need attention.
- All of the examples have been updated
- Many new tests

TODO: documentation
2011-11-27 13:09:59 -08:00
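A brief sketch of the uniform/varying pointer distinction described above (names are illustrative):

    uniform float data[64];
    uniform float * uniform p = &data[0];              // one pointer, shared by all instances
    uniform float * varying q = &data[programIndex];   // a distinct pointer per instance
    *q = 0;                                            // per-instance (scatter-style) store
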
Matt Pharr
15a7d353ab Fix bug where freed std::string memory would sometimes still be accessed. 2011-11-24 20:46:09 -08:00
Matt Pharr
5828f7da07 Fix bugs in ArrayType::GetString() and GetCDeclaration() 2011-11-22 15:34:03 -08:00
Matt Pharr
d3e6879223 Improve error checking for unsized arrays.
Added support for resolving dimensions of multi-dimensional unsized arrays
from their initializer expressions (previously, only the first dimension
would be resolved.)

Added checks to make sure that no unsized array dimensions remain after
doing this (except for the first dimension of array parameters to
functions.)
2011-11-21 10:41:23 -08:00
Matt Pharr
068ea3e4c4 Better SourcePos reporting for gathers/scatters 2011-11-21 10:26:53 -08:00
Matt Pharr
f5a21d96a1 Fix malformed program crasher 2011-11-21 10:26:53 -08:00
Matt Pharr
7290f7b16b Generalize/improve parsing of pointer declarations.
Substantial improvements and generalizations to the parsing and
declaration handling code to properly parse declarations involving
pointers.  (No change to user-visible functionality, but this
lays groundwork for supporting a more general pointer model.)
2011-11-14 08:45:55 -08:00
Matt Pharr
79684a0bed Add support for running tests that are expected to fail
Also add should-fail tests that exercise const and decl
initializers
2011-11-14 08:45:41 -08:00
Matt Pharr
6c8a064a5a Remove debugging dump() call 2011-11-07 14:57:28 -08:00
Matt Pharr
e5327a0f5a Update debug prints 2011-11-07 06:27:45 -08:00
Matt Pharr
f8eb100c60 Use llvm TargetData to find object sizes, offsets.
Previously, to compute the size of objects and the offsets of struct
elements within structs, we were using the trick of using getelementptr
with a NULL base pointer and then casting the result to an int32/64.
However, since we actually know the target we're compiling for at
compile time, we can use corresponding methods from TargetData to
get these values directly.

This mostly cleans up code, but may make some of the gather/scatter
lowering to loads/stores optimizations work better in the presence
of structures.
2011-11-06 19:31:19 -08:00
Matt Pharr
7a1ce558e9 Small code refactoring. 2011-11-06 16:47:45 -08:00
Matt Pharr
cabe358c0a Workaround change to linker behavior in LLVM 3.1
Now, the Linker::LinkModules() call doesn't link in any functions
marked as 'internal', which is problematic, since we'd like to have
just about all of the builtins marked as internal so that they are
eliminated after they've been inlined when they are in fact used.

This change removes all of the internal qualifiers in the builtins
and adds a lSetInternalFunctions() routine to builtins.cpp that
sets this property on the functions that need it after they've
been linked in by LinkModules().
2011-11-05 16:57:26 -07:00
Matt Pharr
b0d476fcdc Stop zero-initializing memory used to store return values.
This seems to have a noticeable (small) performance benefit on a
few of the example workloads.
2011-11-05 09:49:44 -07:00
Matt Pharr
51ccfffbd0 Fix crash due to trying to type convert ExprLists in DeclStmt.
(Regression from function pointer changes.)
2011-11-05 09:35:43 -07:00
Matt Pharr
5fc8df3e55 Fix build with LLVM ToT 2011-11-04 07:06:15 -07:00
Matt Pharr
ba9bb3338f Add tests for function pointers. 2011-11-03 16:14:15 -07:00
Matt Pharr
afcd42028f Add support for function pointers.
Both uniform and varying function pointers are supported; when a function
is called through a varying function pointer, each unique function pointer
value across the running program instances is called once for the set of
active program instances that want to call it.
2011-11-03 16:14:14 -07:00
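A sketch of a varying function pointer call (the functions are illustrative):

    float square(float x) { return x * x; }
    float negate(float x) { return -x; }

    float apply(float x) {
        float (*fp)(float);          // varying function pointer (varying by default)
        if (x > 0)
            fp = square;             // instances can end up with different targets;
        else
            fp = negate;             // each unique target is called once for the
        return fp(x);                // set of active instances that selected it
    }
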
Matt Pharr
f1d8ff96ce Remove (unused) IfStmt::doAnyCheck. 2011-11-03 16:14:14 -07:00
Matt Pharr
d528533fba Add FunctionEmitContext::SmearScalar() method (and use it). 2011-11-03 16:14:14 -07:00
Matt Pharr
7d6f89c8d2 Improvements to source file position tracking.
Be better about tracking the full extent of expressions in the parser;
this leads to more intelligible error messages when we indicate where
exactly the error happened.
2011-11-03 16:14:14 -07:00
Matt Pharr
43a2d510bf Incorporate per-lane offsets for varying data in the front-end.
Previously, it was only in the GatherScatterFlattenOpt optimization pass that
we added the per-lane offsets when we were indexing into varying data.
(Specifically, the case of float foo[]; int index; foo[index], where foo
is an array of varying elements rather than uniform elements.)  Now, this
is done in the front-end as we're first emitting code.

In addition to the basic ugliness of doing this in an optimization pass, 
it was also error-prone to do it there, since we no longer have access
to all of the type information that's around in the front-end.

No functionality or performance change.
2011-11-03 13:15:07 -07:00
Matt Pharr
6084d6aeaf Added disable-handle-pseudo-memory-ops option. 2011-10-31 08:29:13 -07:00
Matt Pharr
d224252b5d Fix bug where multiplying varying array offset by zero would cause crash in optimization passes. 2011-10-31 08:28:51 -07:00
Matt Pharr
e009c0a61d Be able to determine if two types can be converted without requiring an Expr *.
The Expr::TypeConv() method has been replaced with both a
CanConvertTypes() routine that indicates whether one type
can be converted to another and a TypeConvertExpr()
routine that provides the same functionality as
Expr::TypeConv() used to.
2011-10-30 14:12:12 -07:00
Matt Pharr
d5a8538192 Move logic for resolving function call overloads.
This code previously lived in FunctionCallExpr but is now part
of FunctionSymbolExpr.  This change doesn't change any current
functionality, but lays groundwork for function pointers in
the language, where we'll want to do function call overload
resolution at other times besides when a function call is
actually being made.
2011-10-30 14:00:11 -07:00
Matt Pharr
cc298cd5fe Remove out-of-date comments about AVX being untested 2011-10-28 11:10:33 -07:00
Matt Pharr
8b719e4c4e Fix warnings reported by doxygen 2011-10-20 11:49:54 -07:00
Matt Pharr
a7dff17b35 Release notes and doxygen bump for v1.0.12 2011-10-20 11:45:58 -07:00
Matt Pharr
074cbc2716 Fix #ifdefs to catch LLVM 3.1svn now as well 2011-10-19 14:01:19 -07:00
Matt Pharr
114cb5b5c7 Add documentation about efficient reductions. Issue #110 2011-10-18 18:04:46 -07:00
Matt Pharr
f45ab0744e Significantly reduce the tendrils of DeclSpecs/Declarator/Declaration code
The stuff in decl.h/decl.cpp is messy, largely due to its close mapping
to C-style variable declarations.  This checkin has updated code throughout
all of the declaration statement, variable, and function code that operates
on symbols and types directly.  Thus, Decl* related stuff is now localized
to decl.h/decl.cpp and the parser.

Issue #13.
2011-10-18 15:37:29 -07:00
Matt Pharr
9b8ea3d500 Fix parser/lexer to more carefully check the "C" part of 'extern "C"' declarations. 2011-10-18 16:44:14 -04:00
Matt Pharr
290032f4f5 Be more careful about using the right mask when emitting gathers.
Specifically, we had been using the full mask for all gathers, rather than
using the internal mask when we were loading from locally-declared arrays.
Thus, given code like:

  uniform float x[programCount] = { ... };
  float xx = x[programIndex];

Previously we weren't generating a plain vector load to initialize xx when
this code was in a function where it wasn't known that the mask was all on,
even though we should have been.  Now we do.
2011-10-17 20:25:52 -04:00
Matt Pharr
19087e4761 When casting pointers to ints, choose int32/64 based on target pointer size.
Issue #97.
2011-10-17 06:57:04 -04:00
Matt Pharr
70047fbf5f Disable warnings about type conversions that may lose precision.
It's not clear that these are actually all that helpful.
This also works around issue #89, wherein code like "int8 = 0" would
  give a warning about conversion from int32 to int8.
2011-10-17 06:36:42 -04:00
Matt Pharr
39ed7e14b2 Various improvements to function overload resolution.
Generalize the overload resolution code to be based on estimating a
  cost for various overload options and picking the one with the
  minimal cost.
Add a step that considers type conversions that are guaranteed to
  not lose information in function overload resolution.
Print better diagnostics when we can't find an unambiguous match.
2011-10-16 20:46:56 -04:00
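A sketch of the "can't lose information" step (the overloads are illustrative):

    void f(uniform int x);
    void f(uniform double x);

    void g(uniform float y) {
        f(y);    // resolves to f(uniform double): float->double never loses
                 // information, while float->int can
    }
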
Matt Pharr
209d093720 Update calls to setup clang::DiagnosticsEngine properly before running preprocessor.
This fixes (again) crashes when printing diagnostics when there are errors in 
preprocessor directives in the program we're compiling.
2011-10-16 07:59:55 -04:00
Matt Pharr
fc2954419d Update cost model to include "if" overhead in "if" statement calculation. 2011-10-15 13:52:08 -07:00
Matt Pharr
422b8268a9 Add assert() statement support. Issue #106. 2011-10-15 13:50:05 -07:00
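A small sketch of the new statement (the function is illustrative):

    void store(uniform float a[], int i, uniform int count) {
        assert(i >= 0 && i < count);   // checked at runtime for the active program instances
        a[i] = 0;
    }
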
Matt Pharr
1ab05c0351 Set a preprocessor #define based on the target ISA.
For example, ISPC_TARGET_SSE4 is #defined for the sse4 targets, etc.
2011-10-15 12:00:42 -07:00
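A sketch of how the define can be used in ispc source:

    #ifdef ISPC_TARGET_SSE4
        // code specialized for the sse4 targets
    #else
        // generic fallback path
    #endif
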
Matt Pharr
c21e704a5c Fix LLVM 2.9 build. Issue #114 2011-10-15 06:48:20 -07:00
Matt Pharr
9f2aa8d92a Handle ConstantExpressions when computing address+offset vectors for scatter/gather.
In particular, this fixes issue #81, where a global variable access was leading to
ConstantExpressions showing up in this code, which it wasn't previously expecting.
2011-10-14 11:20:08 -07:00
Matt Pharr
2460fa5c83 Improve gather/scatter optimization passes to handle loops better.
Specifically, now we can work through phi nodes in the IR to detect cases
where an index value is actually the same across lanes or is linear across
the lanes.  For example, this is a loop that used to require gathers but
is now turned into vector loads:

    for (int i = programIndex; i < 16; i += programCount)
        sum += a[i];

Fixes issue #107.
2011-10-13 17:01:25 -07:00
Matt Pharr
dce25249ce Use the "avoid masked assignments when possible" tricks for pre/post decrement exprs.
Also, call out to the subroutine that handles this logic for dealing with call-by-value-return
stuff in function calls.
2011-10-13 16:46:30 -07:00
Matt Pharr
61adc74072 Add missing builtins-sse4-common.ll file 2011-10-11 19:40:37 -07:00
Matt Pharr
88e317f1a9 These tests now pass with LLVM ToT 2011-10-11 16:17:50 -07:00
Matt Pharr
49454bc207 Fix silly bug in 16-wide AOS-SOA 3-vector routine 2011-10-11 16:16:56 -07:00
Matt Pharr
286c23426e Add "double-wide" sse2-x2 target.
i.e. run 8 program instances together, along the lines of the double-pumped
sse4-x2 target.
2011-10-11 15:17:31 -07:00
Matt Pharr
1198520029 Improve gather->vector load optimization to detect <linear sequence>-<uniform> case.
Previously, we didn't handle subtraction ops when deciphering offsets in order to
try to change gathers to vector loads.
2011-10-11 13:24:40 -07:00
Matt Pharr
06d70376ea Fix to build with LLVM TOT after LLVM API change 2011-10-11 09:26:45 -07:00
Matt Pharr
7cd7ca82d6 Fix some crashes from malformed programs 2011-10-11 08:28:50 -07:00
Matt Pharr
ecda4561bd Move some tests that now pass with LLVM 3.0 from failing_tests to tests/ 2011-10-10 11:51:47 -07:00
Matt Pharr
a89e26d725 Improvements to mask management code; removes a number of unnecessary blends.
We now maintain the distinction between the value of the mask passed into a
function and the "internal" mask within the function that only accounts for
varying control flow within the function.

The full mask (the AND of the function mask and the internal mask) must be used
for assignments to static and global variables, and reference function parameters.
Further, it is the appropriate mask to use for making decisions about varying
control flow.  However, we can use the internal mask for assignments to variables
declared in the current function (including the return value and non-reference
parameters to the function).  Doing so allows us to catch a few more cases where
the internal mask is all on, even if the mask coming into the function wasn't all
on, and thence use moves rather than blends for those assignments.  (Which in
turn can allow additional optimizations to happen.)

Fixes issue #23.
2011-10-10 11:47:19 -07:00
Matt Pharr
3cb0115dce Add routines to standard library to do efficient AOS/SOA conversions.
Currently, we just support 3 and 4-wide variants (i.e. xyzxyz.. and xyzwxyzw..),
for int32 and float types.
2011-10-10 10:56:06 -07:00
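A hypothetical usage sketch of the 3-wide float variant (the exact function name and parameter forms may differ from what is shown):

    uniform float pts[3 * programCount];    // xyzxyz... (AOS) layout in memory
    float x, y, z;
    aos_to_soa3(pts, &x, &y, &z);           // per-component (SOA) values, one per instance
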
Matt Pharr
f5391747b9 Remove suggestions from parser that we support "char" 2011-10-10 10:54:06 -07:00
Matt Pharr
b8768ffdfa Merge branch 'master' of github.com:ispc/ispc 2011-10-07 20:39:44 -07:00
Matt Pharr
6009608bc6 Mark inlined functions as having static linkage 2011-10-07 16:06:14 -07:00
Matt Pharr
ce7355f9ed Windows: fix examples build to look for ispc.exe in ../.. as well 2011-10-09 07:40:18 -07:00
Matt Pharr
6b4459d402 Windows: fix some compiler warnings during build 2011-10-09 07:40:17 -07:00
Matt Pharr
790dba2558 Doxygen bump and release notes for v1.0.11 2011-10-07 09:57:55 -07:00
Matt Pharr
4a2cbf2c4e Fix regression from AST checkin that caused perf. warnings to be issued for stdlib code. 2011-10-07 09:20:48 -07:00
Matt Pharr
53dd65fa2e Add ispc_test to buildall.bat script 2011-10-08 17:17:05 -07:00
Matt Pharr
f5afa52fd9 Add missing header 2011-10-06 17:10:30 -07:00
Matt Pharr
f9c67ff806 Explicit representation of ASTs for all the functions in a compile unit.
Added AST and Function classes.
Now, we parse the whole file and build up the AST for all of the
  functions in the Module before we emit IR for the functions (vs. before,
  when we generated IR along the way as we parsed the source file.)
2011-10-06 15:35:27 -07:00
Matt Pharr
ec5e627e56 Mark internal stdlib functions as "internal" linkage, not "private".
This fixes print() statements on OSX.
(http://llvm.org/bugs/show_bug.cgi?id=11080)
2011-10-06 13:32:20 -07:00
Matt Pharr
ff2a43ac19 Run the CFG simplification pass even when optimization is disabled.
This fixes an issue with undefined SVML symbols with code that called
transcendental functions in the standard library, even when the SVML
math library hadn't been selected.
2011-10-06 09:20:50 -07:00
Matt Pharr
9feea32471 Fix errors in documentation for some of the reduce_* stdlib functions 2011-10-06 07:52:10 -07:00
Matt Pharr
bedaec2295 Update examples for multi-target compilation.
Makefile and vcxproj file updates.
Also modified vcxproj files so that the various files ispc generates go into $(TargetDir),
  not the current directory.
Modified the ray tracer example to not have uniform short-vector types in its app-visible
  datatypes (these are laid out differently on SSE vs AVX); there was an existing lurking
  bug in the way this was done before.
2011-10-04 16:01:56 -07:00
Matt Pharr
a68d137df6 Documentation update for multi-target compilation. 2011-10-04 16:01:56 -07:00
Matt Pharr
59caa3d4e1 Various small Windows fixes.
Also fixed some tabs/spaces and compiler warning issues.
2011-10-04 16:01:56 -07:00
Matt Pharr
06975bc7ab Add support for compiling to multiple targets.
If a flag along the lines of "--target=sse4,avx-x2" is provided on the command-line,
then the program will be compiled for each of the given targets, with a separate
output file generated for each one.  Further, an output file with dispatch functions
that check the current system's CPU and then choose the best available variant
is also created.

Issue #11.
2011-10-04 16:01:55 -07:00
Matt Pharr
880cbb18cc Remove checks to see if the system's processor matches the target the code was compiled for.
(Preparation for multi-target output.)
2011-10-04 16:01:55 -07:00
Matt Pharr
686d9975b6 Add Symbol::exportedFunction member to hold llvm::Function * for app-callable version of function. 2011-10-04 15:56:54 -07:00
Matt Pharr
9b7f55a28e Add buildall.bat script for Windows. Also various example build fixes for Windows 2011-10-04 11:42:04 -07:00
Matt Pharr
e4d224a0f1 Use __cilk to detect Cilk support 2011-10-04 11:16:42 -07:00
Matt Pharr
0933a77c1b Improve task decomposition in ray tracing example.
Specifically, launch all of the tasks in one statement, rather than
still looping over spans in y and launching a collection of tasks
across x for each span.  This seems to give a few percent better
performance.
2011-10-04 09:33:59 -07:00
Matt Pharr
5f78edf07a Fix bug with screen decomposition in volume rendering example 2011-10-04 09:30:02 -07:00
Matt Pharr
a6fc657b40 Remove 'externGlobals' member from Module; instead find them when needed via new SymbolTable::GetMatchingVariables method. 2011-10-04 06:36:31 -07:00
Matt Pharr
fa5050d5c7 Error reporting improvements.
Don't print more than 3 lines of source file context with errors.
  (Any more than that is almost certainly not the Right Thing to do.)
Make some parsing error messages more clear.
2011-10-03 21:09:04 -07:00
Matt Pharr
d5a48d9a1e Fix incorrect LLVM_3_0svn #ifdefs 2011-10-03 08:29:19 -07:00
Matt Pharr
2df9da2524 Be careful to not inadvertently match NULL functions in optimization passes. 2011-10-01 08:34:11 -07:00
Matt Pharr
0b02f94988 Task system performance tweaks.
Switch back to GCD on OSX.
Increase TaskInfo allocation count.
This fixes the regression with deferred on AVX (from 17x to 25x
  again with 4 cores.)
2011-10-01 08:04:09 -07:00
Matt Pharr
65c50b60fc Cleanups to deferred shading workload 2011-09-30 20:35:42 -07:00
Matt Pharr
9de34eb22c Release notes and doxygen bump for v1.0.10 2011-09-30 19:42:14 -07:00
Matt Pharr
f8f25a11b6 Added deferred shading workload 2011-09-30 19:42:14 -07:00
749 changed files with 55229 additions and 16875 deletions

.gitignore

@@ -5,4 +5,11 @@ ispc
 ispc_test
 objs
 docs/doxygen
-docs/ispc.html
+docs/*.html
+tests*/*cpp
+tests*/*run
+examples/*/*.png
+examples/*/*.ppm
+examples/*/objs/*

Makefile

@@ -2,41 +2,76 @@
 # ispc Makefile
 #
+# If you have your own special version of llvm and/or clang, change
+# these variables to match.
+LLVM_CONFIG=$(shell which llvm-config)
+CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
+# Add llvm bin to the path so any scripts run will go to the right llvm-config
+LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
+export PATH:=$(LLVM_BIN):$(PATH)
 ARCH_OS = $(shell uname)
+ifeq ($(ARCH_OS), Darwin)
+ARCH_OS2 = "OSX"
+else
+ARCH_OS2 = $(shell uname -o)
+endif
 ARCH_TYPE = $(shell arch)
+ifeq ($(shell $(LLVM_CONFIG) --version), 3.1svn)
+LLVM_LIBS=-lLLVMAsmParser -lLLVMInstrumentation -lLLVMLinker \
+    -lLLVMArchive -lLLVMBitReader -lLLVMDebugInfo -lLLVMJIT -lLLVMipo \
+    -lLLVMBitWriter -lLLVMTableGen \
+    -lLLVMX86Disassembler -lLLVMX86CodeGen -lLLVMSelectionDAG \
+    -lLLVMAsmPrinter -lLLVMX86AsmParser -lLLVMX86Desc -lLLVMX86Info \
+    -lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCDisassembler -lLLVMMCParser \
+    -lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMTransformUtils \
+    -lLLVMipa -lLLVMAnalysis -lLLVMMCJIT -lLLVMRuntimeDyld \
+    -lLLVMExecutionEngine -lLLVMTarget -lLLVMMC -lLLVMObject -lLLVMCore \
+    -lLLVMSupport
+else
+LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs)
+endif
 CLANG=clang
 CLANG_LIBS = -lclangFrontend -lclangDriver \
     -lclangSerialization -lclangParse -lclangSema \
     -lclangAnalysis -lclangAST -lclangLex -lclangBasic
+ifeq ($(shell $(LLVM_CONFIG) --version), 3.1svn)
+CLANG_LIBS += -lclangEdit
+endif
-ISPC_LIBS=$(CLANG_LIBS) \
-    $(shell llvm-config --ldflags --libs) \
-    -lpthread -ldl
-ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
-    -lpthread -ldl
-LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
-LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
-LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)
+ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
+    -lpthread
+ifeq ($(ARCH_OS),Linux)
+ISPC_LIBS += -ldl
+endif
+ifeq ($(ARCH_OS2),Msys)
+ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
+endif
+LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
+LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed s/\\./_/)
+LLVM_VERSION_DEF=-D$(LLVM_VERSION)
 BUILD_DATE=$(shell date +%Y%m%d)
 BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
 CXX=g++
 CPP=cpp
-CXXFLAGS=-g3 $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
+OPT=-g3
+CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
+    -Wall $(LLVM_VERSION_DEF) \
     -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
 LDFLAGS=
 ifeq ($(ARCH_OS),Linux)
 # try to link everything statically under Linux (including libstdc++) so
 # that the binaries we generate will be portable across distributions...
-ifeq ($(ARCH_TYPE),x86_64)
-LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
-else
-LDFLAGS=-L/usr/lib/gcc/i686-redhat-linux/4.6.0
-endif
+LDFLAGS=-static
 endif
 LEX=flex
@@ -44,21 +79,25 @@ YACC=bison -d -v -t
 ###########################################################################
-CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
-    llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
-    util.cpp
-HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
-    opt.h stmt.h sym.h type.h util.h
-BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
-    builtins-sse4.ll builtins-sse4x2.ll
+CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
+    ispc.cpp llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp \
+    type.cpp util.cpp
+HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
+    opt.h stmt.h sym.h type.h util.h
+TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
+    generic-16 generic-1
+BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
+    builtins/dispatch.ll
+BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
+    builtins-c-32.cpp builtins-c-64.cpp
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll
-OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
-    builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
-    $(FLEX_SRC:.ll=.o))
+OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
+    stdlib_generic_ispc.o stdlib_x86_ispc.o \
+    $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
-default: ispc ispc_test
+default: ispc
 .PHONY: dirs clean depend doxygen print_llvm_src
 .PRECIOUS: objs/builtins-%.cpp
@@ -77,7 +116,7 @@ print_llvm_src:
     @echo Using LLVM `llvm-config --version` from `llvm-config --libdir`
 clean:
-    /bin/rm -rf objs ispc ispc_test
+    /bin/rm -rf objs ispc
 doxygen:
     /bin/rm -rf docs/doxygen
@@ -87,14 +126,18 @@ ispc: print_llvm_src dirs $(OBJS)
     @echo Creating ispc executable
     @$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
-ispc_test: dirs ispc_test.cpp
-    @echo Creating ispc_test executable
-    @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)
 objs/%.o: %.cpp
     @echo Compiling $<
     @$(CXX) $(CXXFLAGS) -o $@ -c $<
+objs/cbackend.o: cbackend.cpp
+    @echo Compiling $<
+    @$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
+objs/%.o: objs/%.cpp
+    @echo Compiling $<
+    @$(CXX) $(CXXFLAGS) -o $@ -c $<
 objs/parse.cc: parse.yy
     @echo Running bison on $<
     @$(YACC) -o $@ $<
@@ -111,34 +154,24 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
     @echo Compiling $<
     @$(CXX) $(CXXFLAGS) -o $@ -c $<
-objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
-    @echo Creating C++ source from builtin definitions file $<
-    @m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
-objs/builtins-%.o: objs/builtins-%.cpp
-    @echo Compiling $<
-    @$(CXX) $(CXXFLAGS) -o $@ -c $<
-objs/builtins-c-32.cpp: builtins-c.c
-    @echo Creating C++ source from builtins definition file $<
-    @$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@
-objs/builtins-c-32.o: objs/builtins-c-32.cpp
-    @echo Compiling $<
-    @$(CXX) $(CXXFLAGS) -o $@ -c $<
-objs/builtins-c-64.cpp: builtins-c.c
-    @echo Creating C++ source from builtins definition file $<
-    @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
-objs/builtins-c-64.o: objs/builtins-c-64.cpp
-    @echo Compiling $<
-    @$(CXX) $(CXXFLAGS) -o $@ -c $<
-objs/stdlib_ispc.cpp: stdlib.ispc
-    @echo Creating C++ source from $<
-    @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@
-objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
-    @echo Compiling $<
-    @$(CXX) $(CXXFLAGS) -o $@ -c $<
+objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
+    @echo Creating C++ source from builtins definition file $<
+    @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | python bitcode2cpp.py $< > $@
+objs/builtins-c-32.cpp: builtins/builtins.c
+    @echo Creating C++ source from builtins definition file $<
+    @$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-32 > $@
+objs/builtins-c-64.cpp: builtins/builtins.c
+    @echo Creating C++ source from builtins definition file $<
+    @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-64 > $@
+objs/stdlib_generic_ispc.cpp: stdlib.ispc
+    @echo Creating C++ source from $< for generic
+    @$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+        python stdlib2cpp.py generic > $@
+objs/stdlib_x86_ispc.cpp: stdlib.ispc
+    @echo Creating C++ source from $< for x86
+    @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
+        python stdlib2cpp.py x86 > $@

README.rst (new file)

@@ -0,0 +1,90 @@
==============================
Intel(r) SPMD Program Compiler
==============================
``ispc`` is a compiler for a variant of the C programming language, with
extensions for `single program, multiple data
<http://en.wikipedia.org/wiki/SPMD>`_ programming. Under the SPMD model,
the programmer writes a program that generally appears to be a regular
serial program, though the execution model is actually that a number of
*program instances* execute in parallel on the hardware.
Overview
--------
``ispc`` compiles a C-based SPMD programming language to run on the SIMD
units of CPUs; it frequently provides a 3x or more speedup on CPUs with
4-wide vector SSE units and 5x-6x on CPUs with 8-wide AVX vector units,
without any of the difficulty of writing intrinsics code. Parallelization
across multiple cores is also supported by ``ispc``, making it
possible to write programs that achieve performance improvement that scales
by both number of cores and vector unit size.
There are a few key principles in the design of ``ispc``:
* To build a small set of extensions to the C language that
would deliver excellent performance to performance-oriented
programmers who want to run SPMD programs on the CPU.
* To provide a thin abstraction layer between the programmer
and the hardware--in particular, to have an execution and
data model where the programmer can cleanly reason about the
mapping of their source program to compiled assembly language
and the underlying hardware.
* To make it possible to harness the computational power of SIMD
vector units without the extremely low-programmer-productivity
activity of directly writing intrinsics.
* To explore opportunities from close coupling between C/C++
application code and SPMD ``ispc`` code running on the
same processor--to have lightweight function calls between
the two languages and to share data directly via pointers without
copying or reformatting.
``ispc`` is an open source compiler with the BSD license. It uses the
remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
code generation and optimization and is `hosted on
github <http://github.com/ispc/ispc/>`_. It supports Windows, Mac, and
Linux, with both x86 and x86-64 targets. It currently supports the SSE2,
SSE4, AVX1, and AVX2 instruction sets.
Features
--------
``ispc`` provides a number of key features to developers:
* Familiarity as an extension of the C programming
language: ``ispc`` supports familiar C syntax and
programming idioms, while adding the ability to write SPMD
programs.
* High-quality SIMD code generation: the performance
of code generated by ``ispc`` is often close to that of
hand-written intrinsics code.
* Ease of adoption with existing software
systems: functions written in ``ispc`` directly
interoperate with application functions written in C/C++ and
with application data structures.
* Portability across over a decade of CPU
generations: ``ispc`` has targets for SSE2, SSE4, AVX
(and soon, AVX2).
* Portability across operating systems: Microsoft
Windows, Mac OS X, and Linux are all supported
by ``ispc``.
* Debugging with standard tools: ``ispc``
programs can be debugged with standard debuggers (OS X and
Linux only).
Additional Resources
--------------------
Prebuilt ``ispc`` binaries for Windows, OS X and Linux can be downloaded
from the `ispc downloads page <http://ispc.github.com/downloads.html>`_.
See also additional
`documentation <http://ispc.github.com/documentation.html>`_ and additional
`performance information <http://ispc.github.com/perf.html>`_.

(deleted file)

@@ -1,22 +0,0 @@
==============================
Intel(r) SPMD Program Compiler
==============================
Welcome to the Intel(r) SPMD Program Compiler (ispc)!
ispc is a new compiler for "single program, multiple data" (SPMD)
programs. Under the SPMD model, the programmer writes a program that mostly
appears to be a regular serial program, though the execution model is
actually that a number of program instances execute in parallel on the
hardware. ispc compiles a C-based SPMD programming language to run on the
SIMD units of CPUs; it frequently provides a 3x or more speedup on CPUs
with 4-wide SSE units, without any of the difficulty of writing intrinsics
code.
ispc is an open source compiler under the BSD license; see the file
LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and
x86-64 targets. It currently supports the SSE2, SSE4, and AVX instruction
sets.
For more information and examples, as well as a wiki and the bug database,
see the ispc distribution site, http://ispc.github.com.

ast.cpp (new file)

@@ -0,0 +1,471 @@
/*
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file ast.cpp
@brief General functionality related to abstract syntax trees and
traversal of them.
*/
#include "ast.h"
#include "expr.h"
#include "func.h"
#include "stmt.h"
#include "sym.h"
#include "util.h"
///////////////////////////////////////////////////////////////////////////
// ASTNode
ASTNode::~ASTNode() {
}
///////////////////////////////////////////////////////////////////////////
// AST
void
AST::AddFunction(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code) {
if (sym == NULL)
return;
functions.push_back(new Function(sym, args, code));
}
void
AST::GenerateIR() {
for (unsigned int i = 0; i < functions.size(); ++i)
functions[i]->GenerateIR();
}
///////////////////////////////////////////////////////////////////////////
ASTNode *
WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
void *data) {
if (node == NULL)
return node;
// Call the callback function
if (preFunc != NULL) {
if (preFunc(node, data) == false)
// The function asked us to not continue recursively, so stop.
return node;
}
////////////////////////////////////////////////////////////////////////////
// Handle Statements
if (dynamic_cast<Stmt *>(node) != NULL) {
ExprStmt *es;
DeclStmt *ds;
IfStmt *is;
DoStmt *dos;
ForStmt *fs;
ForeachStmt *fes;
CaseStmt *cs;
DefaultStmt *defs;
SwitchStmt *ss;
ReturnStmt *rs;
LabeledStmt *ls;
StmtList *sl;
PrintStmt *ps;
AssertStmt *as;
DeleteStmt *dels;
if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
else if ((ds = dynamic_cast<DeclStmt *>(node)) != NULL) {
for (unsigned int i = 0; i < ds->vars.size(); ++i)
ds->vars[i].init = (Expr *)WalkAST(ds->vars[i].init, preFunc,
postFunc, data);
}
else if ((is = dynamic_cast<IfStmt *>(node)) != NULL) {
is->test = (Expr *)WalkAST(is->test, preFunc, postFunc, data);
is->trueStmts = (Stmt *)WalkAST(is->trueStmts, preFunc,
postFunc, data);
is->falseStmts = (Stmt *)WalkAST(is->falseStmts, preFunc,
postFunc, data);
}
else if ((dos = dynamic_cast<DoStmt *>(node)) != NULL) {
dos->testExpr = (Expr *)WalkAST(dos->testExpr, preFunc,
postFunc, data);
dos->bodyStmts = (Stmt *)WalkAST(dos->bodyStmts, preFunc,
postFunc, data);
}
else if ((fs = dynamic_cast<ForStmt *>(node)) != NULL) {
fs->init = (Stmt *)WalkAST(fs->init, preFunc, postFunc, data);
fs->test = (Expr *)WalkAST(fs->test, preFunc, postFunc, data);
fs->step = (Stmt *)WalkAST(fs->step, preFunc, postFunc, data);
fs->stmts = (Stmt *)WalkAST(fs->stmts, preFunc, postFunc, data);
}
else if ((fes = dynamic_cast<ForeachStmt *>(node)) != NULL) {
for (unsigned int i = 0; i < fes->startExprs.size(); ++i)
fes->startExprs[i] = (Expr *)WalkAST(fes->startExprs[i], preFunc,
postFunc, data);
for (unsigned int i = 0; i < fes->endExprs.size(); ++i)
fes->endExprs[i] = (Expr *)WalkAST(fes->endExprs[i], preFunc,
postFunc, data);
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
}
else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
else if ((ss = dynamic_cast<SwitchStmt *>(node)) != NULL) {
ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
}
else if (dynamic_cast<BreakStmt *>(node) != NULL ||
dynamic_cast<ContinueStmt *>(node) != NULL ||
dynamic_cast<GotoStmt *>(node) != NULL) {
// nothing
}
else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
rs->expr = (Expr *)WalkAST(rs->expr, preFunc, postFunc, data);
else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
std::vector<Stmt *> &sls = sl->stmts;
for (unsigned int i = 0; i < sls.size(); ++i)
sls[i] = (Stmt *)WalkAST(sls[i], preFunc, postFunc, data);
}
else if ((ps = dynamic_cast<PrintStmt *>(node)) != NULL)
ps->values = (Expr *)WalkAST(ps->values, preFunc, postFunc, data);
else if ((as = dynamic_cast<AssertStmt *>(node)) != NULL)
as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
else
FATAL("Unhandled statement type in WalkAST()");
}
else {
///////////////////////////////////////////////////////////////////////////
// Handle expressions
Assert(dynamic_cast<Expr *>(node) != NULL);
UnaryExpr *ue;
BinaryExpr *be;
AssignExpr *ae;
SelectExpr *se;
ExprList *el;
FunctionCallExpr *fce;
IndexExpr *ie;
MemberExpr *me;
TypeCastExpr *tce;
ReferenceExpr *re;
PtrDerefExpr *ptrderef;
RefDerefExpr *refderef;
SizeOfExpr *soe;
AddressOfExpr *aoe;
NewExpr *newe;
if ((ue = dynamic_cast<UnaryExpr *>(node)) != NULL)
ue->expr = (Expr *)WalkAST(ue->expr, preFunc, postFunc, data);
else if ((be = dynamic_cast<BinaryExpr *>(node)) != NULL) {
be->arg0 = (Expr *)WalkAST(be->arg0, preFunc, postFunc, data);
be->arg1 = (Expr *)WalkAST(be->arg1, preFunc, postFunc, data);
}
else if ((ae = dynamic_cast<AssignExpr *>(node)) != NULL) {
ae->lvalue = (Expr *)WalkAST(ae->lvalue, preFunc, postFunc, data);
ae->rvalue = (Expr *)WalkAST(ae->rvalue, preFunc, postFunc, data);
}
else if ((se = dynamic_cast<SelectExpr *>(node)) != NULL) {
se->test = (Expr *)WalkAST(se->test, preFunc, postFunc, data);
se->expr1 = (Expr *)WalkAST(se->expr1, preFunc, postFunc, data);
se->expr2 = (Expr *)WalkAST(se->expr2, preFunc, postFunc, data);
}
else if ((el = dynamic_cast<ExprList *>(node)) != NULL) {
for (unsigned int i = 0; i < el->exprs.size(); ++i)
el->exprs[i] = (Expr *)WalkAST(el->exprs[i], preFunc,
postFunc, data);
}
else if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
fce->func = (Expr *)WalkAST(fce->func, preFunc, postFunc, data);
fce->args = (ExprList *)WalkAST(fce->args, preFunc, postFunc, data);
fce->launchCountExpr = (Expr *)WalkAST(fce->launchCountExpr, preFunc,
postFunc, data);
}
else if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL) {
ie->baseExpr = (Expr *)WalkAST(ie->baseExpr, preFunc, postFunc, data);
ie->index = (Expr *)WalkAST(ie->index, preFunc, postFunc, data);
}
else if ((me = dynamic_cast<MemberExpr *>(node)) != NULL)
me->expr = (Expr *)WalkAST(me->expr, preFunc, postFunc, data);
else if ((tce = dynamic_cast<TypeCastExpr *>(node)) != NULL)
tce->expr = (Expr *)WalkAST(tce->expr, preFunc, postFunc, data);
else if ((re = dynamic_cast<ReferenceExpr *>(node)) != NULL)
re->expr = (Expr *)WalkAST(re->expr, preFunc, postFunc, data);
else if ((ptrderef = dynamic_cast<PtrDerefExpr *>(node)) != NULL)
ptrderef->expr = (Expr *)WalkAST(ptrderef->expr, preFunc, postFunc,
data);
else if ((refderef = dynamic_cast<RefDerefExpr *>(node)) != NULL)
refderef->expr = (Expr *)WalkAST(refderef->expr, preFunc, postFunc,
data);
else if ((soe = dynamic_cast<SizeOfExpr *>(node)) != NULL)
soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
aoe->expr = (Expr *)WalkAST(aoe->expr, preFunc, postFunc, data);
else if ((newe = dynamic_cast<NewExpr *>(node)) != NULL) {
newe->countExpr = (Expr *)WalkAST(newe->countExpr, preFunc,
postFunc, data);
newe->initExpr = (Expr *)WalkAST(newe->initExpr, preFunc,
postFunc, data);
}
else if (dynamic_cast<SymbolExpr *>(node) != NULL ||
dynamic_cast<ConstExpr *>(node) != NULL ||
dynamic_cast<FunctionSymbolExpr *>(node) != NULL ||
dynamic_cast<SyncExpr *>(node) != NULL ||
dynamic_cast<NullPointerExpr *>(node) != NULL) {
// nothing to do
}
else
FATAL("Unhandled expression type in WalkAST().");
}
// Call the callback function
if (postFunc != NULL)
return postFunc(node, data);
else
return node;
}
static ASTNode *
lOptimizeNode(ASTNode *node, void *) {
return node->Optimize();
}
ASTNode *
Optimize(ASTNode *root) {
return WalkAST(root, NULL, lOptimizeNode, NULL);
}
Expr *
Optimize(Expr *expr) {
return (Expr *)Optimize((ASTNode *)expr);
}
Stmt *
Optimize(Stmt *stmt) {
return (Stmt *)Optimize((ASTNode *)stmt);
}
static ASTNode *
lTypeCheckNode(ASTNode *node, void *) {
return node->TypeCheck();
}
ASTNode *
TypeCheck(ASTNode *root) {
return WalkAST(root, NULL, lTypeCheckNode, NULL);
}
Expr *
TypeCheck(Expr *expr) {
return (Expr *)TypeCheck((ASTNode *)expr);
}
Stmt *
TypeCheck(Stmt *stmt) {
return (Stmt *)TypeCheck((ASTNode *)stmt);
}
struct CostData {
CostData() { cost = foreachDepth = 0; }
int cost;
int foreachDepth;
};
static bool
lCostCallbackPre(ASTNode *node, void *d) {
CostData *data = (CostData *)d;
if (dynamic_cast<ForeachStmt *>(node) != NULL)
++data->foreachDepth;
if (data->foreachDepth == 0)
data->cost += node->EstimateCost();
return true;
}
static ASTNode *
lCostCallbackPost(ASTNode *node, void *d) {
CostData *data = (CostData *)d;
if (dynamic_cast<ForeachStmt *>(node) != NULL)
--data->foreachDepth;
return node;
}
int
EstimateCost(ASTNode *root) {
CostData data;
WalkAST(root, lCostCallbackPre, lCostCallbackPost, &data);
return data.cost;
}
/** Given an AST node, check to see if it's safe if we happen to run the
code for that node with the execution mask all off.
*/
static bool
lCheckAllOffSafety(ASTNode *node, void *data) {
bool *okPtr = (bool *)data;
FunctionCallExpr *fce;
if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
if (fce->func == NULL)
return false;
const Type *type = fce->func->GetType();
const PointerType *pt = dynamic_cast<const PointerType *>(type);
if (pt != NULL)
type = pt->GetBaseType();
const FunctionType *ftype = dynamic_cast<const FunctionType *>(type);
Assert(ftype != NULL);
if (ftype->isSafe == false) {
*okPtr = false;
return false;
}
}
if (dynamic_cast<AssertStmt *>(node) != NULL) {
// While it's fine to run the assert for varying tests, it's not
// desirable to check an assert on a uniform variable if all of the
// lanes are off.
*okPtr = false;
return false;
}
if (dynamic_cast<NewExpr *>(node) != NULL ||
dynamic_cast<DeleteStmt *>(node) != NULL) {
// We definitely don't want to run the uniform variants of these if
// the mask is all off. It's also worth skipping the overhead of
// executing the varying versions of them in the all-off mask case.
*okPtr = false;
return false;
}
if (dynamic_cast<ForeachStmt *>(node) != NULL) {
// foreach() statements also shouldn't be run with an all-off mask.
// Since they re-establish an 'all on' mask, this would be pretty
// unintuitive. (More generally, it's possibly a little strange to
// allow foreach() in the presence of any non-uniform control
// flow...)
*okPtr = false;
return false;
}
if (g->target.allOffMaskIsSafe == true)
// Don't worry about memory accesses if we have a target that can
// safely run them with the mask all off
return true;
IndexExpr *ie;
if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
const Type *type = ie->baseExpr->GetType();
if (type == NULL)
return true;
if (dynamic_cast<const ReferenceType *>(type) != NULL)
type = type->GetReferenceTarget();
ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
if (ce == NULL) {
// indexing with a variable... -> not safe
*okPtr = false;
return false;
}
const PointerType *pointerType =
dynamic_cast<const PointerType *>(type);
if (pointerType != NULL) {
// pointer[index] -> can't be sure -> not safe
*okPtr = false;
return false;
}
const SequentialType *seqType =
dynamic_cast<const SequentialType *>(type);
Assert(seqType != NULL);
int nElements = seqType->GetElementCount();
if (nElements == 0) {
// Unsized array, so we can't be sure -> not safe
*okPtr = false;
return false;
}
int32_t indices[ISPC_MAX_NVEC];
int count = ce->AsInt32(indices);
for (int i = 0; i < count; ++i) {
if (indices[i] < 0 || indices[i] >= nElements) {
// Index is out of bounds -> not safe
*okPtr = false;
return false;
}
}
// All indices are in-bounds
return true;
}
MemberExpr *me;
if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
me->dereferenceExpr) {
*okPtr = false;
return false;
}
if (dynamic_cast<PtrDerefExpr *>(node) != NULL) {
*okPtr = false;
return false;
}
return true;
}
bool
SafeToRunWithMaskAllOff(ASTNode *root) {
bool safe = true;
WalkAST(root, lCheckAllOffSafety, NULL, &safe);
return safe;
}
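The pre/post callback pattern that EstimateCost() and SafeToRunWithMaskAllOff() use above generalizes to other one-off analyses. As a hedged sketch (the function names here are hypothetical and not part of ispc), a walk that counts function-call nodes could be written as:

// Hypothetical custom analysis built on WalkAST(): count FunctionCallExpr
// nodes. Only a preorder callback is needed; nodes are left unchanged.
static bool
lCountCallsPre(ASTNode *node, void *d) {
    int *count = (int *)d;
    if (dynamic_cast<FunctionCallExpr *>(node) != NULL)
        ++*count;
    return true;  // keep visiting this node's children
}

int
CountFunctionCalls(ASTNode *root) {
    int count = 0;
    WalkAST(root, lCountCallsPre, NULL, &count);
    return count;
}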

151
ast.h Normal file

@@ -0,0 +1,151 @@
/*
Copyright (c) 2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file ast.h
    @brief Declarations related to abstract syntax trees and their traversal.
*/
#ifndef ISPC_AST_H
#define ISPC_AST_H 1
#include "ispc.h"
#include <vector>
/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
This class defines a basic interface that all abstract syntax tree
(AST) nodes must implement. The base classes for both expressions
(Expr) and statements (Stmt) inherit from this class.
*/
class ASTNode {
public:
ASTNode(SourcePos p) : pos(p) { }
virtual ~ASTNode();
/** The Optimize() method should perform any appropriate early-stage
optimizations on the node (e.g. constant folding). This method
will be called after the node's children have already been
optimized, and the caller will store the returned ASTNode * in
place of the original node. This method should return NULL if an
error is encountered during optimization. */
virtual ASTNode *Optimize() = 0;
/** Type checking should be performed by the node when this method is
called. In the event of an error, a NULL value may be returned.
As with ASTNode::Optimize(), the caller should store the returned
pointer in place of the original ASTNode *. */
virtual ASTNode *TypeCheck() = 0;
/** Estimate the execution cost of the node (not including the cost of
its children). The value returned should be based on the COST_*
enumerant values defined in ispc.h. */
virtual int EstimateCost() const = 0;
/** All AST nodes must track the file position where they are
defined. */
SourcePos pos;
};
/** Simple representation of the abstract syntax trees for all of the
functions declared in a compilation unit.
*/
class AST {
public:
/** Add the AST for a function described by the given declaration
information and source code. */
void AddFunction(Symbol *sym, const std::vector<Symbol *> &args,
Stmt *code);
/** Generate LLVM IR for all of the functions into the current
module. */
void GenerateIR();
private:
std::vector<Function *> functions;
};
/** Callback function type for the preorder visiting function used in
the AST walk.
*/
typedef bool (* ASTPreCallBackFunc)(ASTNode *node, void *data);
/** Callback function type for the postorder visiting function used in
the AST walk.
*/
typedef ASTNode * (* ASTPostCallBackFunc)(ASTNode *node, void *data);
/** Walk (some portion of) an AST, starting from the given root node. At
each node, if preFunc is non-NULL, it is called with the node and the
given void *data pointer; if preFunc returns false, the node's
children are not visited. Otherwise, WalkAST() is called recursively
to process the node's children; after doing so, postFunc (if
non-NULL) is called at the node, and its return value is returned
from WalkAST(), so callers typically store it in place of the
original node. */
extern ASTNode *WalkAST(ASTNode *root, ASTPreCallBackFunc preFunc,
ASTPostCallBackFunc postFunc, void *data);
/** Perform simple optimizations on the AST or portion thereof passed to
this function, returning the resulting AST. */
extern ASTNode *Optimize(ASTNode *root);
/** Convenience version of Optimize() for Expr *s that returns an Expr *
(rather than an ASTNode *, which would require the caller to cast back
to an Expr *). */
extern Expr *Optimize(Expr *);
/** Convenience version of Optimize() for Stmt *s that returns a Stmt *
(rather than an ASTNode *, which would require the caller to cast back
to a Stmt *). */
extern Stmt *Optimize(Stmt *);
/** Perform type-checking on the given AST (or portion of one), returning a
pointer to the root of the resulting AST. */
extern ASTNode *TypeCheck(ASTNode *root);
/** Convenience version of TypeCheck() for Expr *s that returns an Expr *. */
extern Expr *TypeCheck(Expr *);
/** Convenience version of TypeCheck() for Stmt *s that returns a Stmt *. */
extern Stmt *TypeCheck(Stmt *);
/** Returns an estimate of the execution cost of the tree starting at
the given root. */
extern int EstimateCost(ASTNode *root);
/** Returns true if it would be safe to run the given code with an "all
off" mask. */
extern bool SafeToRunWithMaskAllOff(ASTNode *root);
#endif // ISPC_AST_H
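The comments above spell out a contract for TypeCheck() and Optimize(): the caller must store the returned pointer in place of the original node and treat a NULL return as an error. A minimal sketch of a conforming caller (the surrounding function is hypothetical):

// Hypothetical caller following the documented contract: replace the node
// with each call's return value and stop if an error (NULL) comes back.
Expr *ProcessExpr(Expr *expr) {
    expr = TypeCheck(expr);
    if (expr == NULL)
        return NULL;
    expr = Optimize(expr);
    return expr;  // may also be NULL if optimization reported an error
}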


@@ -11,7 +11,10 @@ length=0
 src=str(sys.argv[1])
-target = re.sub(".*builtins-", "", src)
+target = re.sub("builtins/target-", "", src)
+target = re.sub(r"builtins\\target-", "", target)
+target = re.sub("builtins/", "", target)
+target = re.sub(r"builtins\\", "", target)
 target = re.sub("\.ll$", "", target)
 target = re.sub("\.c$", "", target)
 target = re.sub("-", "_", target)
@@ -23,17 +26,21 @@ if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT")
 try:
     as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
 except IOError:
-    print >> sys.stderr, "Couldn't open " + src
+    sys.stderr.write("Couldn't open " + src)
     sys.exit(1)
-print "unsigned char builtins_bitcode_" + target + "[] = {"
-for line in as_out.stdout.readlines():
-    length = length + len(line)
-    for c in line:
-        print ord(c)
-        print ", "
-print " 0 };\n\n"
-print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"
+width = 16;
+sys.stdout.write("unsigned char builtins_bitcode_" + target + "[] = {\n")
+data = as_out.stdout.read()
+for i in range(0, len(data), 1):
+    sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))
+    if i%width == (width-1):
+        sys.stdout.write("\n")
+sys.stdout.write("0x00 };\n\n")
+sys.stdout.write("int builtins_bitcode_" + target + "_length = " + str(i+1) + ";\n")
 as_out.wait()
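For reference, the rewritten script emits the assembled bitcode as a C array, sixteen bytes per row. The generated file looks roughly like this (the target name and all byte values after the bitcode magic are illustrative):

/* Illustrative output only; real files contain the full llvm-as bitcode. */
unsigned char builtins_bitcode_sse4[] = {
0x42, 0x43, 0xC0, 0xDE, 0x21, 0x0C, 0x00, 0x00, 0x95, 0x0B, 0x00, 0x00, 0x0B, 0x82, 0x20, 0x00,
0x00 };

int builtins_bitcode_sse4_length = 16;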

15
buildall.bat Normal file

@@ -0,0 +1,15 @@
@echo off
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
REM it can be set here.
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
REM Both the LLVM binaries and python need to be in the path
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild
msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild
msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Debug /t:rebuild

11
buildispc.bat Normal file

@@ -0,0 +1,11 @@
@echo off
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
REM it can be set here.
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
set LLVM_VERSION=3.1svn
REM Both the LLVM binaries and python need to be in the path
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release


@@ -1,417 +0,0 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;; This file declares implementations of various stdlib builtins that
;; only require SSE version 1 and 2 functionality; this file, in turn
;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide
;; those definitions for them.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
int64minmax(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <4 x float> %0, %call
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <4 x float> %call, %two_minus
ret <4 x float> %iv_mul
}
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; do the rcpss call
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration to improve precision, as above
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <4 x float> %v, %is
%v_is_is = fmul <4 x float> %v_is, %is
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <4 x float> %is, %three_sub
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <4 x float> %half_scale
}
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
; uniform float is = extract(__rsqrt_u(v), 0);
%v = insertelement <4 x float> undef, float %0, i32 0
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
%is = extractelement <4 x float> %vis, i32 0
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul float %0, %is
%v_is_is = fmul float %v_is, %is
%three_sub = fsub float 3., %v_is_is
%is_mul = fmul float %is, %three_sub
%half_scale = fmul float 0.5, %is_mul
ret float %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
ret <4 x float> %call
}
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
define internal void @__fastmath() nounwind alwaysinline {
%ptr = alloca i32
%ptr8 = bitcast i32 * %ptr to i8 *
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
%oldval = load i32 *%ptr
; turn on DAZ (64)/FTZ (32768) -> 32832
%update = or i32 %oldval, 32832
store i32 %update, i32 *%ptr
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}
define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}
define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
ret float %ret
}
define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <4 x double> %ret
}
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <4 x double> %ret
}
define internal double @__min_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <4 x double> %ret
}
define internal double @__max_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
}
define internal float @__reduce_min_float(<4 x float>) nounwind readnone {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}
define internal float @__reduce_max_float(<4 x float>) nounwind readnone {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}
define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%m1 = add <4 x i32> %v1, %v
%m1a = extractelement <4 x i32> %m1, i32 0
%m1b = extractelement <4 x i32> %m1, i32 1
%sum = add i32 %m1a, %m1b
ret i32 %sum
}
define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}
define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
ret i32 %r
}
define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 2, i32 3>
%sum = fadd <2 x double> %v0, %v1
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%sum = add <2 x i64> %v0, %v1
%e0 = extractelement <2 x i64> %sum, i32 0
%e1 = extractelement <2 x i64> %sum, i32 1
%m = add i64 %e0, %e1
ret i64 %m
}
define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}
define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}
define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
load_masked(4, i8, 8, 1)
load_masked(4, i16, 16, 2)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)


@@ -1,357 +0,0 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Define the standard library builtins for the SSE2 target
; Define some basics for a 4-wide target
stdlib_core(4)
packed_load_and_store(4)
scans(4)
; Include the various definitions of things that only require SSE1 and SSE2
include(`builtins-sse.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; There are not any rounding instructions in SSE2, so we have to emulate
;; the functionality with multiple instructions...
; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
; unsigned int sign = signbits(x);
; unsigned int ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; x += 0x1.0p23f;
; x -= 0x1.0p23f;
; ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; return x;
;}
define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
%bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%bitop.i = xor <4 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <4 x i32> %bitop.i to <4 x float>
%binop.i = fadd <4 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
%binop21.i = fadd <4 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <4 x float> %binop21.i to <4 x i32>
%bitop31.i = xor <4 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop31.i to <4 x float>
ret <4 x float> %int_to_float_bitcast.i.i.i
}
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
%binop21.i = fadd float %binop.i, -8.388608e+06
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
ret float %int_to_float_bitcast.i.i.i
}
;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...
;export float Floor(float x) {
; float y = Round(x);
; unsigned int cmp = y > x ? 0xffffffff : 0;
; float delta = -1.f;
; unsigned int idelta = intbits(delta);
; idelta &= cmp;
; delta = floatbits(idelta);
; return y + delta;
;}
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
%bincmp.i = fcmp ogt <4 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
%bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
%binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <4 x float> %binop.i
}
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp ogt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, -1082130432
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
;; And here is the code we compiled to get the __ceil* functions below
;
;export uniform float Ceil(uniform float x) {
; uniform float y = Round(x);
; uniform int yltx = y < x ? 0xffffffff : 0;
; uniform float delta = 1.f;
; uniform int idelta = intbits(delta);
; idelta &= yltx;
; delta = floatbits(idelta);
; return y + delta;
;}
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
%bincmp.i = fcmp olt <4 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
%bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
%binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <4 x float> %binop.i
}
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp olt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, 1065353216
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare double @round(double)
declare double @floor(double)
declare double @ceil(double)
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @round)
}
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @round(double %0)
ret double %r
}
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @floor)
}
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @floor(double %0)
ret double %r
}
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @ceil)
}
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @ceil(double %0)
ret double %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
; There is no blend instruction with SSE2, so we simulate it with bit
; operations on i32s. For these two vselect functions, for each
; vector element, if the mask is on, we return the corresponding value
; from %1, and otherwise return the value from %0.
define internal <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
<4 x i32> %mask) nounwind readnone alwaysinline {
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
%cleared_old = and <4 x i32> %0, %notmask
%masked_new = and <4 x i32> %1, %mask
%new = or <4 x i32> %cleared_old, %masked_new
ret <4 x i32> %new
}
define internal <4 x float> @__vselect_float(<4 x float>, <4 x float>,
<4 x i32> %mask) nounwind readnone alwaysinline {
%v0 = bitcast <4 x float> %0 to <4 x i32>
%v1 = bitcast <4 x float> %1 to <4 x i32>
%r = call <4 x i32> @__vselect_i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %mask)
%rf = bitcast <4 x i32> %r to <4 x float>
ret <4 x float> %rf
}
; To do vector integer min and max, we do the vector compare and then sign
; extend the i1 vector result to an i32 mask. The __vselect does the
; rest...
define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%c = icmp slt <4 x i32> %0, %1
%mask = sext <4 x i1> %c to <4 x i32>
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
ret <4 x i32> %v
}
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp slt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%c = icmp sgt <4 x i32> %0, %1
%mask = sext <4 x i1> %c to <4 x i32>
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
ret <4 x i32> %v
}
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp sgt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
; The functions for unsigned ints are similar, just with unsigned
; comparison functions...
define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%c = icmp ult <4 x i32> %0, %1
%mask = sext <4 x i1> %c to <4 x i32>
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
ret <4 x i32> %v
}
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ult i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%c = icmp ugt <4 x i32> %0, %1
%mask = sext <4 x i1> %c to <4 x i32>
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
ret <4 x i32> %v
}
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ugt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%val = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %val
}
define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
%val = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %val
}
define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%m1 = fadd <4 x float> %v1, %v
%m1a = extractelement <4 x float> %m1, i32 0
%m1b = extractelement <4 x float> %m1, i32 1
%sum = fadd float %m1a, %m1b
ret float %sum
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
%val = load <4 x i32> * %0, align 4
%newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
store <4 x i32> %newval, <4 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
; are actually bitcast <2 x i64> values
;
; set up the first two 64-bit values
%old01 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%old01f = bitcast <2 x i64> %old01 to <4 x float>
%new01 = shufflevector <4 x i64> %new, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%new01f = bitcast <2 x i64> %new01 to <4 x float>
; compute mask--note that the indices 0 and 1 are doubled-up
%mask01 = shufflevector <4 x i32> %mask, <4 x i32> undef,
<4 x i32> <i32 0, i32 0, i32 1, i32 1>
; and blend the two of the values
%result01f = call <4 x float> @__vselect_float(<4 x float> %old01f, <4 x float> %new01f, <4 x i32> %mask01)
%result01 = bitcast <4 x float> %result01f to <2 x i64>
; and again
%old23 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%old23f = bitcast <2 x i64> %old23 to <4 x float>
%new23 = shufflevector <4 x i64> %new, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%new23f = bitcast <2 x i64> %new23 to <4 x float>
; compute mask--note that the values 2 and 3 are doubled-up
%mask23 = shufflevector <4 x i32> %mask, <4 x i32> undef,
<4 x i32> <i32 2, i32 2, i32 3, i32 3>
; and blend the two of the values
%result23f = call <4 x float> @__vselect_float(<4 x float> %old23f, <4 x float> %new23f, <4 x i32> %mask23)
%result23 = bitcast <4 x float> %result23f to <2 x i64>
; reconstruct the final <4 x i64> vector
%final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x i64> %final, <4 x i64> * %ptr, align 8
ret void
}


@@ -1,300 +0,0 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Define common 4-wide stuff
stdlib_core(4)
packed_load_and_store(4)
scans(4)
; Define the stuff that can be done with base SSE1/SSE2 instructions
include(`builtins-sse.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
ret <4 x float> %call
}
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
; the roundss intrinsic is a total mess--docs say:
;
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
;
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
; on b0. The higher order 96 bits are copied directly from input parameter a. The
; return value is described by the following equations:
;
; r0 = RND(b0)
; r1 = a1
; r2 = a2
; r3 = a3
;
; It doesn't matter what we pass as a, since we only need the r0 value
; here. So we pass the same register for both. Further, only the 0th
; element of the b parameter matters
%xi = insertelement <4 x float> undef, float %0, i32 0
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
ret <4 x float> %call
}
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss intrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0
; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
ret <4 x float> %call
}
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss intrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0
; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
round2to4double(%0, 8)
}
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
%xi = insertelement <2 x double> undef, double %0, i32 0
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round2to4double(%0, 9)
}
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss intrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round2to4double(%0, 10)
}
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss intrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret i32 %ret
}
define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret i32 %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
ret i32 %ret
}
define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret i32 %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
}
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
%scalar = extractelement <4 x float> %v2, i32 0
ret float %scalar
}
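;; For readers more comfortable with C intrinsics than LLVM IR, the reduction
;; above corresponds roughly to the following sketch (hypothetical helper,
;; assuming SSE3 and <pmmintrin.h>); each haddps halves the number of distinct
;; partial sums until every lane holds the full total:
;;
;; #include <pmmintrin.h>
;;
;; static float reduce_add_float(__m128 v) {
;;     __m128 t = _mm_hadd_ps(v, v);   /* (v0+v1, v2+v3, v0+v1, v2+v3) */
;;     t = _mm_hadd_ps(t, t);          /* every lane now holds v0+v1+v2+v3 */
;;     return _mm_cvtss_f32(t);        /* extract lane 0 */
;; }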
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
%oldValue = load <4 x i32>* %0, align 4
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
%newAsFloat = bitcast <4 x i32> %1 to <4 x float>
%blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
<4 x float> %newAsFloat,
<4 x float> %mask_as_float)
%blendAsInt = bitcast <4 x float> %blend to <4 x i32>
store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
ret void
}
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
%mask = bitcast <4 x i32> %i32mask to <4 x float>
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
; are actually bitcast <2 x i64> values
;
; set up the first two 64-bit values
%old01 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%old01f = bitcast <2 x i64> %old01 to <4 x float>
%new01 = shufflevector <4 x i64> %new, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%new01f = bitcast <2 x i64> %new01 to <4 x float>
; compute mask--note that the indices 0 and 1 are doubled-up
%mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
<4 x i32> <i32 0, i32 0, i32 1, i32 1>
; and blend the two of the values
%result01f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old01f,
<4 x float> %new01f,
<4 x float> %mask01)
%result01 = bitcast <4 x float> %result01f to <2 x i64>
; and again
%old23 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%old23f = bitcast <2 x i64> %old23 to <4 x float>
%new23 = shufflevector <4 x i64> %new, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%new23f = bitcast <2 x i64> %new23 to <4 x float>
; compute mask--note that the values 2 and 3 are doubled-up
%mask23 = shufflevector <4 x float> %mask, <4 x float> undef,
<4 x i32> <i32 2, i32 2, i32 3, i32 3>
; and blend the two of the values
%result23f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old23f,
<4 x float> %new23f,
<4 x float> %mask23)
%result23 = bitcast <4 x float> %result23f to <2 x i64>
; reconstruct the final <4 x i64> vector
%final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x i64> %final, <4 x i64> * %ptr, align 8
ret void
}
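;; The doubled-index mask trick above can also be written with C intrinsics;
;; this is only an illustrative sketch (hypothetical helper, assuming SSE4.1
;; and <smmintrin.h>) of blending one <2 x i64> half under a 32-bit mask whose
;; lanes 0 and 1 have been duplicated so that each covers a full 64-bit element:
;;
;; #include <smmintrin.h>
;;
;; static __m128i blend_low_2xi64(__m128i oldv, __m128i newv, __m128 mask) {
;;     /* duplicate mask lanes 0 and 1 across the two 64-bit elements */
;;     __m128 mask01 = _mm_shuffle_ps(mask, mask, _MM_SHUFFLE(1, 1, 0, 0));
;;     __m128 blended = _mm_blendv_ps(_mm_castsi128_ps(oldv),
;;                                    _mm_castsi128_ps(newv), mask01);
;;     return _mm_castps_si128(blended);
;; }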


@@ -99,6 +99,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
         return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
     // varying
+    if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType &&
+        t == LLVMTypes::MaskType)
+        return AtomicType::VaryingBool;
     else if (t == LLVMTypes::Int8VectorType)
         return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
     else if (t == LLVMTypes::Int16VectorType)
@@ -114,59 +117,39 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
     // pointers to uniform
     else if (t == LLVMTypes::Int8PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
-                                 AtomicType::UniformInt8, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt8 :
+                                       AtomicType::UniformInt8);
     else if (t == LLVMTypes::Int16PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
-                                 AtomicType::UniformInt16, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt16 :
+                                       AtomicType::UniformInt16);
     else if (t == LLVMTypes::Int32PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
-                                 AtomicType::UniformInt32, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt32 :
+                                       AtomicType::UniformInt32);
     else if (t == LLVMTypes::Int64PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt64 :
-                                 AtomicType::UniformInt64, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt64 :
+                                       AtomicType::UniformInt64);
     else if (t == LLVMTypes::FloatPointerType)
-        return new ReferenceType(AtomicType::UniformFloat, false);
+        return PointerType::GetUniform(AtomicType::UniformFloat);
     else if (t == LLVMTypes::DoublePointerType)
-        return new ReferenceType(AtomicType::UniformDouble, false);
+        return PointerType::GetUniform(AtomicType::UniformDouble);
     // pointers to varying
     else if (t == LLVMTypes::Int8VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
-                                 AtomicType::VaryingInt8, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt8 :
+                                       AtomicType::VaryingInt8);
     else if (t == LLVMTypes::Int16VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
-                                 AtomicType::VaryingInt16, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt16 :
+                                       AtomicType::VaryingInt16);
     else if (t == LLVMTypes::Int32VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
-                                 AtomicType::VaryingInt32, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt32 :
+                                       AtomicType::VaryingInt32);
     else if (t == LLVMTypes::Int64VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt64 :
-                                 AtomicType::VaryingInt64, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt64 :
+                                       AtomicType::VaryingInt64);
     else if (t == LLVMTypes::FloatVectorPointerType)
-        return new ReferenceType(AtomicType::VaryingFloat, false);
+        return PointerType::GetUniform(AtomicType::VaryingFloat);
     else if (t == LLVMTypes::DoubleVectorPointerType)
-        return new ReferenceType(AtomicType::VaryingDouble, false);
+        return PointerType::GetUniform(AtomicType::VaryingDouble);
-    // arrays
-    else if (llvm::isa<const llvm::PointerType>(t)) {
-        const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);
-        // Is it a pointer to an unsized array of objects?  If so, then
-        // create the equivalent ispc type.  Note that it has to be a
-        // reference to an array, since ispc passes arrays to functions by
-        // reference.
-        const llvm::ArrayType *at =
-            llvm::dyn_cast<const llvm::ArrayType>(pt->getElementType());
-        if (at != NULL) {
-            const Type *eltType = lLLVMTypeToISPCType(at->getElementType(),
-                                                      intAsUnsigned);
-            if (eltType == NULL)
-                return NULL;
-            return new ReferenceType(new ArrayType(eltType, at->getNumElements()),
-                                     false);
-        }
-    }
     return NULL;
 }
@@ -181,11 +164,9 @@ lCreateSymbol(const std::string &name, const Type *returnType,
     noPos.name = "__stdlib";
     FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
-    // set NULL default arguments
-    std::vector<ConstExpr *> defaults;
-    for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
-        defaults.push_back(NULL);
-    funcType->SetArgumentDefaults(defaults);
+    Debug(noPos, "Created builtin symbol \"%s\" [%s]\n", name.c_str(),
+          funcType->GetString().c_str());
     Symbol *sym = new Symbol(name, noPos, funcType);
     sym->function = func;
@@ -208,20 +189,20 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
     if (name.size() < 3 || name[0] != '_' || name[1] != '_')
         return false;
+    Debug(SourcePos(), "Attempting to create ispc symbol for function \"%s\".",
+          name.c_str());
     // An unfortunate hack: we want this builtin function to have the
     // signature "int __sext_varying_bool(bool)", but the ispc function
     // symbol creation code below assumes that any LLVM vector of i32s is a
     // varying int32.  Here, we need that to be interpreted as a varying
     // bool, so just have a one-off override for that one...
-    if (name == "__sext_varying_bool") {
+    if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") {
         const Type *returnType = AtomicType::VaryingInt32;
         std::vector<const Type *> argTypes;
         argTypes.push_back(AtomicType::VaryingBool);
-        std::vector<ConstExpr *> defaults;
-        defaults.push_back(NULL);
         FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
-        funcType->SetArgumentDefaults(defaults);
         Symbol *sym = new Symbol(name, noPos, funcType);
         sym->function = func;
@@ -238,22 +219,27 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
         const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType(),
                                                      intAsUnsigned);
-        if (!returnType)
+        if (returnType == NULL) {
+            Debug(SourcePos(), "Failed: return type not representable for "
+                  "builtin %s.", name.c_str());
             // return type not representable in ispc -> not callable from ispc
             return false;
+        }
         // Iterate over the arguments and try to find their equivalent ispc
         // types.  Track if any of the arguments has an integer type.
-        bool anyIntArgs = false, anyReferenceArgs = false;
+        bool anyIntArgs = false;
         std::vector<const Type *> argTypes;
         for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
             const llvm::Type *llvmArgType = ftype->getParamType(j);
             const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
-            if (type == NULL)
+            if (type == NULL) {
+                Debug(SourcePos(), "Failed: type of parameter %d not "
+                      "representable for builtin %s", j, name.c_str());
                 return false;
+            }
             anyIntArgs |=
                 (Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
-            anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
             argTypes.push_back(type);
         }
@@ -261,19 +247,6 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
         // so that we get symbols for things with no integer types!
         if (i == 0 || anyIntArgs == true)
             lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
-        // If there are any reference types, also make a variant of the
-        // symbol that has them as const references.  This obviously
-        // doesn't make sense for many builtins, but we'll give the stdlib
-        // the option to call one if it needs one.
-        if (anyReferenceArgs == true) {
-            for (unsigned int j = 0; j < argTypes.size(); ++j) {
-                if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
-                    argTypes[j] = argTypes[j]->GetAsConstType();
-                lCreateSymbol(name + "_refsconst", returnType, argTypes,
-                              ftype, func, symbolTable);
-            }
-        }
     }
     return true;
@@ -287,7 +260,7 @@ static void
 lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
 #if 0
     // FIXME: handle globals?
-    assert(module->global_empty());
+    Assert(module->global_empty());
 #endif
     llvm::Module::iterator iter;
@@ -317,16 +290,299 @@ lCheckModuleIntrinsics(llvm::Module *module) {
         // check the llvm.x86.* intrinsics for now...
         if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
             llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
-            assert(id != 0);
+            Assert(id != 0);
             LLVM_TYPE_CONST llvm::Type *intrinsicType =
                 llvm::Intrinsic::getType(*g->ctx, id);
             intrinsicType = llvm::PointerType::get(intrinsicType, 0);
-            assert(func->getType() == intrinsicType);
+            Assert(func->getType() == intrinsicType);
         }
     }
 }
/** We'd like to have all of these functions declared as 'internal' in
their respective bitcode files so that if they aren't needed by the
user's program they are eliminated from the final output.  However, if
we do so, then they aren't brought in by the LinkModules() call below
since they aren't yet used by anything in the module they're being
linked with (in LLVM 3.1, at least).
Therefore, we don't declare them as internal when we first define them,
but instead mark them as internal after they've been linked in. This
is admittedly a kludge.
*/
static void
lSetInternalFunctions(llvm::Module *module) {
const char *names[] = {
"__add_float",
"__add_int32",
"__add_uniform_double",
"__add_uniform_int32",
"__add_uniform_int64",
"__add_varying_double",
"__add_varying_int32",
"__add_varying_int64",
"__aos_to_soa3_float",
"__aos_to_soa3_float16",
"__aos_to_soa3_float4",
"__aos_to_soa3_float8",
"__aos_to_soa3_int32",
"__aos_to_soa4_float",
"__aos_to_soa4_float16",
"__aos_to_soa4_float4",
"__aos_to_soa4_float8",
"__aos_to_soa4_int32",
"__atomic_add_int32_global",
"__atomic_add_int64_global",
"__atomic_add_uniform_int32_global",
"__atomic_add_uniform_int64_global",
"__atomic_and_int32_global",
"__atomic_and_int64_global",
"__atomic_and_uniform_int32_global",
"__atomic_and_uniform_int64_global",
"__atomic_compare_exchange_double_global",
"__atomic_compare_exchange_float_global",
"__atomic_compare_exchange_int32_global",
"__atomic_compare_exchange_int64_global",
"__atomic_compare_exchange_uniform_double_global",
"__atomic_compare_exchange_uniform_float_global",
"__atomic_compare_exchange_uniform_int32_global",
"__atomic_compare_exchange_uniform_int64_global",
"__atomic_max_uniform_int32_global",
"__atomic_max_uniform_int64_global",
"__atomic_min_uniform_int32_global",
"__atomic_min_uniform_int64_global",
"__atomic_or_int32_global",
"__atomic_or_int64_global",
"__atomic_or_uniform_int32_global",
"__atomic_or_uniform_int64_global",
"__atomic_sub_int32_global",
"__atomic_sub_int64_global",
"__atomic_sub_uniform_int32_global",
"__atomic_sub_uniform_int64_global",
"__atomic_swap_double_global",
"__atomic_swap_float_global",
"__atomic_swap_int32_global",
"__atomic_swap_int64_global",
"__atomic_swap_uniform_double_global",
"__atomic_swap_uniform_float_global",
"__atomic_swap_uniform_int32_global",
"__atomic_swap_uniform_int64_global",
"__atomic_umax_uniform_uint32_global",
"__atomic_umax_uniform_uint64_global",
"__atomic_umin_uniform_uint32_global",
"__atomic_umin_uniform_uint64_global",
"__atomic_xor_int32_global",
"__atomic_xor_int64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
"__broadcast_double",
"__broadcast_float",
"__broadcast_i16",
"__broadcast_i32",
"__broadcast_i64",
"__broadcast_i8",
"__ceil_uniform_double",
"__ceil_uniform_float",
"__ceil_varying_double",
"__ceil_varying_float",
"__clock",
"__count_trailing_zeros_i32",
"__count_trailing_zeros_i64",
"__count_leading_zeros_i32",
"__count_leading_zeros_i64",
"__delete_uniform",
"__delete_varying",
"__do_assert_uniform",
"__do_assert_varying",
"__do_print",
"__doublebits_uniform_int64",
"__doublebits_varying_int64",
"__exclusive_scan_add_double",
"__exclusive_scan_add_float",
"__exclusive_scan_add_i32",
"__exclusive_scan_add_i64",
"__exclusive_scan_and_i32",
"__exclusive_scan_and_i64",
"__exclusive_scan_or_i32",
"__exclusive_scan_or_i64",
"__extract_int16",
"__extract_int32",
"__extract_int64",
"__extract_int8",
"__fastmath",
"__float_to_half_uniform",
"__float_to_half_varying",
"__floatbits_uniform_int32",
"__floatbits_varying_int32",
"__floor_uniform_double",
"__floor_uniform_float",
"__floor_varying_double",
"__floor_varying_float",
"__half_to_float_uniform",
"__half_to_float_varying",
"__insert_int16",
"__insert_int32",
"__insert_int64",
"__insert_int8",
"__intbits_uniform_double",
"__intbits_uniform_float",
"__intbits_varying_double",
"__intbits_varying_float",
"__max_uniform_double",
"__max_uniform_float",
"__max_uniform_int32",
"__max_uniform_int64",
"__max_uniform_uint32",
"__max_uniform_uint64",
"__max_varying_double",
"__max_varying_float",
"__max_varying_int32",
"__max_varying_int64",
"__max_varying_uint32",
"__max_varying_uint64",
"__memory_barrier",
"__memcpy32",
"__memcpy64",
"__memmove32",
"__memmove64",
"__memset32",
"__memset64",
"__min_uniform_double",
"__min_uniform_float",
"__min_uniform_int32",
"__min_uniform_int64",
"__min_uniform_uint32",
"__min_uniform_uint64",
"__min_varying_double",
"__min_varying_float",
"__min_varying_int32",
"__min_varying_int64",
"__min_varying_uint32",
"__min_varying_uint64",
"__movmsk",
"__new_uniform",
"__new_varying32",
"__new_varying64",
"__num_cores",
"__packed_load_active",
"__packed_store_active",
"__popcnt_int32",
"__popcnt_int64",
"__prefetch_read_uniform_1",
"__prefetch_read_uniform_2",
"__prefetch_read_uniform_3",
"__prefetch_read_uniform_nt",
"__rcp_uniform_float",
"__rcp_varying_float",
"__reduce_add_double",
"__reduce_add_float",
"__reduce_add_int32",
"__reduce_add_int64",
"__reduce_add_uint32",
"__reduce_add_uint64",
"__reduce_equal_double",
"__reduce_equal_float",
"__reduce_equal_int32",
"__reduce_equal_int64",
"__reduce_max_double",
"__reduce_max_float",
"__reduce_max_int32",
"__reduce_max_int64",
"__reduce_max_uint32",
"__reduce_max_uint64",
"__reduce_min_double",
"__reduce_min_float",
"__reduce_min_int32",
"__reduce_min_int64",
"__reduce_min_uint32",
"__reduce_min_uint64",
"__rotate_double",
"__rotate_float",
"__rotate_i16",
"__rotate_i32",
"__rotate_i64",
"__rotate_i8",
"__round_uniform_double",
"__round_uniform_float",
"__round_varying_double",
"__round_varying_float",
"__rsqrt_uniform_float",
"__rsqrt_varying_float",
"__sext_uniform_bool",
"__sext_varying_bool",
"__shuffle2_double",
"__shuffle2_float",
"__shuffle2_i16",
"__shuffle2_i32",
"__shuffle2_i64",
"__shuffle2_i8",
"__shuffle_double",
"__shuffle_float",
"__shuffle_i16",
"__shuffle_i32",
"__shuffle_i64",
"__shuffle_i8",
"__soa_to_aos3_float",
"__soa_to_aos3_float16",
"__soa_to_aos3_float4",
"__soa_to_aos3_float8",
"__soa_to_aos3_int32",
"__soa_to_aos4_float",
"__soa_to_aos4_float16",
"__soa_to_aos4_float4",
"__soa_to_aos4_float8",
"__soa_to_aos4_int32",
"__sqrt_uniform_double",
"__sqrt_uniform_float",
"__sqrt_varying_double",
"__sqrt_varying_float",
"__stdlib_acosf",
"__stdlib_asinf",
"__stdlib_atan",
"__stdlib_atan2",
"__stdlib_atan2f",
"__stdlib_atanf",
"__stdlib_cos",
"__stdlib_cosf",
"__stdlib_exp",
"__stdlib_expf",
"__stdlib_log",
"__stdlib_logf",
"__stdlib_pow",
"__stdlib_powf",
"__stdlib_sin",
"__stdlib_sincos",
"__stdlib_sincosf",
"__stdlib_sinf",
"__stdlib_tan",
"__stdlib_tanf",
"__svml_sin",
"__svml_cos",
"__svml_sincos",
"__svml_tan",
"__svml_atan",
"__svml_atan2",
"__svml_exp",
"__svml_log",
"__svml_pow",
"__undef_uniform",
"__undef_varying",
"__vec4_add_float",
"__vec4_add_int32",
"__vselect_float",
"__vselect_i32",
};
int count = sizeof(names) / sizeof(names[0]);
for (int i = 0; i < count; ++i) {
llvm::Function *f = module->getFunction(names[i]);
if (f != NULL && f->empty() == false)
f->setLinkage(llvm::GlobalValue::InternalLinkage);
}
}
/** This utility function takes serialized binary LLVM bitcode and adds its
    definitions to the given module.  Functions in the bitcode that can be
    mapped to ispc functions are also added to the symbol table.
@@ -336,9 +592,9 @@ lCheckModuleIntrinsics(llvm::Module *module) {
    @param module      Module to link the bitcode into
    @param symbolTable Symbol table to add definitions to
 */
-static void
-lAddBitcode(const unsigned char *bitcode, int length,
-            llvm::Module *module, SymbolTable *symbolTable) {
+void
+AddBitcodeToModule(const unsigned char *bitcode, int length,
+                   llvm::Module *module, SymbolTable *symbolTable) {
     std::string bcErr;
     llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
     llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
@@ -356,16 +612,22 @@ lAddBitcode(const unsigned char *bitcode, int length,
         // linking together modules with incompatible target triples..
         llvm::Triple mTriple(m->module->getTargetTriple());
         llvm::Triple bcTriple(bcModule->getTargetTriple());
-        assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
-               mTriple.getArch() == bcTriple.getArch());
-        assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
-               mTriple.getVendor() == bcTriple.getVendor());
+        Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
+               mTriple.getArch() == bcTriple.getArch());
+        Assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
+               mTriple.getVendor() == bcTriple.getVendor());
         bcModule->setTargetTriple(mTriple.str());
         std::string(linkError);
-        if (llvm::Linker::LinkModules(module, bcModule, &linkError))
+        if (llvm::Linker::LinkModules(module, bcModule,
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+                                      llvm::Linker::DestroySource,
+#endif // LLVM_3_0
+                                      &linkError))
             Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
-        lAddModuleSymbols(module, symbolTable);
+        lSetInternalFunctions(module);
+        if (symbolTable != NULL)
+            lAddModuleSymbols(module, symbolTable);
         lCheckModuleIntrinsics(module);
     }
 }
@@ -377,8 +639,9 @@ lAddBitcode(const unsigned char *bitcode, int length,
 static void
 lDefineConstantInt(const char *name, int val, llvm::Module *module,
                    SymbolTable *symbolTable) {
-    Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
-    pw->isStatic = true;
+    Symbol *pw =
+        new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType(),
+                   SC_STATIC);
     pw->constValue = new ConstExpr(pw->type, val, SourcePos());
     LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
     llvm::Constant *linit = LLVMInt32(val);
@@ -395,11 +658,10 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
                        SymbolTable *symbolTable) {
     std::vector<const Type *> args;
     FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
-    Symbol *sym = new Symbol(name, SourcePos(), ft);
-    sym->isStatic = true;
+    Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
     llvm::Function *func = module->getFunction(name);
-    assert(func != NULL); // it should be declared already...
+    Assert(func != NULL); // it should be declared already...
     func->addFnAttr(llvm::Attribute::AlwaysInline);
     llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
     llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
@@ -412,9 +674,9 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
-    Symbol *pidx = new Symbol("programIndex", SourcePos(),
-                              AtomicType::VaryingConstInt32);
-    pidx->isStatic = true;
+    Symbol *pidx =
+        new Symbol("programIndex", SourcePos(),
+                   AtomicType::VaryingInt32->GetAsConstType(), SC_STATIC);
     int pi[ISPC_MAX_NVEC];
     for (int i = 0; i < g->target.vectorWidth; ++i)
@@ -434,17 +696,17 @@ void
 DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
              bool includeStdlibISPC) {
     // Add the definitions from the compiled builtins-c.c file
-    if (g->target.is32bit) {
+    if (g->target.is32Bit) {
         extern unsigned char builtins_bitcode_c_32[];
         extern int builtins_bitcode_c_32_length;
-        lAddBitcode(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
-                    module, symbolTable);
+        AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
+                           module, symbolTable);
     }
     else {
         extern unsigned char builtins_bitcode_c_64[];
         extern int builtins_bitcode_c_64_length;
-        lAddBitcode(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
-                    module, symbolTable);
+        AddBitcodeToModule(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
+                           module, symbolTable);
     }
     // Next, add the target's custom implementations of the various needed
@@ -453,22 +715,36 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     case Target::SSE2:
         extern unsigned char builtins_bitcode_sse2[];
         extern int builtins_bitcode_sse2_length;
-        lAddBitcode(builtins_bitcode_sse2, builtins_bitcode_sse2_length, module,
-                    symbolTable);
+        extern unsigned char builtins_bitcode_sse2_x2[];
+        extern int builtins_bitcode_sse2_x2_length;
+        switch (g->target.vectorWidth) {
+        case 4:
+            AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length,
+                               module, symbolTable);
+            break;
+        case 8:
+            AddBitcodeToModule(builtins_bitcode_sse2_x2, builtins_bitcode_sse2_x2_length,
+                               module, symbolTable);
+            break;
+        default:
+            FATAL("logic error in DefineStdlib");
+        }
         break;
     case Target::SSE4:
         extern unsigned char builtins_bitcode_sse4[];
         extern int builtins_bitcode_sse4_length;
-        extern unsigned char builtins_bitcode_sse4x2[];
-        extern int builtins_bitcode_sse4x2_length;
+        extern unsigned char builtins_bitcode_sse4_x2[];
+        extern int builtins_bitcode_sse4_x2_length;
         switch (g->target.vectorWidth) {
         case 4:
-            lAddBitcode(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
-                        module, symbolTable);
+            AddBitcodeToModule(builtins_bitcode_sse4,
+                               builtins_bitcode_sse4_length,
+                               module, symbolTable);
             break;
         case 8:
-            lAddBitcode(builtins_bitcode_sse4x2, builtins_bitcode_sse4x2_length,
-                        module, symbolTable);
+            AddBitcodeToModule(builtins_bitcode_sse4_x2,
+                               builtins_bitcode_sse4_x2_length,
+                               module, symbolTable);
             break;
         default:
             FATAL("logic error in DefineStdlib");
@@ -477,16 +753,72 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     case Target::AVX:
         switch (g->target.vectorWidth) {
         case 8:
-            extern unsigned char builtins_bitcode_avx[];
-            extern int builtins_bitcode_avx_length;
-            lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module,
-                        symbolTable);
+            extern unsigned char builtins_bitcode_avx1[];
+            extern int builtins_bitcode_avx1_length;
+            AddBitcodeToModule(builtins_bitcode_avx1,
+                               builtins_bitcode_avx1_length,
+                               module, symbolTable);
             break;
         case 16:
-            extern unsigned char builtins_bitcode_avx_x2[];
-            extern int builtins_bitcode_avx_x2_length;
-            lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
-                        module, symbolTable);
+            extern unsigned char builtins_bitcode_avx1_x2[];
+            extern int builtins_bitcode_avx1_x2_length;
+            AddBitcodeToModule(builtins_bitcode_avx1_x2,
+                               builtins_bitcode_avx1_x2_length,
+                               module, symbolTable);
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
case Target::AVX2:
switch (g->target.vectorWidth) {
case 8:
extern unsigned char builtins_bitcode_avx2[];
extern int builtins_bitcode_avx2_length;
AddBitcodeToModule(builtins_bitcode_avx2,
builtins_bitcode_avx2_length,
module, symbolTable);
break;
case 16:
extern unsigned char builtins_bitcode_avx2_x2[];
extern int builtins_bitcode_avx2_x2_length;
AddBitcodeToModule(builtins_bitcode_avx2_x2,
builtins_bitcode_avx2_x2_length,
module, symbolTable);
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
case Target::GENERIC:
switch (g->target.vectorWidth) {
case 4:
extern unsigned char builtins_bitcode_generic_4[];
extern int builtins_bitcode_generic_4_length;
AddBitcodeToModule(builtins_bitcode_generic_4,
builtins_bitcode_generic_4_length,
module, symbolTable);
break;
case 8:
extern unsigned char builtins_bitcode_generic_8[];
extern int builtins_bitcode_generic_8_length;
AddBitcodeToModule(builtins_bitcode_generic_8,
builtins_bitcode_generic_8_length,
module, symbolTable);
break;
case 16:
extern unsigned char builtins_bitcode_generic_16[];
extern int builtins_bitcode_generic_16_length;
AddBitcodeToModule(builtins_bitcode_generic_16,
builtins_bitcode_generic_16_length,
module, symbolTable);
break;
case 1:
extern unsigned char builtins_bitcode_generic_1[];
extern int builtins_bitcode_generic_1_length;
AddBitcodeToModule(builtins_bitcode_generic_1,
builtins_bitcode_generic_1_length,
module, symbolTable);
            break;
        default:
            FATAL("logic error in DefineStdlib");
@@ -516,17 +848,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
                            symbolTable);
+    lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
+                       module, symbolTable);
     if (includeStdlibISPC) {
         // If the user wants the standard library to be included, parse the
         // serialized version of the stdlib.ispc file to get its
-        // definitions added.  Disable emission of performance warnings for
-        // now, since the user doesn't care about any of that in the stdlib
-        // implementation...
-        bool epf = g->emitPerfWarnings;
-        g->emitPerfWarnings = false;
-        extern char stdlib_code[];
-        yy_scan_string(stdlib_code);
-        yyparse();
-        g->emitPerfWarnings = epf;
+        // definitions added.
+        if (g->target.isa == Target::GENERIC && g->target.vectorWidth != 1) { // 1 wide uses x86 stdlib
+            extern char stdlib_generic_code[];
+            yy_scan_string(stdlib_generic_code);
+            yyparse();
+        }
+        else {
+            extern char stdlib_x86_code[];
+            yy_scan_string(stdlib_x86_code);
+            yyparse();
+        }
     }
 }


@@ -55,4 +55,7 @@
 void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
                   bool includeStdlib);
+void AddBitcodeToModule(const unsigned char *bitcode, int length,
+                        llvm::Module *module, SymbolTable *symbolTable = NULL);
 #endif // ISPC_STDLIB_H

File diff suppressed because it is too large.


@@ -57,6 +57,7 @@
 #include <stdint.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <stdarg.h>
 typedef int Bool;
@@ -132,6 +133,8 @@ void __do_print(const char *format, const char *types, int width, int mask,
         case 'V': PRINT_VECTOR("%llu", unsigned long long);
         case 'd': PRINT_SCALAR("%f", double);
         case 'D': PRINT_VECTOR("%f", double);
+        case 'p': PRINT_SCALAR("%p", void *);
+        case 'P': PRINT_VECTOR("%p", void *);
         default:
             printf("UNKNOWN TYPE ");
             putchar(*types);
@@ -146,22 +149,22 @@ void __do_print(const char *format, const char *types, int width, int mask,
 int __num_cores() {
-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(__MINGW32__)
     // This is quite a hack.  Including all of windows.h to get this definition
     // pulls in a bunch of stuff that leads to undefined symbols at link time.
     // So we don't #include <windows.h> but instead have the equivalent declarations
     // here.  Presumably this struct declaration won't be changing in the future
     // anyway...
     struct SYSTEM_INFO {
         int pad0[2];
         void *pad1[2];
         int *pad2;
         int dwNumberOfProcessors;
         int pad3[3];
     };
     struct SYSTEM_INFO sysInfo;
     extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
     GetSystemInfo(&sysInfo);
     return sysInfo.dwNumberOfProcessors;
 #else

builtins/dispatch.ll (new file)

@@ -0,0 +1,151 @@
;; Copyright (c) 2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;; This file defines various functions that are used when generating the
;; the "dispatch" object/assembly file that has entrypoints for each
;; exported function in a module that dispatch to the best available
;; variant of that function that will run on the system's CPU.
;; Stores the best target ISA that the system on which we're actually
;; running supports. -1 represents "uninitialized", otherwise this value
;; should correspond to one of the enumerant values of Target::ISA from
;; ispc.h.
@__system_best_isa = internal global i32 -1
declare void @abort() noreturn
;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
;; following code... Specifically, __get_system_isa should return a value
;; corresponding to one of the Target::ISA enumerant values that gives the
;; most capable ISA that the current system can run.
;;
;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum
;; backwards compatibility for anyone building ispc with LLVM 2.9.
;;
;; #include <stdint.h>
;; #include <stdlib.h>
;;
;; static void __cpuid(int info[4], int infoType) {
;; __asm__ __volatile__ ("cpuid"
;; : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
;; : "0" (infoType));
;; }
;;
;; /* Save %ebx in case it's the PIC register */
;; static void __cpuid_count(int info[4], int level, int count) {
;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
;; "cpuid\n\t"
;; "xchg{l}\t{%%}ebx, %1\n\t"
;; : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
;; : "0" (level), "2" (count));
;; }
;;
;; int32_t __get_system_isa() {
;; int info[4];
;; __cpuid(info, 1);
;;
;; /* NOTE: the values returned below must be the same as the
;; corresponding enumerant values in Target::ISA. */
;; if ((info[2] & (1 << 28)) != 0) {
;; // AVX1 for sure. Do we have AVX2?
;; // Call cpuid with eax=7, ecx=0
;; __cpuid_count(info, 7, 0);
;; if ((info[1] & (1 << 5)) != 0)
;; return 3; // AVX2
;; else
;; return 2; // AVX1
;; }
;; else if ((info[2] & (1 << 19)) != 0)
;; return 1; // SSE4
;; else if ((info[3] & (1 << 26)) != 0)
;; return 0; // SSE2
;; else
;; abort();
;; }
%0 = type { i32, i32, i32, i32 }
define i32 @__get_system_isa() nounwind ssp {
entry:
%0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
%asmresult9.i = extractvalue %0 %0, 2
%asmresult10.i = extractvalue %0 %0, 3
%and = and i32 %asmresult9.i, 268435456
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.else7, label %if.then
if.then: ; preds = %entry
%1 = tail call %0 asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult9.i24 = extractvalue %0 %1, 1
%and4 = lshr i32 %asmresult9.i24, 5
%2 = and i32 %and4, 1
%3 = or i32 %2, 2
br label %return
if.else7: ; preds = %entry
%and10 = and i32 %asmresult9.i, 524288
%cmp11 = icmp eq i32 %and10, 0
br i1 %cmp11, label %if.else13, label %return
if.else13: ; preds = %if.else7
%and16 = and i32 %asmresult10.i, 67108864
%cmp17 = icmp eq i32 %and16, 0
br i1 %cmp17, label %if.else19, label %return
if.else19: ; preds = %if.else13
tail call void @abort() noreturn nounwind
unreachable
return: ; preds = %if.else13, %if.else7, %if.then
%retval.0 = phi i32 [ %3, %if.then ], [ 1, %if.else7 ], [ 0, %if.else13 ]
ret i32 %retval.0
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This function is called by each of the dispatch functions we generate;
;; it sets @__system_best_isa if it is unset.
define void @__set_system_isa() {
entry:
%bi = load i32* @__system_best_isa
%unset = icmp eq i32 %bi, -1
br i1 %unset, label %set_system_isa, label %done
set_system_isa:
%bival = call i32 @__get_system_isa()
store i32 %bival, i32* @__system_best_isa
ret void
done:
ret void
}
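;; To make the dispatch flow concrete, here is a hedged C sketch of what a
;; generated per-function entrypoint conceptually does with the helpers above.
;; The names foo, foo_sse2, foo_sse4, and foo_avx are made up for illustration;
;; the real entrypoints are emitted by the compiler rather than written by hand.
;;
;; extern int __system_best_isa;       /* set lazily by __set_system_isa() */
;; void __set_system_isa(void);
;;
;; void foo_sse2(float *a, int n);     /* hypothetical per-ISA variants */
;; void foo_sse4(float *a, int n);
;; void foo_avx(float *a, int n);
;;
;; void foo(float *a, int n) {         /* the exported, dispatching symbol */
;;     __set_system_isa();
;;     switch (__system_best_isa) {
;;     case 0:  foo_sse2(a, n); break; /* Target::ISA value for SSE2 */
;;     case 1:  foo_sse4(a, n); break; /* SSE4 */
;;     default: foo_avx(a, n);  break; /* AVX or better */
;;     }
;; }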


@@ -30,18 +30,19 @@
 ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; *** Untested *** AVX target implementation.
-;;
-;; The LLVM AVX code generator is incomplete, so the ispc AVX target
-;; hasn't yet been tested.  There is therefore a higher-than-normal
-;; chance that there are bugs in the code in this file.
+;; AVX target implementation.
+ctlztz()
+define_prefetches()
+define_shuffles()
+aossoa()
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
-define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
     ; uniform float iv = extract(__rcp_u(v), 0);
     ; return iv * (2. - v * iv);
     %vecval = insertelement <4 x float> undef, float %0, i32 0
@@ -60,7 +61,7 @@ define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline { define float @__round_uniform_float(float) nounwind readonly alwaysinline {
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
; the roundss intrinsic is a total mess--docs say: ; the roundss intrinsic is a total mess--docs say:
; ;
@@ -83,7 +84,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
ret float %rs ret float %rs
} }
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline { define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0 %xi = insertelement <4 x float> undef, float %0, i32 0
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
@@ -92,7 +93,7 @@ define internal float @__floor_uniform_float(float) nounwind readonly alwaysinli
ret float %rs ret float %rs
} }
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline { define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0 %xi = insertelement <4 x float> undef, float %0, i32 0
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
@@ -106,14 +107,14 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline { define double @__round_uniform_double(double) nounwind readonly alwaysinline {
%xi = insertelement <2 x double> undef, double %0, i32 0 %xi = insertelement <2 x double> undef, double %0, i32 0
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8) %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
%rs = extractelement <2 x double> %xr, i32 0 %rs = extractelement <2 x double> %xr, i32 0
ret double %rs ret double %rs
} }
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline { define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0 %xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
@@ -122,7 +123,7 @@ define internal double @__floor_uniform_double(double) nounwind readonly alwaysi
ret double %rs ret double %rs
} }
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline { define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0 %xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
@@ -137,7 +138,7 @@ define internal double @__ceil_uniform_double(double) nounwind readonly alwaysin
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
; uniform float is = extract(__rsqrt_u(v), 0); ; uniform float is = extract(__rsqrt_u(v), 0);
%v = insertelement <4 x float> undef, float %0, i32 0 %v = insertelement <4 x float> undef, float %0, i32 0
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v) %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
@@ -158,7 +159,7 @@ define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinli
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0) sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
ret float %ret ret float %ret
} }
@@ -170,7 +171,7 @@ define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinlin
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
define internal void @__fastmath() nounwind alwaysinline { define void @__fastmath() nounwind alwaysinline {
%ptr = alloca i32 %ptr = alloca i32
%ptr8 = bitcast i32 * %ptr to i8 * %ptr8 = bitcast i32 * %ptr to i8 *
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8) call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
@@ -189,12 +190,12 @@ define internal void @__fastmath() nounwind alwaysinline {
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline { define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1) sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
ret float %ret ret float %ret
} }
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline { define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1) sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
ret float %ret ret float %ret
} }
@@ -206,12 +207,12 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline { define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1) sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret i32 %ret ret i32 %ret
} }
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline { define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1) sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret i32 %ret ret i32 %ret
} }
@@ -223,12 +224,12 @@ define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinlin
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline { define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1) sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
ret i32 %ret ret i32 %ret
} }
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1) sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret i32 %ret ret i32 %ret
} }
@@ -238,14 +239,14 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i32 @llvm.ctpop.i32(i32) nounwind readnone
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0) %call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call ret i32 %call
} }
declare i64 @llvm.ctpop.i64(i64) nounwind readnone declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0) %call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call ret i64 %call
} }
@@ -255,7 +256,7 @@ define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline { define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0) sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
ret double %ret ret double %ret
} }
@@ -267,12 +268,12 @@ define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline { define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1) sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret ret double %ret
} }
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline { define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret ret double %ret
} }


@@ -29,29 +29,26 @@
 ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; *** Untested *** AVX target implementation.
-;;
-;; The LLVM AVX code generator is incomplete, so the ispc AVX target
-;; hasn't yet been tested.  There is therefore a higher-than-normal
-;; chance that there are bugs in the code in this file.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Basic 16-wide definitions
-stdlib_core(16)
-packed_load_and_store(16)
-scans(16)
-int64minmax(16)
-include(`builtins-avx-common.ll')
+define(`WIDTH',`16')
+define(`MASK',`i32')
+include(`util.m4')
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+include(`target-avx-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline { define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
; float iv = __rcp_v(v); ; float iv = __rcp_v(v);
; return iv * (2. - v * iv); ; return iv * (2. - v * iv);
@@ -71,17 +68,17 @@ define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonl
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
round8to16(%0, 8) round8to16(%0, 8)
} }
define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round8to16(%0, 9) round8to16(%0, 9)
} }
define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round8to16(%0, 10) round8to16(%0, 10)
} }
@@ -91,15 +88,15 @@ define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readon
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline { define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
round4to16double(%0, 8) round4to16double(%0, 8)
} }
define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline { define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
round4to16double(%0, 9) round4to16double(%0, 9)
} }
define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline { define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
round4to16double(%0, 10) round4to16double(%0, 10)
} }
@@ -109,7 +106,7 @@ define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind rea
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v); ; float is = __rsqrt_v(v);
unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v) unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
; return 0.5 * is * (3. - (v * is) * is); ; return 0.5 * is * (3. - (v * is) * is);
@@ -132,7 +129,7 @@ define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind re
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline { define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0) unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
ret <16 x float> %call ret <16 x float> %call
} }
@@ -160,52 +157,25 @@ declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
define internal <16 x float> @__max_varying_float(<16 x float>, define <16 x float> @__max_varying_float(<16 x float>,
<16 x float>) nounwind readonly alwaysinline { <16 x float>) nounwind readonly alwaysinline {
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1) binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
ret <16 x float> %call ret <16 x float> %call
} }
define internal <16 x float> @__min_varying_float(<16 x float>, define <16 x float> @__min_varying_float(<16 x float>,
<16 x float>) nounwind readonly alwaysinline { <16 x float>) nounwind readonly alwaysinline {
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1) binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
ret <16 x float> %call ret <16 x float> %call
} }
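;; A rough sketch of the splitting that the binary8to16() macro used above is
;; assumed to perform (the real expansion is in util.m4; the function name here is
;; illustrative): the 16-wide operands are split into two 8-wide halves with
;; shufflevector, the 256-bit AVX intrinsic is applied to each half, and the two
;; results are concatenated back into a 16-wide vector.
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <16 x float> @max_varying_float_sketch(<16 x float> %a, <16 x float> %b) nounwind readnone {
  %a_lo = shufflevector <16 x float> %a, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %a_hi = shufflevector <16 x float> %a, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b_lo = shufflevector <16 x float> %b, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %b_hi = shufflevector <16 x float> %b, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %r_lo = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a_lo, <8 x float> %b_lo)
  %r_hi = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a_hi, <8 x float> %b_hi)
  %r = shufflevector <8 x float> %r_lo, <8 x float> %r_hi,
       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %r
}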
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
-define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
-ret <16 x i32> %ret
-}
-define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-ret <16 x i32> %ret
-}
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unsigned int min/max
-define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
-ret <16 x i32> %ret
-}
-define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-ret <16 x i32> %ret
-}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
-define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
+define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -224,7 +194,7 @@ define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
+define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
%va = shufflevector <16 x float> %0, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vb = shufflevector <16 x float> %0, <16 x float> undef,
@@ -239,12 +209,12 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always
}
-define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
+define float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
reduce16(float, @__min_varying_float, @__min_uniform_float)
}
-define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
+define float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
reduce16(float, @__max_varying_float, @__max_uniform_float)
}
@@ -253,28 +223,28 @@ reduce_equal(16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
-define internal <16 x i32> @__add_varying_int32(<16 x i32>,
+define <16 x i32> @__add_varying_int32(<16 x i32>,
<16 x i32>) nounwind readnone alwaysinline {
%s = add <16 x i32> %0, %1
ret <16 x i32> %s
}
-define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
%s = add i32 %0, %1
ret i32 %s
}
-define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
}
-define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
}
-define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
}
@@ -282,17 +252,17 @@ define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinli
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
-define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
+define i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<16 x i32> %v)
ret i32 %r
}
-define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
-define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
@@ -302,7 +272,7 @@ define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinl
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
-define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
+define double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
%va = shufflevector <16 x double> %0, <16 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%vb = shufflevector <16 x double> %0, <16 x double> undef,
@@ -322,12 +292,12 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw
ret double %sum
}
-define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
+define double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
reduce16(double, @__min_varying_double, @__min_uniform_double)
}
-define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
+define double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
reduce16(double, @__max_varying_double, @__max_uniform_double)
}
@@ -335,28 +305,28 @@ define internal double @__reduce_max_double(<16 x double>) nounwind readnone alw
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops
-define internal <16 x i64> @__add_varying_int64(<16 x i64>,
+define <16 x i64> @__add_varying_int64(<16 x i64>,
<16 x i64>) nounwind readnone alwaysinline {
%s = add <16 x i64> %0, %1
ret <16 x i64> %s
}
-define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
%s = add i64 %0, %1
ret i64 %s
}
-define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
}
-define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
}
-define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
}
@@ -364,17 +334,17 @@ define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinli
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
-define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
+define i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
%r = call i64 @__reduce_add_int64(<16 x i64> %v)
ret i64 %r
}
-define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
-define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
@@ -388,13 +358,13 @@ load_and_broadcast(16, i32, 32)
load_and_broadcast(16, i64, 64)
; no masked load instruction for i8 and i16 types??
-load_masked(16, i8, 8, 1)
-load_masked(16, i16, 16, 2)
+masked_load(16, i8, 8, 1)
+masked_load(16, i16, 16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
-define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <16 x i32> %mask to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -412,7 +382,7 @@ define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
}
-define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
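;; The rest of __masked_load_64 falls outside this hunk; the following is a sketch
;; (an assumption, not the original body) of how one group of four 64-bit lanes
;; would be handled: each 32-bit mask lane is duplicated so that it fills a 64-bit
;; lane, the result is bitcast to the <4 x double> mask type that
;; llvm.x86.avx.maskload.pd.256 expects, and the loaded values are bitcast back to
;; integers. The function name is illustrative only.
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double>) nounwind
define <4 x i64> @masked_load_64_quarter_sketch(i8 * %ptr, <16 x i32> %mask) nounwind {
  %m32 = shufflevector <16 x i32> %mask, <16 x i32> undef,
         <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %m64 = bitcast <8 x i32> %m32 to <4 x double>
  %vald = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr, <4 x double> %m64)
  %val = bitcast <4 x double> %vald to <4 x i64>
  ret <4 x i64> %val
}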
@@ -625,12 +595,7 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
+;; scatter
-gen_gather(16, i8)
-gen_gather(16, i16)
-gen_gather(16, i32)
-gen_gather(16, i64)
gen_scatter(16, i8)
gen_scatter(16, i16)
@@ -642,7 +607,7 @@ gen_scatter(16, i64)
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
-define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
+define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
ret <16 x double> %ret
}
@@ -654,12 +619,12 @@ define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alw
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
-define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
ret <16 x double> %ret
}
-define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
ret <16 x double> %ret
}


@@ -29,29 +29,26 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; *** Untested *** AVX target implementation.
-;;
-;; The LLVM AVX code generator is incomplete, so the ispc AVX target
-;; hasn't yet been tested. There is therefore a higher-than-normal
-;; chance that there are bugs in the code in this file.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Basic 8-wide definitions
-stdlib_core(8)
-packed_load_and_store(8)
-scans(8)
-int64minmax(8)
-include(`builtins-avx-common.ll')
+define(`WIDTH',`8')
+define(`MASK',`i32')
+include(`util.m4')
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+include(`target-avx-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
-define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
@@ -69,19 +66,19 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
-define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 8)
ret <8 x float> %call
}
-define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
ret <8 x float> %call
}
-define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
ret <8 x float> %call
@@ -92,17 +89,17 @@ define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
-define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
round4to8double(%0, 8)
}
-define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
round4to8double(%0, 9)
}
-define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
round4to8double(%0, 10)
}
@@ -113,7 +110,7 @@ define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind reado
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
-define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
; return 0.5 * is * (3. - (v * is) * is);
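;; The remainder of __rsqrt_varying_float falls outside this hunk; the following
;; is a sketch (an assumption, not the original body) of the Newton-Raphson
;; refinement that the comment above describes, where %is is the hardware rsqrt
;; estimate computed above. The function name is illustrative only.
define <8 x float> @rsqrt_newton_raphson_sketch(<8 x float> %v, <8 x float> %is) nounwind readnone {
  ; v * is * is
  %v_is = fmul <8 x float> %v, %is
  %v_is_is = fmul <8 x float> %v_is, %is
  ; 3 - v * is * is
  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
                                 float 3., float 3., float 3., float 3.>, %v_is_is
  ; is * (3 - v * is * is)
  %is_mul = fmul <8 x float> %is, %three_sub
  ; 0.5 * is * (3 - v * is * is)
  %r = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
                         float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
  ret <8 x float> %r
}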
@@ -132,7 +129,7 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
-define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
ret <8 x float> %call
}
@@ -160,52 +157,25 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define internal <8 x float> @__max_varying_float(<8 x float>,
+define <8 x float> @__max_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
ret <8 x float> %call
}
-define internal <8 x float> @__min_varying_float(<8 x float>,
+define <8 x float> @__min_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
ret <8 x float> %call
}
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
-define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
-ret <8 x i32> %ret
-}
-define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-ret <8 x i32> %ret
-}
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unsigned int min/max
-define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
-ret <8 x i32> %ret
-}
-define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-ret <8 x i32> %ret
-}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
-define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
ret i32 %v
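;; movmskps collects the sign bit of each of the eight mask lanes into the low
;; eight bits of a scalar integer, so an all-on mask yields 255 and an all-off
;; mask yields 0. A small illustrative sketch (not from the original file) of how
;; such a lane mask is typically tested:
define i1 @any_lanes_on_sketch(<8 x i32> %mask) nounwind readnone {
  %m = call i32 @__movmsk(<8 x i32> %mask)
  %any = icmp ne i32 %m, 0
  ret i1 %any
}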
@@ -216,7 +186,7 @@ define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
+define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
%scalar1 = extractelement <8 x float> %v2, i32 0
@@ -226,12 +196,12 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
}
-define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
+define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__min_varying_float, @__min_uniform_float)
}
-define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
+define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__max_varying_float, @__max_uniform_float)
}
@@ -240,28 +210,28 @@ reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
-define internal <8 x i32> @__add_varying_int32(<8 x i32>,
+define <8 x i32> @__add_varying_int32(<8 x i32>,
<8 x i32>) nounwind readnone alwaysinline {
%s = add <8 x i32> %0, %1
ret <8 x i32> %s
}
-define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
%s = add i32 %0, %1
ret i32 %s
}
-define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
}
-define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
}
-define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
@@ -269,17 +239,17 @@ define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinlin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
-define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
+define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}
-define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
-define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
@@ -289,7 +259,7 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
-define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
+define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
%v0 = shufflevector <8 x double> %0, <8 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
@@ -303,12 +273,12 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
ret double %sum
}
-define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
+define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}
-define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
+define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}
@@ -316,28 +286,28 @@ define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwa
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops
-define internal <8 x i64> @__add_varying_int64(<8 x i64>,
+define <8 x i64> @__add_varying_int64(<8 x i64>,
<8 x i64>) nounwind readnone alwaysinline {
%s = add <8 x i64> %0, %1
ret <8 x i64> %s
}
-define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
%s = add i64 %0, %1
ret i64 %s
}
-define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
}
-define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}
-define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}
@@ -345,17 +315,17 @@ define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinlin
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
-define internal i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
+define i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
ret i64 %r
}
-define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
-define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
+define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
@@ -369,13 +339,13 @@ load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
; no masked load instruction for i8 and i16 types??
-load_masked(8, i8, 8, 1)
-load_masked(8, i16, 16, 2)
+masked_load(8, i8, 8, 1)
+masked_load(8, i16, 16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
-define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <8 x i32> %mask to <8 x float>
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
%retval = bitcast <8 x float> %floatval to <8 x i32>
@@ -383,7 +353,7 @@ define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
}
-define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -406,9 +376,6 @@ define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
-; FIXME: there is no AVX instruction for these, but we could be clever
-; by packing the bits down and setting the last 3/4 or half, respectively,
-; of the mask to zero... Not sure if this would be a win in the end
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
@@ -523,12 +490,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
+;; scatter
-gen_gather(8, i8)
-gen_gather(8, i16)
-gen_gather(8, i32)
-gen_gather(8, i64)
gen_scatter(8, i8)
gen_scatter(8, i16)
@@ -540,7 +502,7 @@ gen_scatter(8, i64)
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
-define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
+define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
ret <8 x double> %ret
}
@@ -552,12 +514,12 @@ define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alway
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
-define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
ret <8 x double> %ret
}
-define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
ret <8 x double> %ret
}


@@ -0,0 +1,77 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx-x2.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
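;; Note (an assumption based on the macro name rather than its definition):
;; binary4to16() splits each 16-wide operation into four 128-bit SSE4.1
;; pmin/pmax calls. AVX1 provides 256-bit floating-point arithmetic but no
;; 256-bit integer min/max, so the integer forms here fall back to the 4-wide
;; instructions.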
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)

builtins/target-avx1.ll (new file, 75 lines)

@@ -0,0 +1,75 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)

builtins/target-avx2-x2.ll (new file, 129 lines)

@@ -0,0 +1,129 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx-x2.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
ret <16 x i32> %m
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
ret <16 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
ret <16 x i32> %m
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
ret <16 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %r
}
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
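;; Note: vcvtph2ps/vcvtps2ph only exist in vector form, so the uniform (scalar)
;; conversions above place the value in lane 0 of an 8-wide vector, convert, and
;; then extract lane 0 of the result; the remaining lanes are left undef.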
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)

builtins/target-avx2.ll (new file, 110 lines)

@@ -0,0 +1,110 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
%r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
ret <8 x float> %r
}
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
%r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
ret <8 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
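;; The i32 immediate passed to llvm.x86.vcvtps2ph.256 selects the rounding
;; mode; the definitions above use 0 (round to nearest even).  Assuming the
;; usual VCVTPS2PH immediate encoding, 1 = round down, 2 = round up,
;; 3 = truncate, and 4 = use the current MXCSR rounding mode.  A hypothetical
;; truncating variant (just a sketch, not part of the standard library) would
;; differ only in that immediate:
;;
;; define <8 x i16> @__float_to_half_varying_trunc(<8 x float> %v) nounwind readnone {
;;   %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 3)
;;   ret <8 x i16> %r
;; }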
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)

builtins/target-generic-1.ll Executable file

@@ -0,0 +1,935 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Define the standard library builtins for the NOVEC target
define(`MASK',`i32')
define(`WIDTH',`1')
include(`util.m4')
; Define some basics for a 1-wide target
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(1, i8, 8)
gen_masked_store(1, i16, 16)
gen_masked_store(1, i32, 32)
gen_masked_store(1, i64, 64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(1, i8, 8)
load_and_broadcast(1, i16, 16)
load_and_broadcast(1, i32, 32)
load_and_broadcast(1, i64, 64)
masked_load(1, i8, 8, 1)
masked_load(1, i16, 16, 2)
masked_load(1, i32, 32, 4)
masked_load(1, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(1, i8)
gen_gather(1, i16)
gen_gather(1, i32)
gen_gather(1, i64)
gen_scatter(1, i8)
gen_scatter(1, i16)
gen_scatter(1, i32)
gen_scatter(1, i64)
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
; %mv = trunc <1 x i32> %mask to <1 x i8>
; %notmask = xor <1 x i8> %mv, <i8 -1>
; %cleared_old = and <1 x i8> %0, %notmask
; %masked_new = and <1 x i8> %1, %mv
; %new = or <1 x i8> %cleared_old, %masked_new
; ret <1 x i8> %new
; not doing this the easy way because of problems with LLVM's scalarizer
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
; %sel = select <1 x i1> %cmp, <1 x i8> %0, <1 x i8> %1
%m = extractelement <1 x i32> %mask, i32 0
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i8> %0, i32 0
%d1 = extractelement <1 x i8> %1, i32 0
%sel = select i1 %cmp, i8 %d0, i8 %d1
%r = insertelement <1 x i8> undef, i8 %sel, i32 0
ret <1 x i8> %r
}
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
; %mv = trunc <1 x i32> %mask to <1 x i16>
; %notmask = xor <1 x i16> %mv, <i16 -1>
; %cleared_old = and <1 x i16> %0, %notmask
; %masked_new = and <1 x i16> %1, %mv
; %new = or <1 x i16> %cleared_old, %masked_new
; ret <1 x i16> %new
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
; %sel = select <1 x i1> %cmp, <1 x i16> %0, <1 x i16> %1
%m = extractelement <1 x i32> %mask, i32 0
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i16> %0, i32 0
%d1 = extractelement <1 x i16> %1, i32 0
%sel = select i1 %cmp, i16 %d0, i16 %d1
%r = insertelement <1 x i16> undef, i16 %sel, i32 0
ret <1 x i16> %r
; ret <1 x i16> %sel
}
define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
; %notmask = xor <1 x i32> %mask, <i32 -1>
; %cleared_old = and <1 x i32> %0, %notmask
; %masked_new = and <1 x i32> %1, %mask
; %new = or <1 x i32> %cleared_old, %masked_new
; ret <1 x i32> %new
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
; %sel = select <1 x i1> %cmp, <1 x i32> %0, <1 x i32> %1
; ret <1 x i32> %sel
%m = extractelement <1 x i32> %mask, i32 0
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i32> %0, i32 0
%d1 = extractelement <1 x i32> %1, i32 0
%sel = select i1 %cmp, i32 %d0, i32 %d1
%r = insertelement <1 x i32> undef, i32 %sel, i32 0
ret <1 x i32> %r
}
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
; %newmask = zext <1 x i32> %mask to <1 x i64>
; %notmask = xor <1 x i64> %newmask, <i64 -1>
; %cleared_old = and <1 x i64> %0, %notmask
; %masked_new = and <1 x i64> %1, %newmask
; %new = or <1 x i64> %cleared_old, %masked_new
; ret <1 x i64> %new
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
; %sel = select <1 x i1> %cmp, <1 x i64> %0, <1 x i64> %1
; ret <1 x i64> %sel
%m = extractelement <1 x i32> %mask, i32 0
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x i64> %0, i32 0
%d1 = extractelement <1 x i64> %1, i32 0
%sel = select i1 %cmp, i64 %d0, i64 %d1
%r = insertelement <1 x i64> undef, i64 %sel, i32 0
ret <1 x i64> %r
}
define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
<1 x i32> %mask) nounwind readnone alwaysinline {
; %v0 = bitcast <1 x float> %0 to <1 x i32>
; %v1 = bitcast <1 x float> %1 to <1 x i32>
; %r = call <1 x i32> @__vselect_i32(<1 x i32> %v0, <1 x i32> %v1, <1 x i32> %mask)
; %rf = bitcast <1 x i32> %r to <1 x float>
; ret <1 x float> %rf
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
; %sel = select <1 x i1> %cmp, <1 x float> %0, <1 x float> %1
; ret <1 x float> %sel
%m = extractelement <1 x i32> %mask, i32 0
%cmp = icmp eq i32 %m, 0
%d0 = extractelement <1 x float> %0, i32 0
%d1 = extractelement <1 x float> %1, i32 0
%sel = select i1 %cmp, float %d0, float %d1
%r = insertelement <1 x float> undef, float %sel, i32 0
ret <1 x float> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i8> * %0, align 4
%newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask)
store <1 x i8> %newval, <1 x i8> * %0, align 4
ret void
}
define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i16> * %0, align 4
%newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
store <1 x i16> %newval, <1 x i16> * %0, align 4
ret void
}
define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i32> * %0, align 4
%newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask)
store <1 x i32> %newval, <1 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i64> * %0, align 4
%newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask)
store <1 x i64> %newval, <1 x i64> * %0, align 4
ret void
}
define i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
ret i32 %v
}
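;; On this 1-wide target the mask is a single i32 whose high (sign) bit says
;; whether the lone program instance is active, so __movmsk just shifts that
;; bit down to bit 0 rather than gathering sign bits from multiple lanes.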
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; This generic 1-wide target doesn't assume any hardware rounding
;; instructions, so we have to emulate the functionality with multiple
;; instructions...
; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
; unsigned int sign = signbits(x);
; unsigned int ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; x += 0x1.0p23f;
; x -= 0x1.0p23f;
; ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; return x;
;}
define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
%bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
%bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
%binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
%binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
%bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
ret <1 x float> %int_to_float_bitcast.i.i.i
}
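;; Rough sketch of why adding and subtracting 8388608 (2^23) rounds to nearest:
;; for |x| < 2^23, the sum x + 2^23 lands in a binade where the spacing between
;; floats is exactly 1, so the fractional bits of x are dropped by the
;; hardware's round-to-nearest-even mode, and subtracting 2^23 recovers the
;; rounded value.  For example, 2.7 + 8388608 rounds to 8388611, and
;; 8388611 - 8388608 = 3; 2.5 + 8388608 rounds to the even value 8388610,
;; giving round(2.5) = 2.  The sign bit is XORed away first and restored at
;; the end so the same trick works for negative inputs.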
;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...
;export float Floor(float x) {
; float y = Round(x);
; unsigned int cmp = y > x ? 0xffffffff : 0;
; float delta = -1.f;
; unsigned int idelta = intbits(delta);
; idelta &= cmp;
; delta = floatbits(idelta);
; return y + delta;
;}
define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
%bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
%bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
%binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <1 x float> %binop.i
}
;; And here is the code we compiled to get the __ceil* functions below
;
;export uniform float Ceil(uniform float x) {
; uniform float y = Round(x);
; uniform int yltx = y < x ? 0xffffffff : 0;
; uniform float delta = 1.f;
; uniform int idelta = intbits(delta);
; idelta &= yltx;
; delta = floatbits(idelta);
; return y + delta;
;}
define <1 x float> @__ceil_varying_float(<1 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
%bincmp.i = fcmp olt <1 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
%bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
%binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <1 x float> %binop.i
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
; expecting math lib to provide this
declare double @ceil (double) nounwind readnone
declare double @floor (double) nounwind readnone
declare double @round (double) nounwind readnone
;declare float @llvm.sqrt.f32(float %Val)
declare double @llvm.sqrt.f64(double %Val)
declare float @llvm.sin.f32(float %Val)
declare float @llvm.cos.f32(float %Val)
declare float @llvm.sqrt.f32(float %Val)
declare float @llvm.exp.f32(float %Val)
declare float @llvm.log.f32(float %Val)
declare float @llvm.pow.f32(float %f, float %e)
;; stuff that could be in builtins ...
define(`unary1to1', `
%v_0 = extractelement <1 x $1> %0, i32 0
%r_0 = call $1 $2($1 %v_0)
%ret_0 = insertelement <1 x $1> undef, $1 %r_0, i32 0
ret <1 x $1> %ret_0
')
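;; For reference, an invocation such as unary1to1(double, @round) expands to
;; roughly the following body (this is what the __round_varying_double
;; definition below ends up containing):
;;
;;   %v_0 = extractelement <1 x double> %0, i32 0
;;   %r_0 = call double @round(double %v_0)
;;   %ret_0 = insertelement <1 x double> undef, double %r_0, i32 0
;;   ret <1 x double> %ret_0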
;; dummy 1 wide vector ops
define void
@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
<1 x float> %v3, <1 x float> * noalias %out0,
<1 x float> * noalias %out1, <1 x float> * noalias %out2,
<1 x float> * noalias %out3) nounwind alwaysinline {
store <1 x float> %v0, <1 x float > * %out0
store <1 x float> %v1, <1 x float > * %out1
store <1 x float> %v2, <1 x float > * %out2
store <1 x float> %v3, <1 x float > * %out3
ret void
}
define void
@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
<1 x float> %v3, <1 x float> * noalias %out0,
<1 x float> * noalias %out1, <1 x float> * noalias %out2,
<1 x float> * noalias %out3) nounwind alwaysinline {
call void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> %v3, <1 x float> * %out0,
<1 x float> * %out1, <1 x float> * %out2, <1 x float> * %out3)
ret void
}
define void
@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2) {
store <1 x float> %v0, <1 x float > * %out0
store <1 x float> %v1, <1 x float > * %out1
store <1 x float> %v2, <1 x float > * %out2
ret void
}
define void
@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2) {
call void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2)
ret void
}
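;; With one-wide vectors there is no layout difference between AOS and SOA,
;; so the conversions above are just element-wise stores and the soa_to_aos
;; variants can simply forward to their aos_to_soa counterparts.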
;; end builtins
define <1 x double> @__round_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @round)
}
define <1 x double> @__floor_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @floor)
}
define <1 x double> @__ceil_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @ceil)
}
; To do vector integer min and max, we do the vector compare and then sign
; extend the i1 vector result to an i32 mask. The __vselect does the
; rest...
define <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp slt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}
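;; Note the operand order in the __vselect_i32 calls: __vselect returns its
;; first operand where the mask element is zero and its second operand where
;; it is non-zero.  Passing (%1, %0) together with the "%0 cmp %1" mask
;; therefore yields %0 when the comparison holds and %1 otherwise, which is
;; exactly the min (or, for the sgt/ugt variants below, the max).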
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp slt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp sgt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp sgt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
; The functions for unsigned ints are similar, just with unsigned
; comparison functions...
define <1 x i32> @__min_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp ult <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ult i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define <1 x i32> @__max_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp ugt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ugt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
}
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x float> %v, i32 0
ret float %r
}
define float @__reduce_min_float(<1 x float>) nounwind readnone {
%r = extractelement <1 x float> %0, i32 0
ret float %r
}
define float @__reduce_max_float(<1 x float>) nounwind readnone {
%r = extractelement <1 x float> %0, i32 0
ret float %r
}
define i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone {
%r = extractelement <1 x i32> %v, i32 0
ret i32 %r
}
define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}
define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}
define i32 @__reduce_add_uint32(<1 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<1 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}
define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}
define double @__reduce_add_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}
define double @__reduce_min_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}
define double @__reduce_max_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}
define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}
define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}
define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}
define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}
define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}
define i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x i32> %vv, i32 0
store i32 %v, i32 * %samevalue
ret i1 true
}
define i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x float> %vv, i32 0
store float %v, float * %samevalue
ret i1 true
}
define i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x i64> %vv, i32 0
store i64 %v, i64 * %samevalue
ret i1 true
}
define i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x double> %vv, i32 0
store double %v, double * %samevalue
ret i1 true
}
; The implementations below extract and reinsert scalar elements (rather than
; operating on the <1 x ...> vectors directly) so that the vector types can be
; removed later on.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
define <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysinline {
;%call = call <1 x float> @llvm.x86.sse.rcp.ps(<1 x float> %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
;%v_iv = fmul <1 x float> %0, %call
;%two_minus = fsub <1 x float> <float 2., float 2., float 2., float 2.>, %v_iv
;%iv_mul = fmul <1 x float> %call, %two_minus
;ret <1 x float> %iv_mul
%d = extractelement <1 x float> %0, i32 0
%r = fdiv float 1.,%d
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
define <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline {
;%call = call <1 x float> @llvm.x86.sse.sqrt.ps(<1 x float> %0)
;ret <1 x float> %call
%d = extractelement <1 x float> %0, i32 0
%r = call float @llvm.sqrt.f32(float %d)
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
;%is = call <1 x float> @llvm.x86.sse.rsqrt.ps(<1 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
;%v_is = fmul <1 x float> %v, %is
;%v_is_is = fmul <1 x float> %v_is, %is
;%three_sub = fsub <1 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
;%is_mul = fmul <1 x float> %is, %three_sub
;%half_scale = fmul <1 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
;ret <1 x float> %half_scale
%s = call <1 x float> @__sqrt_varying_float(<1 x float> %v)
%r = call <1 x float> @__rcp_varying_float(<1 x float> %s)
ret <1 x float> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.sin.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float,@llvm.sin.f32)
}
define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.cos.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float, @llvm.cos.f32)
}
define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
; store <1 x float> %s, <1 x float> * %1
; ret void
%sin = call <1 x float> @__svml_sin (<1 x float> %0)
%cos = call <1 x float> @__svml_cos (<1 x float> %0)
store <1 x float> %sin, <1 x float> * %1
store <1 x float> %cos, <1 x float> * %2
ret void
}
define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_tan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
;unary1to1(float, @llvm.tan.f32)
; UNSUPPORTED!
ret <1 x float > %0
}
define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
; ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_atan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
;unary1to1(float, @llvm.atan.f32)
;UNSUPPORTED!
ret <1 x float > %0
}
define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
;%y = extractelement <1 x float> %0, i32 0
;%x = extractelement <1 x float> %1, i32 0
;%q = fdiv float %y, %x
;%a = call float @llvm.atan.f32 (float %q)
;%rv = insertelement <1 x float> undef, float %a, i32 0
;ret <1 x float> %rv
; UNSUPPORTED!
ret <1 x float > %0
}
define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.exp.f32)
}
define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.log.f32)
}
define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
%r = extractelement <1 x float> %0, i32 0
%e = extractelement <1 x float> %1, i32 0
%s = call float @llvm.pow.f32(float %r,float %e)
%rv = insertelement <1 x float> undef, float %s, i32 0
ret <1 x float> %rv
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
define <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
; %call = call <1 x float> @llvm.x86.sse.max.ps(<1 x float> %0, <1 x float> %1)
; ret <1 x float> %call
%a = extractelement <1 x float> %0, i32 0
%b = extractelement <1 x float> %1, i32 0
%d = fcmp ogt float %a, %b
%r = select i1 %d, float %a, float %b
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
define <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
; %call = call <1 x float> @llvm.x86.sse.min.ps(<1 x float> %0, <1 x float> %1)
; ret <1 x float> %call
%a = extractelement <1 x float> %0, i32 0
%b = extractelement <1 x float> %1, i32 0
%d = fcmp olt float %a, %b
%r = select i1 %d, float %a, float %b
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
;declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <1 x double> @__sqrt_varying_double(<1 x double>) nounwind alwaysinline {
;unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
;ret <1 x double> %ret
unary1to1(double, @llvm.sqrt.f64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
;declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
;declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <1 x double> @__min_varying_double(<1 x double>, <1 x double>) nounwind readnone {
;binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
;ret <1 x double> %ret
%a = extractelement <1 x double> %0, i32 0
%b = extractelement <1 x double> %1, i32 0
%d = fcmp olt double %a, %b
%r = select i1 %d, double %a, double %b
%rv = insertelement <1 x double> undef, double %r, i32 0
ret <1 x double> %rv
}
define <1 x double> @__max_varying_double(<1 x double>, <1 x double>) nounwind readnone {
;binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
;ret <1 x double> %ret
%a = extractelement <1 x double> %0, i32 0
%b = extractelement <1 x double> %1, i32 0
%d = fcmp ogt double %a, %b
%r = select i1 %d, double %a, double %b
%rv = insertelement <1 x double> undef, double %r, i32 0
ret <1 x double> %rv
}
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%r = fdiv float 1.,%0
ret float %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
; This target doesn't assume a hardware rounding instruction (e.g. roundss),
; so just wrap the 1-wide varying implementation above.
%v = insertelement <1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__round_varying_float(<1 x float> %v)
%r = extractelement <1 x float> %rv, i32 0
ret float %r
}
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%v = insertelement <1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__floor_varying_float(<1 x float> %v)
%r = extractelement <1 x float> %rv, i32 0
ret float %r
}
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%v = insertelement <1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__ceil_varying_float(<1 x float> %v)
%r = extractelement <1 x float> %rv, i32 0
ret float %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
%rs = call double @round(double %0)
ret double %rs
}
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
%rs = call double @floor(double %0)
ret double %rs
}
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
%rs = call double @ceil(double %0)
ret double %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
%ret = call float @llvm.sqrt.f32(float %0)
ret float %ret
}
define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
%ret = call double @llvm.sqrt.f64(double %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
%s = call float @__sqrt_uniform_float(float %0)
%r = call float @__rcp_uniform_float(float %s)
ret float %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fastmath
define void @__fastmath() nounwind alwaysinline {
; no-op
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
%d = fcmp ogt float %0, %1
%r = select i1 %d, float %0, float %1
ret float %r
}
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
%d = fcmp olt float %0, %1
%r = select i1 %d, float %0, float %1
ret float %r
}
define double @__max_uniform_double(double, double) nounwind readonly alwaysinline {
%d = fcmp ogt double %0, %1
%r = select i1 %d, double %0, double %1
ret double %r
}
define double @__min_uniform_double(double, double) nounwind readonly alwaysinline {
%d = fcmp olt double %0, %1
%r = select i1 %d, double %0, double %1
ret double %r
}
define_shuffles()
ctlztz()
define_prefetches()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone


@@ -0,0 +1,34 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`16')
include(`target-generic-common.ll')


@@ -0,0 +1,34 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`4')
include(`target-generic-common.ll')


@@ -0,0 +1,34 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`8')
include(`target-generic-common.ll')


@@ -0,0 +1,336 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`MASK',`i1')
include(`util.m4')
stdlib_core()
scans()
reduce_equal(WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; broadcast/rotate/shuffle
declare <WIDTH x float> @__smear_float(float) nounwind readnone
declare <WIDTH x double> @__smear_double(double) nounwind readnone
declare <WIDTH x i8> @__smear_i8(i8) nounwind readnone
declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
declare <WIDTH x i16> @__broadcast_i16(<WIDTH x i16>, i32) nounwind readnone
declare <WIDTH x i32> @__broadcast_i32(<WIDTH x i32>, i32) nounwind readnone
declare <WIDTH x i64> @__broadcast_i64(<WIDTH x i64>, i32) nounwind readnone
declare <WIDTH x i8> @__rotate_i8(<WIDTH x i8>, i32) nounwind readnone
declare <WIDTH x i16> @__rotate_i16(<WIDTH x i16>, i32) nounwind readnone
declare <WIDTH x float> @__rotate_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone
declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i16> @__shuffle_i16(<WIDTH x i16>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i16> @__shuffle2_i16(<WIDTH x i16>, <WIDTH x i16>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x float> @__shuffle_float(<WIDTH x float>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x float> @__shuffle2_float(<WIDTH x float>, <WIDTH x float>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__shuffle_i32(<WIDTH x i32>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__shuffle2_i32(<WIDTH x i32>, <WIDTH x i32>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x double> @__shuffle_double(<WIDTH x double>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x double> @__shuffle2_double(<WIDTH x double>,
<WIDTH x double>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__shuffle_i64(<WIDTH x i64>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__shuffle2_i64(<WIDTH x i64>, <WIDTH x i64>,
<WIDTH x i32>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; aos/soa
declare void @__soa_to_aos3_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
<WIDTH x float> %v2, float * noalias %p) nounwind
declare void @__aos_to_soa3_float(float * noalias %p, <WIDTH x float> * %out0,
<WIDTH x float> * %out1, <WIDTH x float> * %out2) nounwind
declare void @__soa_to_aos4_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
<WIDTH x float> %v2, <WIDTH x float> %v3,
float * noalias %p) nounwind
declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias %out0,
<WIDTH x float> * noalias %out1,
<WIDTH x float> * noalias %out2,
<WIDTH x float> * noalias %out3) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
declare void @__fastmath() nounwind
;; round/floor/ceil
declare float @__round_uniform_float(float) nounwind readnone
declare float @__floor_uniform_float(float) nounwind readnone
declare float @__ceil_uniform_float(float) nounwind readnone
declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone
declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
;; min/max
declare float @__max_uniform_float(float, float) nounwind readnone
declare float @__min_uniform_float(float, float) nounwind readnone
declare i32 @__min_uniform_int32(i32, i32) nounwind readnone
declare i32 @__max_uniform_int32(i32, i32) nounwind readnone
declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone
declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone
declare i64 @__min_uniform_int64(i64, i64) nounwind readnone
declare i64 @__max_uniform_int64(i64, i64) nounwind readnone
declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone
declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone
declare double @__min_uniform_double(double, double) nounwind readnone
declare double @__max_uniform_double(double, double) nounwind readnone
declare <WIDTH x float> @__max_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__min_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone
declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x double> @__min_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone
;; sqrt/rsqrt/rcp
declare float @__rsqrt_uniform_float(float) nounwind readnone
declare float @__rcp_uniform_float(float) nounwind readnone
declare float @__sqrt_uniform_float(float) nounwind readnone
declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
declare double @__sqrt_uniform_double(double) nounwind readnone
declare <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone
;; bit ops
declare i32 @__popcnt_int32(i32) nounwind readnone
declare i64 @__popcnt_int64(i64) nounwind readnone
declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone
declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
;; svml
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or, use the macro to call the 4-wide ones twice with our 8-wide
; vectors...
declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
declare <WIDTH x float> @__svml_log(<WIDTH x float>)
declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
declare i32 @__movmsk(<WIDTH x i1>) nounwind readnone
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone
declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_add_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
declare double @__reduce_add_double(<WIDTH x double>) nounwind readnone
declare double @__reduce_min_double(<WIDTH x double>) nounwind readnone
declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_add_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(WIDTH, i8, 8)
load_and_broadcast(WIDTH, i16, 16)
load_and_broadcast(WIDTH, i32, 32)
load_and_broadcast(WIDTH, i64, 64)
declare <WIDTH x i8> @__masked_load_8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
ifelse(LLVM_VERSION, `LLVM_3_1svn',`
define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i8> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
store <WIDTH x i8> %v1, <WIDTH x i8> * %0
ret void
}
define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i16> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
ret void
}
define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i32> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
ret void
}
define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i64> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
ret void
}
',`
declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
')
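;; The m4 ifelse above keys on the LLVM version: with LLVM 3.1 the blend can
;; be written inline as a select on the <WIDTH x i1> mask, while for older
;; releases the __masked_store_blend_* routines are only declared here and are
;; expected to be supplied by the specific target.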
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
')
gather_scatter(i8)
gather_scatter(i16)
gather_scatter(i32)
gather_scatter(i64)
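;; As a rough illustration of the macro expansion, gather_scatter(i8) emits
;; declarations along the lines of:
;;
;;   declare <WIDTH x i8> @__gather_base_offsets32_i8(i8 * nocapture, <WIDTH x i32>,
;;                        i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
;;   declare void @__scatter32_i8(<WIDTH x i32>, <WIDTH x i8>,
;;                        <WIDTH x i1>) nounwind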
declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
<WIDTH x i1>) nounwind
declare i32 @__packed_store_active(i32 * nocapture, <WIDTH x i32> %vals,
<WIDTH x i1>) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetch
declare void @__prefetch_read_uniform_1(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind


@@ -0,0 +1,271 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ctlztz()
define_prefetches()
define_shuffles()
aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; do the rcpss call
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration to improve precision, as above
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
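;; The refinement above is one step of Newton's method applied to
;; f(x) = 1/x - v, whose update is x' = x * (2 - v * x); a single iteration
;; roughly doubles the number of correct bits in the ~12-bit rcpss estimate.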
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
; uniform float is = extract(__rsqrt_u(v), 0);
%v = insertelement <4 x float> undef, float %0, i32 0
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
%is = extractelement <4 x float> %vis, i32 0
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul float %0, %is
%v_is_is = fmul float %v_is, %is
%three_sub = fsub float 3., %v_is_is
%is_mul = fmul float %is, %three_sub
%half_scale = fmul float 0.5, %is_mul
ret float %half_scale
}
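;; Likewise, the rsqrt refinement is one Newton-Raphson step for
;; f(x) = 1/x^2 - v, which works out to x' = 0.5 * x * (3 - v * x * x),
;; matching the sequence of multiplies above.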
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
define void @__fastmath() nounwind alwaysinline {
%ptr = alloca i32
%ptr8 = bitcast i32 * %ptr to i8 *
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
%oldval = load i32 *%ptr
; turn on DAZ (64)/FTZ (32768) -> 32832
%update = or i32 %oldval, 32832
store i32 %update, i32 *%ptr
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
ret void
}
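;; The constant 32832 is DAZ (bit 6, 0x40) plus FTZ (bit 15, 0x8000) = 0x8040:
;; denormal inputs are treated as zero and denormal results are flushed to
;; zero, while all other MXCSR bits are left unchanged.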
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
ret float %ret
}
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define double @__min_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define double @__max_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; There are not any rounding instructions in SSE2, so we have to emulate
;; the functionality with multiple instructions...
; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
; unsigned int sign = signbits(x);
; unsigned int ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; x += 0x1.0p23f;
; x -= 0x1.0p23f;
; ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; return x;
;}
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
%bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
%bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
%int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
%binop21.i = fadd float %binop.i, -8.388608e+06
%float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
%bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
ret float %int_to_float_bitcast.i.i.i
}
;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...
;export float Floor(float x) {
; float y = Round(x);
; unsigned int cmp = y > x ? 0xffffffff : 0;
; float delta = -1.f;
; unsigned int idelta = intbits(delta);
; idelta &= cmp;
; delta = floatbits(idelta);
; return y + delta;
;}
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp ogt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, -1082130432
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
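;; The constant -1082130432 is 0xBF800000, the bit pattern of -1.0f, so the
;; "and" above yields either -1.0f (when Round(x) > x) or +0.0f.  For example,
;; with x = -3.2f, Round(x) = -3.0f > x, so delta = -1.0f and the result is
;; -4.0f; with x = 3.2f the compare fails, delta = +0.0f and the result is 3.0f.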
;; And here is the code we compiled to get the __ceil* functions below
;
;export uniform float Ceil(uniform float x) {
; uniform float y = Round(x);
; uniform int yltx = y < x ? 0xffffffff : 0;
; uniform float delta = 1.f;
; uniform int idelta = intbits(delta);
; idelta &= yltx;
; delta = floatbits(idelta);
; return y + delta;
;}
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
%bincmp.i = fcmp olt float %calltmp.i, %0
%selectexpr.i = sext i1 %bincmp.i to i32
%bitop.i = and i32 %selectexpr.i, 1065353216
%int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
%binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
ret float %binop.i
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare double @round(double)
declare double @floor(double)
declare double @ceil(double)
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @round(double %0)
ret double %r
}
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @floor(double %0)
ret double %r
}
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
%r = call double @ceil(double %0)
ret double %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%val = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %val
}
define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
%val = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %val
}

builtins/target-sse2-x2.ll (new file, 643 lines)
@@ -0,0 +1,643 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;; This file defines the target for "double-pumped" SSE2, i.e. running
;; with 8-wide vectors
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; standard 8-wide definitions from m4 macros
define(`WIDTH',`8')
define(`MASK',`i32')
include(`util.m4')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
; do one N-R iteration
%v_iv = fmul <8 x float> %0, %call
%two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <8 x float> %call, %two_minus
ret <8 x float> %iv_mul
}
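;; The multiply/subtract sequence above is one Newton-Raphson step for
;; f(y) = 1/y - v, i.e. y' = y * (2 - v*y), which roughly doubles the
;; precision of the rcpps estimate.  Scalar C sketch (illustrative only):
;
; float refine_rcp(float v, float iv) {    /* iv: rough 1/v, e.g. from rcpps */
;     return iv * (2.0f - v * iv);
; }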
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <8 x float> %v, %is
%v_is_is = fmul <8 x float> %v_is, %is
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <8 x float> %is, %three_sub
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <8 x float> %half_scale
}
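;; Likewise, this is one Newton-Raphson step for f(y) = 1/y^2 - v, i.e.
;; y' = 0.5 * y * (3 - v*y*y), applied to the rsqrtps estimate.  Scalar C
;; sketch (illustrative only):
;
; float refine_rsqrt(float v, float is) {  /* is: rough 1/sqrt(v) */
;     return 0.5f * is * (3.0f - (v * is) * is);
; }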
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
ret <8 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_sinf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_cosf4, %0)
ret <8 x float> %ret
}
define void @__svml_sincos(<8 x float>, <8 x float> *,
<8 x float> *) nounwind readnone alwaysinline {
; call svml_sincosf4 two times with the two 4-wide sub-vectors
%a = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%b = shufflevector <8 x float> %0, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%cospa = alloca <4 x float>
%sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
%cospb = alloca <4 x float>
%sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
%sin = shufflevector <4 x float> %sa, <4 x float> %sb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %sin, <8 x float> * %1
%cosa = load <4 x float> * %cospa
%cosb = load <4 x float> * %cospb
%cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x float> %cos, <8 x float> * %2
ret void
}
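;; In other words, the 8-wide sincos is just two 4-wide sincos calls whose
;; sine and cosine halves are concatenated back together.  Array-level C
;; sketch, where sincos4() is a stand-in for __svml_sincosf4 (illustrative only):
;
; void sincos8(const float x[8], float s[8], float c[8]) {
;     sincos4(&x[0], &s[0], &c[0]);   /* lanes 0..3 */
;     sincos4(&x[4], &s[4], &c[4]);   /* lanes 4..7 */
; }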
define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_tanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_atanf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_atan2(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_atan2f4, %0, %1)
ret <8 x float> %ret
}
define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_expf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
unary4to8(ret, float, @__svml_logf4, %0)
ret <8 x float> %ret
}
define <8 x float> @__svml_pow(<8 x float>,
<8 x float>) nounwind readnone alwaysinline {
binary4to8(ret, float, @__svml_powf4, %0, %1)
ret <8 x float> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
ret <8 x float> %call
}
define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
ret <8 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
; There is no blend instruction with SSE2, so we simulate it with bit
; operations on i32s. For these two vselect functions, for each
; vector element, if the mask is on, we return the corresponding value
; from %1, and otherwise return the value from %0.
define <8 x i32> @__vselect_i32(<8 x i32>, <8 x i32> ,
<8 x i32> %mask) nounwind readnone alwaysinline {
%notmask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%cleared_old = and <8 x i32> %0, %notmask
%masked_new = and <8 x i32> %1, %mask
%new = or <8 x i32> %cleared_old, %masked_new
ret <8 x i32> %new
}
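;; This is the classic (old & ~mask) | (new & mask) select idiom; with an
;; all-ones or all-zeros mask per lane it acts as a per-lane "?:" operator.
;; Per-lane C sketch (illustrative only):
;
; unsigned int vselect1(unsigned int oldv, unsigned int newv, unsigned int mask) {
;     return (oldv & ~mask) | (newv & mask);   /* mask == 0xffffffff picks newv */
; }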
define <8 x float> @__vselect_float(<8 x float>, <8 x float>,
<8 x i32> %mask) nounwind readnone alwaysinline {
%v0 = bitcast <8 x float> %0 to <8 x i32>
%v1 = bitcast <8 x float> %1 to <8 x i32>
%r = call <8 x i32> @__vselect_i32(<8 x i32> %v0, <8 x i32> %v1, <8 x i32> %mask)
%rf = bitcast <8 x i32> %r to <8 x float>
ret <8 x float> %rf
}
; To do vector integer min and max, we do the vector compare and then sign
; extend the i1 vector result to an i32 mask. The __vselect does the
; rest...
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%c = icmp slt <8 x i32> %0, %1
%mask = sext <8 x i1> %c to <8 x i32>
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
ret <8 x i32> %v
}
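;; The sext turns each i1 compare result into an all-ones/all-zeros i32 lane,
;; which is exactly the mask shape __vselect_i32 expects.  Per-lane C sketch
;; (illustrative only):
;
; int lane_min(int a, int b) {
;     unsigned int mask = (a < b) ? 0xffffffffu : 0u;              /* sext of the compare */
;     return (int)(((unsigned)b & ~mask) | ((unsigned)a & mask));  /* == min(a, b) */
; }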
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp slt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%c = icmp sgt <8 x i32> %0, %1
%mask = sext <8 x i1> %c to <8 x i32>
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
ret <8 x i32> %v
}
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp sgt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
; The functions for unsigned ints are similar, just with unsigned
; comparison functions...
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%c = icmp ult <8 x i32> %0, %1
%mask = sext <8 x i1> %c to <8 x i32>
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
ret <8 x i32> %v
}
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ult i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%c = icmp ugt <8 x i32> %0, %1
%mask = sext <8 x i1> %c to <8 x i32>
%v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
ret <8 x i32> %v
}
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ugt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the second one over by 4 before ORing it with the value
; of the first one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
ret i32 %v
}
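;; movmskps returns the four sign bits of its argument in bits 0..3, so the
;; 8-wide mask is assembled by shifting the high half's bits up to positions
;; 4..7 and OR'ing.  C sketch (illustrative only):
;
; int movmsk8(int lo, int hi) {   /* lo: movmskps of lanes 0..3, hi: lanes 4..7 */
;     return lo | (hi << 4);      /* bit i is set iff lane i's mask was on */
; }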
define <4 x float> @__vec4_add_float(<4 x float> %v0,
<4 x float> %v1) nounwind readnone alwaysinline {
%v = fadd <4 x float> %v0, %v1
ret <4 x float> %v
}
define float @__add_float(float, float) nounwind readnone alwaysinline {
%v = fadd float %0, %1
ret float %v
}
define float @__reduce_add_float(<8 x float>) nounwind readnone alwaysinline {
reduce8by4(float, @__vec4_add_float, @__add_float)
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__max_varying_float, @__max_uniform_float)
}
; helper function for reduce_add_int32
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
<4 x i32> %v1) nounwind readnone alwaysinline {
%v = add <4 x i32> %v0, %v1
ret <4 x i32> %v
}
; helper function for reduce_add_int32
define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
%v = add i32 %0, %1
ret i32 %v
}
define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8by4(i32, @__vec4_add_int32, @__add_int32)
}
define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
}
define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define <4 x double> @__add_varying_double(<4 x double>,
<4 x double>) nounwind readnone alwaysinline {
%r = fadd <4 x double> %0, %1
ret <4 x double> %r
}
define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
%r = fadd double %0, %1
ret double %r
}
define double @__reduce_add_double(<8 x double>) nounwind readnone {
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
}
define double @__reduce_min_double(<8 x double>) nounwind readnone {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<8 x double>) nounwind readnone {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}
define <4 x i64> @__add_varying_int64(<4 x i64>,
<4 x i64>) nounwind readnone alwaysinline {
%r = add <4 x i64> %0, %1
ret <4 x i64> %r
}
define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
%r = add i64 %0, %1
ret i64 %r
}
define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
}
define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(8, i32, 32, 4)
masked_load(8, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; There are not any rounding instructions in SSE2, so we have to emulate
;; the functionality with multiple instructions...
; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
; unsigned int sign = signbits(x);
; unsigned int ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; x += 0x1.0p23f;
; x -= 0x1.0p23f;
; ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; return x;
;}
define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32>
%bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
%binop21.i = fadd <8 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32>
%bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float>
ret <8 x float> %int_to_float_bitcast.i.i.i
}
;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...
;export float Floor(float x) {
; float y = Round(x);
; unsigned int cmp = y > x ? 0xffffffff : 0;
; float delta = -1.f;
; unsigned int idelta = intbits(delta);
; idelta &= cmp;
; delta = floatbits(idelta);
; return y + delta;
;}
define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
%bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
%bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <8 x float> %binop.i
}
;; And here is the code we compiled to get the __ceil* functions below
;
;export uniform float Ceil(uniform float x) {
; uniform float y = Round(x);
; uniform int yltx = y < x ? 0xffffffff : 0;
; uniform float delta = 1.f;
; uniform int idelta = intbits(delta);
; idelta &= yltx;
; delta = floatbits(idelta);
; return y + delta;
;}
define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
%bincmp.i = fcmp olt <8 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
%bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
%binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <8 x float> %binop.i
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
unary1to8(double, @round)
}
define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
unary1to8(double, @floor)
}
define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
unary1to8(double, @ceil)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(8, i32, 32)
gen_masked_store(8, i64, 64)
masked_store_blend_8_16_by_8()
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
%val = load <8 x i32> * %0, align 4
%newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
store <8 x i32> %newval, <8 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
; are actually bitcast <4 x i64> values
;
; set up the first two 64-bit values
%old0123 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%old0123f = bitcast <4 x i64> %old0123 to <8 x float>
%new0123 = shufflevector <8 x i64> %new, <8 x i64> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%new0123f = bitcast <4 x i64> %new0123 to <8 x float>
; compute mask--note that the indices are doubled-up
%mask0123 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
; and blend the first 4 values
%result0123f = call <8 x float> @__vselect_float(<8 x float> %old0123f, <8 x float> %new0123f,
<8 x i32> %mask0123)
%result0123 = bitcast <8 x float> %result0123f to <4 x i64>
; and again
%old4567 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%old4567f = bitcast <4 x i64> %old4567 to <8 x float>
%new4567 = shufflevector <8 x i64> %new, <8 x i64> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%new4567f = bitcast <4 x i64> %new4567 to <8 x float>
; compute mask--note that the values are doubled-up
%mask4567 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
; and blend the second 4 values
%result4567f = call <8 x float> @__vselect_float(<8 x float> %old4567f, <8 x float> %new4567f,
<8 x i32> %mask4567)
%result4567 = bitcast <8 x float> %result4567f to <4 x i64>
; reconstruct the final <8 x i64> vector
%final = shufflevector <4 x i64> %result0123, <4 x i64> %result4567,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i64> %final, <8 x i64> * %ptr, align 8
ret void
}
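;; The mask indices are doubled up because each 64-bit lane is blended as two
;; 32-bit halves, so 64-bit lane k must see 32-bit mask element k twice.  The
;; net per-lane effect of the whole routine is simply (C sketch, illustrative only):
;
; void blend64(unsigned long long old[8], const unsigned long long newv[8],
;              const unsigned int mask[8]) {   /* mask: all-ones or zero per lane */
;     for (int k = 0; k < 8; ++k)
;         if (mask[k]) old[k] = newv[k];
; }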
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision float min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <8 x double> %ret
}
define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <8 x double> %ret
}

builtins/target-sse2.ll (new file, 585 lines)
@@ -0,0 +1,585 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Define the standard library builtins for the SSE2 target
; Define some basics for a 4-wide target
define(`WIDTH',`4')
define(`MASK',`i32')
include(`util.m4')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; There are not any rounding instructions in SSE2, so we have to emulate
;; the functionality with multiple instructions...
; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
; unsigned int sign = signbits(x);
; unsigned int ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; x += 0x1.0p23f;
; x -= 0x1.0p23f;
; ix = intbits(x);
; ix ^= sign;
; x = floatbits(ix);
; return x;
;}
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
%float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
%bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%bitop.i = xor <4 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i40.i = bitcast <4 x i32> %bitop.i to <4 x float>
%binop.i = fadd <4 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
%binop21.i = fadd <4 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
%float_to_int_bitcast.i.i.i = bitcast <4 x float> %binop21.i to <4 x i32>
%bitop31.i = xor <4 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop31.i to <4 x float>
ret <4 x float> %int_to_float_bitcast.i.i.i
}
;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...
;export float Floor(float x) {
; float y = Round(x);
; unsigned int cmp = y > x ? 0xffffffff : 0;
; float delta = -1.f;
; unsigned int idelta = intbits(delta);
; idelta &= cmp;
; delta = floatbits(idelta);
; return y + delta;
;}
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
%bincmp.i = fcmp ogt <4 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
%bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
%binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <4 x float> %binop.i
}
;; And here is the code we compiled to get the __ceil* functions below
;
;export uniform float Ceil(uniform float x) {
; uniform float y = Round(x);
; uniform int yltx = y < x ? 0xffffffff : 0;
; uniform float delta = 1.f;
; uniform int idelta = intbits(delta);
; idelta &= yltx;
; delta = floatbits(idelta);
; return y + delta;
;}
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
%calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
%bincmp.i = fcmp olt <4 x float> %calltmp.i, %0
%val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
%bitop.i = and <4 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
%int_to_float_bitcast.i.i.i = bitcast <4 x i32> %bitop.i to <4 x float>
%binop.i = fadd <4 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
ret <4 x float> %binop.i
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @round)
}
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @floor)
}
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @ceil)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
; There is no blend instruction with SSE2, so we simulate it with bit
; operations on i32s. For these two vselect functions, for each
; vector element, if the mask is on, we return the corresponding value
; from %1, and otherwise return the value from %0.
define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
<4 x i32> %mask) nounwind readnone alwaysinline {
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
%cleared_old = and <4 x i32> %0, %notmask
%masked_new = and <4 x i32> %1, %mask
%new = or <4 x i32> %cleared_old, %masked_new
ret <4 x i32> %new
}
define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
<4 x i32> %mask) nounwind readnone alwaysinline {
%v0 = bitcast <4 x float> %0 to <4 x i32>
%v1 = bitcast <4 x float> %1 to <4 x i32>
%r = call <4 x i32> @__vselect_i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %mask)
%rf = bitcast <4 x i32> %r to <4 x float>
ret <4 x float> %rf
}
; To do vector integer min and max, we do the vector compare and then sign
; extend the i1 vector result to an i32 mask. The __vselect does the
; rest...
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%c = icmp slt <4 x i32> %0, %1
%mask = sext <4 x i1> %c to <4 x i32>
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
ret <4 x i32> %v
}
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp slt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%c = icmp sgt <4 x i32> %0, %1
%mask = sext <4 x i1> %c to <4 x i32>
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
ret <4 x i32> %v
}
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp sgt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
; The functions for unsigned ints are similar, just with unsigned
; comparison functions...
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%c = icmp ult <4 x i32> %0, %1
%mask = sext <4 x i1> %c to <4 x i32>
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
ret <4 x i32> %v
}
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ult i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%c = icmp ugt <4 x i32> %0, %1
%mask = sext <4 x i1> %c to <4 x i32>
%v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
ret <4 x i32> %v
}
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ugt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
}
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%m1 = fadd <4 x float> %v1, %v
%m1a = extractelement <4 x float> %m1, i32 0
%m1b = extractelement <4 x float> %m1, i32 1
%sum = fadd float %m1a, %m1b
ret float %sum
}
define float @__reduce_min_float(<4 x float>) nounwind readnone {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<4 x float>) nounwind readnone {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}
define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%m1 = add <4 x i32> %v1, %v
%m1a = extractelement <4 x i32> %m1, i32 0
%m1b = extractelement <4 x i32> %m1, i32 1
%sum = add i32 %m1a, %m1b
ret i32 %sum
}
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define double @__reduce_add_double(<4 x double>) nounwind readnone {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 2, i32 3>
%sum = fadd <2 x double> %v0, %v1
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define double @__reduce_min_double(<4 x double>) nounwind readnone {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<4 x double>) nounwind readnone {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%sum = add <2 x i64> %v0, %v1
%e0 = extractelement <2 x i64> %sum, i32 0
%e1 = extractelement <2 x i64> %sum, i32 1
%m = add i64 %e0, %e1
ret i64 %m
}
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
%val = load <4 x i32> * %0, align 4
%newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
store <4 x i32> %newval, <4 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
; are actually bitcast <2 x i64> values
;
; set up the first two 64-bit values
%old01 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%old01f = bitcast <2 x i64> %old01 to <4 x float>
%new01 = shufflevector <4 x i64> %new, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%new01f = bitcast <2 x i64> %new01 to <4 x float>
; compute mask--note that the indices 0 and 1 are doubled-up
%mask01 = shufflevector <4 x i32> %mask, <4 x i32> undef,
<4 x i32> <i32 0, i32 0, i32 1, i32 1>
; and blend the first two values
%result01f = call <4 x float> @__vselect_float(<4 x float> %old01f, <4 x float> %new01f, <4 x i32> %mask01)
%result01 = bitcast <4 x float> %result01f to <2 x i64>
; and again
%old23 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%old23f = bitcast <2 x i64> %old23 to <4 x float>
%new23 = shufflevector <4 x i64> %new, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%new23f = bitcast <2 x i64> %new23 to <4 x float>
; compute mask--note that the values 2 and 3 are doubled-up
%mask23 = shufflevector <4 x i32> %mask, <4 x i32> undef,
<4 x i32> <i32 2, i32 2, i32 3, i32 3>
; and blend the second two values
%result23f = call <4 x float> @__vselect_float(<4 x float> %old23f, <4 x float> %new23f, <4 x i32> %mask23)
%result23 = bitcast <4 x float> %result23f to <2 x i64>
; reconstruct the final <4 x i64> vector
%final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x i64> %final, <4 x i64> * %ptr, align 8
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <4 x float> %0, %call
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <4 x float> %call, %two_minus
ret <4 x float> %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <4 x float> %v, %is
%v_is_is = fmul <4 x float> %v_is, %is
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <4 x float> %is, %three_sub
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <4 x float> %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <4 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <4 x double> %ret
}
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <4 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
masked_load(4, i8, 8, 1)
masked_load(4, i16, 16, 2)
masked_load(4, i32, 32, 4)
masked_load(4, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)

(new file, 276 lines)
@@ -0,0 +1,276 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ctlztz()
define_prefetches()
define_shuffles()
aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
; the roundss intrinsic is a total mess--docs say:
;
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
;
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
; on b0. The higher order 96 bits are copied directly from input parameter a. The
; return value is described by the following equations:
;
; r0 = RND(b0)
; r1 = a1
; r2 = a2
; r3 = a3
;
; It doesn't matter what we pass as a, since we only need the r0 value
; here. So we pass the same register for both. Further, only the 0th
; element of the b parameter matters
%xi = insertelement <4 x float> undef, float %0, i32 0
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
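;; The immediates 8, 9, and 10 used here and below are the SSE4.1 rounding
;; control: bits 0..1 pick the mode (00 nearest, 01 down, 10 up, 11 truncate)
;; and bit 3 (value 8) suppresses precision exceptions, giving 8 = nearest,
;; 9 = floor, 10 = ceil.  Intrinsics-level C sketch of this function
;; (illustrative only, assuming <smmintrin.h>):
;
; #include <smmintrin.h>
;
; float round_nearest(float x) {
;     __m128 v = _mm_set_ss(x);
;     v = _mm_round_ss(v, v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
;     return _mm_cvtss_f32(v);
; }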
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss intrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0
; roundss, round down 0b01 | don't signal precision exceptions 0b1000 -> 0b1001 = 9
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss intrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0
; roundss, round up 0b10 | don't signal precision exceptions 0b1000 -> 0b1010 = 10
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
%xi = insertelement <2 x double> undef, double %0, i32 0
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss intrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundsd, round down 0b01 | don't signal precision exceptions 0b1000 -> 0b1001 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss intrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundsd, round up 0b10 | don't signal precision exceptions 0b1000 -> 0b1010 = 10
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; do the rcpss call
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration to improve precision, as above
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
; uniform float is = extract(__rsqrt_u(v), 0);
%v = insertelement <4 x float> undef, float %0, i32 0
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
%is = extractelement <4 x float> %vis, i32 0
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul float %0, %is
%v_is_is = fmul float %v_is, %is
%three_sub = fsub float 3., %v_is_is
%is_mul = fmul float %is, %three_sub
%half_scale = fmul float 0.5, %is_mul
ret float %half_scale
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
define void @__fastmath() nounwind alwaysinline {
%ptr = alloca i32
%ptr8 = bitcast i32 * %ptr to i8 *
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
%oldval = load i32 *%ptr
; turn on DAZ (64)/FTZ (32768) -> 32832
%update = or i32 %oldval, 32832
store i32 %update, i32 *%ptr
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
ret float %ret
}
define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define double @__min_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define double @__max_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret i32 %ret
}
define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret i32 %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
ret i32 %ret
}
define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret i32 %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
}
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}


@@ -36,18 +36,31 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; standard 8-wide definitions from m4 macros
-stdlib_core(8)
-packed_load_and_store(8)
-scans(8)
-int64minmax(8)
+define(`WIDTH',`8')
+define(`MASK',`i32')
+include(`util.m4')
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+include(`target-sse4-common.ll')
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
-define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
 ; float iv = __rcp_v(v);
 ; return iv * (2. - v * iv);
@@ -60,27 +73,12 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
 ret <8 x float> %iv_mul
 }
-define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
-; uniform float iv = extract(__rcp_u(v), 0);
-; return iv * (2. - v * iv);
-%vecval = insertelement <4 x float> undef, float %0, i32 0
-%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
-%scall = extractelement <4 x float> %call, i32 0
-; do one N-R iteration
-%v_iv = fmul float %0, %scall
-%two_minus = fsub float 2., %v_iv
-%iv_mul = fmul float %scall, %two_minus
-ret float %iv_mul
-}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rsqrt
 declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
-define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
 ; float is = __rsqrt_v(v);
 unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
 ; return 0.5 * is * (3. - (v * is) * is);
@@ -94,56 +92,16 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read
 ret <8 x float> %half_scale
 }
-define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
-; uniform float is = extract(__rsqrt_u(v), 0);
-%v = insertelement <4 x float> undef, float %0, i32 0
-%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
-%is = extractelement <4 x float> %vis, i32 0
-; return 0.5 * is * (3. - (v * is) * is);
-%v_is = fmul float %0, %is
-%v_is_is = fmul float %v_is, %is
-%three_sub = fsub float 3., %v_is_is
-%is_mul = fmul float %is, %three_sub
-%half_scale = fmul float 0.5, %is_mul
-ret float %half_scale
-}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; sqrt
 declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
-define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
 unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
 ret <8 x float> %call
 }
-define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
-sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
-ret float %ret
-}
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; fast math
-declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
-declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
-define internal void @__fastmath() nounwind alwaysinline {
-%ptr = alloca i32
-%ptr8 = bitcast i32 * %ptr to i8 *
-call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
-%oldval = load i32 *%ptr
-; turn on DAZ (64)/FTZ (32768) -> 32832
-%update = or i32 %oldval, 32832
-store i32 %update, i32 *%ptr
-call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
-ret void
-}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
@@ -158,17 +116,17 @@ declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
-define internal <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
 unary4to8(ret, float, @__svml_sinf4, %0)
 ret <8 x float> %ret
 }
-define internal <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
 unary4to8(ret, float, @__svml_cosf4, %0)
 ret <8 x float> %ret
 }
-define internal void @__svml_sincos(<8 x float>, <8 x float> *,
+define void @__svml_sincos(<8 x float>, <8 x float> *,
 <8 x float> *) nounwind readnone alwaysinline {
 ; call svml_sincosf4 two times with the two 4-wide sub-vectors
 %a = shufflevector <8 x float> %0, <8 x float> undef,
@@ -197,33 +155,33 @@ define internal void @__svml_sincos(<8 x float>, <8 x float> *,
 ret void
 }
-define internal <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
 unary4to8(ret, float, @__svml_tanf4, %0)
 ret <8 x float> %ret
 }
-define internal <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
 unary4to8(ret, float, @__svml_atanf4, %0)
 ret <8 x float> %ret
 }
-define internal <8 x float> @__svml_atan2(<8 x float>,
+define <8 x float> @__svml_atan2(<8 x float>,
 <8 x float>) nounwind readnone alwaysinline {
 binary4to8(ret, float, @__svml_atan2f4, %0, %1)
 ret <8 x float> %ret
 }
-define internal <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
 unary4to8(ret, float, @__svml_expf4, %0)
 ret <8 x float> %ret
 }
-define internal <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
 unary4to8(ret, float, @__svml_logf4, %0)
 ret <8 x float> %ret
 }
-define internal <8 x float> @__svml_pow(<8 x float>,
+define <8 x float> @__svml_pow(<8 x float>,
 <8 x float>) nounwind readnone alwaysinline {
 binary4to8(ret, float, @__svml_powf4, %0, %1)
 ret <8 x float> %ret
@@ -234,91 +192,52 @@ define internal <8 x float> @__svml_pow(<8 x float>,
 ;; float min/max
 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
-define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
 binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
 ret <8 x float> %call
 }
-define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
-sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
-ret float %ret
-}
-define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
 binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
 ret <8 x float> %call
 }
-define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
-sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
-ret float %ret
-}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int32 min/max
-declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
-declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
-define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
 binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
 ret <8 x i32> %call
 }
-define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
-sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
-ret i32 %ret
-}
-define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
 binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
 ret <8 x i32> %call
 }
-define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
-sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-ret i32 %ret
-}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; unsigned int min/max
-declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
-declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
-define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
+define <8 x i32> @__min_varying_uint32(<8 x i32>,
 <8 x i32>) nounwind readonly alwaysinline {
 binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
 ret <8 x i32> %call
 }
-define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
-sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
-ret i32 %ret
-}
-define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
+define <8 x i32> @__max_varying_uint32(<8 x i32>,
 <8 x i32>) nounwind readonly alwaysinline {
 binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
 ret <8 x i32> %call
 }
-define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
-sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-ret i32 %ret
-}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
-define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
 ; first do two 4-wide movmsk calls
 %floatmask = bitcast <8 x i32> %0 to <8 x float>
 %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -335,103 +254,103 @@ define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
 ret i32 %v
 }
-define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
+define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
 reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
 }
-define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
+define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
 reduce8by4(float, @llvm.x86.sse.max.ps, @__max_uniform_float)
 }
 ; helper function for reduce_add_int32
-define internal <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
+define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
 <4 x i32> %v1) nounwind readnone alwaysinline {
 %v = add <4 x i32> %v0, %v1
 ret <4 x i32> %v
 }
 ; helper function for reduce_add_int32
-define internal i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
+define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
 %v = add i32 %0, %1
 ret i32 %v
 }
-define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
 reduce8by4(i32, @__vec4_add_int32, @__add_int32)
 }
-define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
 reduce8by4(i32, @llvm.x86.sse41.pminsd, @__min_uniform_int32)
 }
-define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
 reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
 }
-define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
+define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
 %r = call i32 @__reduce_add_int32(<8 x i32> %v)
 ret i32 %r
 }
-define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
 reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
 }
-define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
+define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
 reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
 }
-define internal <4 x double> @__add_varying_double(<4 x double>,
+define <4 x double> @__add_varying_double(<4 x double>,
 <4 x double>) nounwind readnone alwaysinline {
 %r = fadd <4 x double> %0, %1
 ret <4 x double> %r
 }
-define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
+define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
 %r = fadd double %0, %1
 ret double %r
 }
-define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
+define double @__reduce_add_double(<8 x double>) nounwind readnone {
 reduce8by4(double, @__add_varying_double, @__add_uniform_double)
 }
-define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
+define double @__reduce_min_double(<8 x double>) nounwind readnone {
 reduce8(double, @__min_varying_double, @__min_uniform_double)
 }
-define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
+define double @__reduce_max_double(<8 x double>) nounwind readnone {
 reduce8(double, @__max_varying_double, @__max_uniform_double)
 }
-define internal <4 x i64> @__add_varying_int64(<4 x i64>,
+define <4 x i64> @__add_varying_int64(<4 x i64>,
 <4 x i64>) nounwind readnone alwaysinline {
 %r = add <4 x i64> %0, %1
 ret <4 x i64> %r
 }
-define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
 %r = add i64 %0, %1
 ret i64 %r
 }
-define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
+define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
 reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
 }
-define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
+define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
 reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
 }
-define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
+define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
 reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
 }
-define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
+define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
 reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }
-define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
+define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
 reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }
@@ -445,10 +364,10 @@ load_and_broadcast(8, i16, 16)
 load_and_broadcast(8, i32, 32)
 load_and_broadcast(8, i64, 64)
-load_masked(8, i8, 8, 1)
-load_masked(8, i16, 16, 2)
-load_masked(8, i32, 32, 4)
-load_masked(8, i64, 64, 8)
+masked_load(8, i8, 8, 1)
+masked_load(8, i16, 16, 2)
+masked_load(8, i32, 32, 4)
+masked_load(8, i64, 64, 8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
@@ -467,129 +386,47 @@ gen_scatter(8, i64)
 ;; float rounding
 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
-declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
-define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
 ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
 round4to8(%0, 8)
 }
-define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
-; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
-; the roundss intrinsic is a total mess--docs say:
-;
-; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
-;
-; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
-; on b0. The higher order 96 bits are copied directly from input parameter a. The
-; return value is described by the following equations:
-;
-; r0 = RND(b0)
-; r1 = a1
-; r2 = a2
-; r3 = a3
-;
-; It doesn't matter what we pass as a, since we only need the r0 value
-; here. So we pass the same register for both.
-%xi = insertelement <4 x float> undef, float %0, i32 0
-%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
-%rs = extractelement <4 x float> %xr, i32 0
-ret float %rs
-}
-define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
 ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
 round4to8(%0, 9)
 }
-define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
-; see above for round_ss instrinsic discussion...
-%xi = insertelement <4 x float> undef, float %0, i32 0
-; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
-%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
-%rs = extractelement <4 x float> %xr, i32 0
-ret float %rs
-}
-define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
 ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
 round4to8(%0, 10)
 }
-define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
-; see above for round_ss instrinsic discussion...
-%xi = insertelement <4 x float> undef, float %0, i32 0
-; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
-%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
-%rs = extractelement <4 x float> %xr, i32 0
-ret float %rs
-}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding doubles
 declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
-declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
-define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
 round2to8double(%0, 8)
 }
-define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
-%xi = insertelement <2 x double> undef, double %0, i32 0
-%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
-%rs = extractelement <2 x double> %xr, i32 0
-ret double %rs
-}
-define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
 ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
 round2to8double(%0, 9)
 }
-define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
-; see above for round_ss instrinsic discussion...
-%xi = insertelement <2 x double> undef, double %0, i32 0
-; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
-%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
-%rs = extractelement <2 x double> %xr, i32 0
-ret double %rs
-}
-define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
 ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
 round2to8double(%0, 10)
 }
-define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
-; see above for round_ss instrinsic discussion...
-%xi = insertelement <2 x double> undef, double %0, i32 0
-; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
-%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
-%rs = extractelement <2 x double> %xr, i32 0
-ret double %rs
-}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
-declare i32 @llvm.ctpop.i32(i32) nounwind readnone
-define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
-%call = call i32 @llvm.ctpop.i32(i32 %0)
-ret i32 %call
-}
-declare i64 @llvm.ctpop.i64(i64) nounwind readnone
-define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
-%call = call i64 @llvm.ctpop.i64(i64 %0)
-ret i64 %call
-}
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
-define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
+define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
 %a = shufflevector <8 x float> %0, <8 x float> undef,
 <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %b = shufflevector <8 x float> %0, <8 x float> undef,
@@ -718,44 +555,24 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
 ;; double precision sqrt
 declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
+define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
 unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
 ret <8 x double> %ret
 }
-define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
-sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.pd, %0)
-ret double %ret
-}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision float min/max
 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
 binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
 ret <8 x double> %ret
 }
-define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
-sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.pd, %0, %1)
-ret double %ret
-}
-define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
 binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
 ret <8 x double> %ret
 }
-define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
-sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.pd, %0, %1)
-ret double %ret
-}

builtins/target-sse4.ll (new file, 484 lines)
@@ -0,0 +1,484 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Define common 4-wide stuff
define(`WIDTH',`4')
define(`MASK',`i32')
include(`util.m4')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
%v_iv = fmul <4 x float> %0, %call
%two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
%iv_mul = fmul <4 x float> %call, %two_minus
ret <4 x float> %iv_mul
}
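
(The comments above spell out the refinement: rcpps gives only an approximate reciprocal, and one Newton-Raphson step brings it close to full single precision. An illustrative C++ equivalent using the SSE intrinsics -- a sketch, not part of the ispc sources:)

    #include <xmmintrin.h>

    // iv = rcp(v); return iv * (2 - v * iv)  -- one N-R step on the estimate.
    static inline __m128 rcp_nr(__m128 v) {
        __m128 iv  = _mm_rcp_ps(v);                      // hardware estimate
        __m128 two = _mm_set1_ps(2.0f);
        return _mm_mul_ps(iv, _mm_sub_ps(two, _mm_mul_ps(v, iv)));
    }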
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
%is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <4 x float> %v, %is
%v_is_is = fmul <4 x float> %v_is, %is
%three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <4 x float> %is, %three_sub
%half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <4 x float> %half_scale
}
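
(Same idea for rsqrt: the rsqrtps estimate is refined with one Newton-Raphson step, 0.5 * is * (3 - (v * is) * is). Illustrative C++ sketch:)

    #include <xmmintrin.h>

    static inline __m128 rsqrt_nr(__m128 v) {
        __m128 is      = _mm_rsqrt_ps(v);                          // hardware estimate
        __m128 v_is_is = _mm_mul_ps(_mm_mul_ps(v, is), is);        // (v * is) * is
        __m128 t       = _mm_sub_ps(_mm_set1_ps(3.0f), v_is_is);   // 3 - (v*is)*is
        return _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(0.5f), is), t);   // 0.5 * is * t
    }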
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <4 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
ret <4 x float> %call
}
define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
ret <4 x float> %call
}
define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
ret <4 x float> %call
}
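
(The roundps immediates used above pack the rounding mode into the low two bits and set bit 3 (value 8) to suppress precision exceptions, giving 8, 9, and 10 for nearest, floor, and ceil. The equivalent intrinsic spellings, as an illustrative sketch:)

    #include <smmintrin.h>

    // _MM_FROUND_TO_NEAREST_INT = 0, _MM_FROUND_TO_NEG_INF = 1,
    // _MM_FROUND_TO_POS_INF = 2, _MM_FROUND_NO_EXC = 8.
    __m128 round_nearest4(__m128 v) { return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } // imm = 8
    __m128 floor4(__m128 v)         { return _mm_round_ps(v, _MM_FROUND_TO_NEG_INF     | _MM_FROUND_NO_EXC); } // imm = 9
    __m128 ceil4(__m128 v)          { return _mm_round_ps(v, _MM_FROUND_TO_POS_INF     | _MM_FROUND_NO_EXC); } // imm = 10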
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
round2to4double(%0, 8)
}
define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
round2to4double(%0, 9)
}
define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
round2to4double(%0, 10)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
%call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
ret <4 x float> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max
define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max
define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
%call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
ret <4 x i32> %call
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <4 x double> %ret
}
define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <4 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff
declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
ret <4 x float> %ret
}
define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
%s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
store <4 x float> %s, <4 x float> * %1
ret void
}
define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_expf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_logf4(<4 x float> %0)
ret <4 x float> %ret
}
define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
%ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
ret <4 x float> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
}
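
(__movmsk works because movmskps packs the sign bit of each 32-bit lane into the low bits of an integer; the all-ones/all-zeros i32 mask is bitcast to float first so its sign bits carry the per-lane flags. Illustrative C++ sketch:)

    #include <emmintrin.h>

    static inline int movmsk4(__m128i mask) {
        // Reinterpret the integer mask as floats and collect the four sign bits.
        return _mm_movemask_ps(_mm_castsi128_ps(mask));
    }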
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
%v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
%v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
%scalar = extractelement <4 x float> %v2, i32 0
ret float %scalar
}
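
(__reduce_add_float does the horizontal sum with two rounds of haddps: the first produces pairwise sums, the second leaves the total in every lane, and lane 0 is extracted. Equivalent C++ sketch, illustrative only:)

    #include <pmmintrin.h>

    static inline float reduce_add4(__m128 v) {
        __m128 t = _mm_hadd_ps(v, v);   // <a+b, c+d, a+b, c+d>
        t = _mm_hadd_ps(t, t);          // every lane = a+b+c+d
        return _mm_cvtss_f32(t);
    }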
define float @__reduce_min_float(<4 x float>) nounwind readnone {
reduce4(float, @__min_varying_float, @__min_uniform_float)
}
define float @__reduce_max_float(<4 x float>) nounwind readnone {
reduce4(float, @__max_varying_float, @__max_uniform_float)
}
define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
%v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%m1 = add <4 x i32> %v1, %v
%m1a = extractelement <4 x i32> %m1, i32 0
%m1b = extractelement <4 x i32> %m1, i32 1
%sum = add i32 %m1a, %m1b
ret i32 %sum
}
define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
}
define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
define double @__reduce_add_double(<4 x double>) nounwind readnone {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 2, i32 3>
%sum = fadd <2 x double> %v0, %v1
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define double @__reduce_min_double(<4 x double>) nounwind readnone {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define double @__reduce_max_double(<4 x double>) nounwind readnone {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%sum = add <2 x i64> %v0, %v1
%e0 = extractelement <2 x i64> %sum, i32 0
%e1 = extractelement <2 x i64> %sum, i32 1
%m = add i64 %e0, %e1
ret i64 %m
}
define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}
define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}
define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
%oldValue = load <4 x i32>* %0, align 4
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
%newAsFloat = bitcast <4 x i32> %1 to <4 x float>
%blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
<4 x float> %newAsFloat,
<4 x float> %mask_as_float)
%blendAsInt = bitcast <4 x float> %blend to <4 x i32>
store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
ret void
}
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
%mask = bitcast <4 x i32> %i32mask to <4 x float>
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
; are actually bitcast <2 x i64> values
;
; set up the first two 64-bit values
%old01 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%old01f = bitcast <2 x i64> %old01 to <4 x float>
%new01 = shufflevector <4 x i64> %new, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%new01f = bitcast <2 x i64> %new01 to <4 x float>
; compute mask--note that the indices 0 and 1 are doubled-up
%mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
<4 x i32> <i32 0, i32 0, i32 1, i32 1>
; and blend the two of the values
%result01f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old01f,
<4 x float> %new01f,
<4 x float> %mask01)
%result01 = bitcast <4 x float> %result01f to <2 x i64>
; and again
%old23 = shufflevector <4 x i64> %oldValue, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%old23f = bitcast <2 x i64> %old23 to <4 x float>
%new23 = shufflevector <4 x i64> %new, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%new23f = bitcast <2 x i64> %new23 to <4 x float>
; compute mask--note that the values 2 and 3 are doubled-up
%mask23 = shufflevector <4 x float> %mask, <4 x float> undef,
<4 x i32> <i32 2, i32 2, i32 3, i32 3>
; and blend the two of the values
%result23f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old23f,
<4 x float> %new23f,
<4 x float> %mask23)
%result23 = bitcast <4 x float> %result23f to <2 x i64>
; reconstruct the final <4 x i64> vector
%final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x i64> %final, <4 x i64> * %ptr, align 8
ret void
}
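
(The function above works around the lack of a 64-bit variable blend in SSE4.1: each 32-bit mask lane is duplicated so both halves of every 64-bit element see the same selector, and the blend is done with blendvps. A compact C++ sketch of that trick for one <2 x i64> half -- illustrative, with hypothetical names:)

    #include <smmintrin.h>

    // oldv/newv hold two 64-bit elements; mask2 carries the two selectors in lanes 0 and 1.
    static inline __m128i blend64_via_blendvps(__m128i oldv, __m128i newv, __m128 mask2) {
        __m128 m = _mm_shuffle_ps(mask2, mask2, _MM_SHUFFLE(1, 1, 0, 0));  // <m0,m0,m1,m1>
        __m128 r = _mm_blendv_ps(_mm_castsi128_ps(oldv), _mm_castsi128_ps(newv), m);
        return _mm_castps_si128(r);
    }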
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
masked_load(4, i8, 8, 1)
masked_load(4, i16, 16, 2)
masked_load(4, i32, 32, 4)
masked_load(4, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)

builtins/util.m4 (new file, 3054 lines -- diff suppressed because it is too large)

cbackend.cpp (new file, 4466 lines -- diff suppressed because it is too large)

ctx.cpp (2775 lines -- diff suppressed because it is too large)

ctx.h (353 lines)

@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010-2011, Intel Corporation
+Copyright (c) 2010-2012, Intel Corporation
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
@@ -39,11 +39,10 @@
 #define ISPC_CTX_H 1
 #include "ispc.h"
+#include <map>
 #include <llvm/InstrTypes.h>
 #include <llvm/Instructions.h>
-#ifndef LLVM_2_8
 #include <llvm/Analysis/DIBuilder.h>
-#endif
 #include <llvm/Analysis/DebugInfo.h>
 struct CFInfo;
@@ -59,17 +58,22 @@ struct CFInfo;
 class FunctionEmitContext {
 public:
 /** Create a new FunctionEmitContext.
-@param returnType The return type of the function
-@param function LLVM function in the current module that corresponds
-to the function
+@param function The Function object representing the function
 @param funSym Symbol that corresponds to the function
+@param llvmFunction LLVM function in the current module that corresponds
+to the function
 @param firstStmtPos Source file position of the first statement in the
 function
 */
-FunctionEmitContext(const Type *returnType, llvm::Function *function, Symbol *funSym,
+FunctionEmitContext(Function *function, Symbol *funSym,
+llvm::Function *llvmFunction,
 SourcePos firstStmtPos);
 ~FunctionEmitContext();
+/** Returns the Function * corresponding to the function that we're
+currently generating code for. */
+const Function *GetFunction() const;
 /** @name Current basic block management
 @{
 */
@@ -83,20 +87,33 @@ public:
 /** @name Mask management
 @{
 */
-/** Returns the current mask value */
-llvm::Value *GetMask();
+/** Returns the mask value at entry to the current function. */
+llvm::Value *GetFunctionMask();
+/** Returns the mask value corresponding to "varying" control flow
+within the current function. (i.e. this doesn't include the effect
+of the mask at function entry. */
+llvm::Value *GetInternalMask();
+/** Returns the complete current mask value--i.e. the logical AND of
+the function entry mask and the internal mask. */
+llvm::Value *GetFullMask();
+/** Returns a pointer to storage in memory that stores the current full
+mask. */
+llvm::Value *GetFullMaskPointer();
 /** Provides the value of the mask at function entry */
-void SetEntryMask(llvm::Value *val);
+void SetFunctionMask(llvm::Value *val);
-/** Sets the mask to a new value */
+/** Sets the internal mask to a new value */
-void SetMask(llvm::Value *val);
+void SetInternalMask(llvm::Value *val);
-/** Sets the mask to (oldMask & val) */
+/** Sets the internal mask to (oldMask & val) */
-void MaskAnd(llvm::Value *oldMask, llvm::Value *val);
+void SetInternalMaskAnd(llvm::Value *oldMask, llvm::Value *val);
-/** Sets the mask to (oldMask & ~val) */
+/** Sets the internal mask to (oldMask & ~val) */
-void MaskAndNot(llvm::Value *oldMask, llvm::Value *test);
+void SetInternalMaskAndNot(llvm::Value *oldMask, llvm::Value *test);
 /** Emits a branch instruction to the basic block btrue if any of the
 lanes of current mask are on and bfalse if none are on. */
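
(The new API above splits the old single mask into a function-entry mask and an internal, control-flow mask, with the "full" mask defined as their logical AND. A tiny sketch of that invariant with a hypothetical lane-mask helper -- not the actual ctx.cpp implementation:)

    #include <cstdint>

    struct MaskState {
        uint64_t functionMask;   // mask the function was entered with
        uint64_t internalMask;   // mask from varying if/loop/switch nesting
        // Full mask = function-entry mask AND internal mask, as documented above.
        uint64_t FullMask() const { return functionMask & internalMask; }
    };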
@@ -115,9 +132,8 @@ public:
 @{
 */
 /** Notifies the FunctionEmitContext that we're starting emission of an
-'if' statement with a uniform test. The value of the mask going
-into the 'if' statement is provided in the oldMask parameter. */
-void StartUniformIf(llvm::Value *oldMask);
+'if' statement with a uniform test. */
+void StartUniformIf();
 /** Notifies the FunctionEmitContext that we're starting emission of an
 'if' statement with a varying test. The value of the mask going
@@ -132,10 +148,9 @@ public:
 for a loop. Basic blocks are provides for where 'break' and
 'continue' statements should jump to (if all running lanes want to
 break or continue), uniformControlFlow indicates whether the loop
-condition is 'uniform', and oldMask provides the current mask going
-into the loop. */
+condition is 'uniform'. */
 void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
-bool uniformControlFlow, llvm::Value *oldMask);
+bool uniformControlFlow);
 /** Informs FunctionEmitContext of the value of the mask at the start
 of a loop body. */
@@ -145,6 +160,11 @@ public:
 finished. */
 void EndLoop();
+/** Indicates that code generation for a 'foreach' or 'foreach_tiled'
+loop is about to start. */
+void StartForeach();
+void EndForeach();
 /** Emit code for a 'break' statement in a loop. If doCoherenceCheck
 is true, then if we're in a 'varying' loop, code will be emitted to
 see if all of the lanes want to break, in which case a jump to the
@@ -165,10 +185,69 @@ public:
 previous iteration. */
 void RestoreContinuedLanes();
+/** Indicates that code generation for a "switch" statement is about to
+start. isUniform indicates whether the "switch" value is uniform,
+and bbAfterSwitch gives the basic block immediately following the
+"switch" statement. (For example, if the switch condition is
+uniform, we jump here upon executing a "break" statement.) */
+void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
+/** Indicates the end of code generation for a "switch" statement. */
+void EndSwitch();
+/** Emits code for a "switch" statement in the program.
+@param expr Gives the value of the expression after the "switch"
+@param defaultBlock Basic block to execute for the "default" case. This
+should be NULL if there is no "default" label inside
+the switch.
+@param caseBlocks vector that stores the mapping from label values
+after "case" statements to basic blocks corresponding
+to the "case" labels.
+@param nextBlocks For each basic block for a "case" or "default"
+label, this gives the basic block for the
+immediately-following "case" or "default" label (or
+the basic block after the "switch" statement for the
+last label.)
+*/
+void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
+const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
+const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);
+/** Generates code for a "default" label after a "switch" statement.
+The checkMask parameter indicates whether additional code should be
+generated to check to see if the execution mask is all off after
+the default label (in which case a jump to the following label will
+be issued. */
+void EmitDefaultLabel(bool checkMask, SourcePos pos);
+/** Generates code for a "case" label after a "switch" statement. See
+the documentation for EmitDefaultLabel() for discussion of the
+checkMask parameter. */
+void EmitCaseLabel(int value, bool checkMask, SourcePos pos);
 /** Returns the current number of nested levels of 'varying' control
 flow */
 int VaryingCFDepth() const;
+bool InForeachLoop() const;
+/** Temporarily disables emission of performance warnings from gathers
+and scatters from subsequent code. */
+void DisableGatherScatterWarnings();
+/** Reenables emission of gather/scatter performance warnings. */
+void EnableGatherScatterWarnings();
+void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
+/** Step through the code and find label statements; create a basic
+block for each one, so that subsequent calls to
+GetLabeledBasicBlock() return the corresponding basic block. */
+void InitializeLabelMap(Stmt *code);
+/** If there is a label in the function with the given name, return the
+new basic block that it starts. */
+llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
 /** Called to generate code for 'return' statement; value is the
 expression in the return statement (if non-NULL), and
 doCoherenceCheck indicates whether instructions should be generated
@@ -188,6 +267,10 @@ public:
i1 value that indicates if all of the mask lanes are on. */ i1 value that indicates if all of the mask lanes are on. */
llvm::Value *All(llvm::Value *mask); llvm::Value *All(llvm::Value *mask);
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i1 value that indicates if all of the mask lanes are off. */
llvm::Value *None(llvm::Value *mask);
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i32 value wherein the i'th bit is on if and only if the i'th lane
of the mask is on. */
@@ -210,9 +293,6 @@ public:
i32. */
llvm::Value *I1VecToBoolVec(llvm::Value *b);
/** Returns the size of the given type. */
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);
/** If the user has asked to compile the program with instrumentation,
this inserts a callback to the user-supplied instrumentation
function at the current point in the code. */
@@ -296,12 +376,18 @@ public:
llvm::CmpInst::Predicate pred,
llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
/** Given a scalar value, return a vector of the same type (or an
array, for pointer types). */
llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
const char *name = NULL);
llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
const char *name = NULL);
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
const char *name = NULL);
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
const char *name = NULL);
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
@@ -313,26 +399,48 @@ public:
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
const char *name = NULL);

/** Given two integer-typed values (but possibly one vector and the
other not, and/or of possibly-different bit-widths), update their
values as needed so that the two have the same (more general)
type. */
void MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1);

/** Create a new slice pointer out of the given pointer to an soa type
and an integer offset to a slice within that type. */
llvm::Value *MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset);

/** These GEP methods are generalizations of the standard ones in LLVM;
they support both uniform and varying basePtr values as well as
uniform and varying index values (arrays of indices). Varying base
pointers are expected to come in as vectors of i32/i64 (depending
on the target), since LLVM doesn't currently support vectors of
pointers. The underlying type of the base pointer must be provided
via the ptrType parameter. */
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
const Type *ptrType, const char *name = NULL);
llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
llvm::Value *index1, const Type *ptrType,
const char *name = NULL);

/** This method returns a new pointer that represents offsetting the
given base pointer to point at the given element number of the
structure type that the base pointer points to. (The provided
pointer must be a pointer to a structure type. The ptrType gives
the type of the pointer, though it may be NULL if the base pointer
is uniform.) */
llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
const Type *ptrType, const char *name = NULL,
const PointerType **resultPtrType = NULL);

/** Load from the memory location(s) given by lvalue, using the given
mask. The lvalue may be varying, in which case this corresponds to
a gather from the multiple memory locations given by the array of
pointer values given by the lvalue. If the lvalue is not varying,
then both the mask pointer and the type pointer may be NULL. */
llvm::Value *LoadInst(llvm::Value *ptr, llvm::Value *mask,
const Type *ptrType, const char *name = NULL);
llvm::Value *LoadInst(llvm::Value *ptr, const char *name = NULL);
/** Emits an alloca instruction to allocate stack storage for the given
type. If a non-zero alignment is specified, the object is also
@@ -340,21 +448,27 @@ public:
instruction is added at the start of the function in the entry
basic block; if it should be added to the current basic block, then
the atEntryBlock parameter should be false. */
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
const char *name = NULL, int align = 0,
bool atEntryBlock = true);

/** Standard store instruction; for this variant, the lvalue must be a
single pointer, not a varying lvalue. */
void StoreInst(llvm::Value *value, llvm::Value *ptr);

/** In this variant of StoreInst(), the lvalue may be varying. If so,
this corresponds to a scatter. Whether the lvalue is uniform or
varying, the given storeMask is used to mask the stores so that
they only execute for the active program instances. */
void StoreInst(llvm::Value *value, llvm::Value *ptr,
llvm::Value *storeMask, const Type *valueType,
const Type *ptrType);

/** Copy count bytes of memory from the location pointed to by src to
the location pointed to by dest. (src and dest must not be
overlapping.) */
void MemcpyInst(llvm::Value *dest, llvm::Value *src, llvm::Value *count,
llvm::Value *align = NULL);
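As a rough illustration of how these load and store entry points fit together, here is a sketch (not code from the ispc sources) of copying a value through a possibly-varying pointer under a given execution mask; the FunctionEmitContext *ctx, pointer, mask, and type values are assumed to have been computed already:

// Sketch only; assumes the ispc tree's headers (e.g. #include "ctx.h").
static void
lMaskedCopySketch(FunctionEmitContext *ctx, llvm::Value *ptr,
                  const Type *ptrType, const Type *valueType,
                  llvm::Value *mask) {
    // For a varying pointer this is a gather; for a uniform pointer it
    // is a regular load (and the mask/type may be NULL in that case).
    llvm::Value *value = ctx->LoadInst(ptr, mask, ptrType, "loaded");
    // For a varying pointer this is a scatter; only lanes that are
    // active in the store mask write to memory.
    ctx->StoreInst(value, ptr, mask, valueType, ptrType);
}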
void BranchInst(llvm::BasicBlock *block);
void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
@@ -376,24 +490,30 @@ public:
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
llvm::Value *val1, const char *name = NULL);

/** Emits IR to do a function call with the given arguments. If the
function type is a varying function pointer type, its full type
must be provided in funcType. funcType can be NULL if func is a
uniform function pointer. */
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
const std::vector<llvm::Value *> &args,
const char *name = NULL);

/** This is a convenience method that issues a call instruction to a
function that takes just a single argument. */
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
llvm::Value *arg, const char *name = NULL);

/** This is a convenience method that issues a call instruction to a
function that takes two arguments. */
llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
llvm::Value *arg0, llvm::Value *arg1,
const char *name = NULL);

/** Launch an asynchronous task to run the given function, passing it
the given argument values. */
llvm::Value *LaunchInst(llvm::Value *callee,
std::vector<llvm::Value *> &argVals,
llvm::Value *launchCount);
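A minimal sketch (again, not from the ispc sources) of emitting a call through a function pointer with this interface; the context, pointer, and argument values are assumed to exist already:

// Sketch only; assumes the ispc tree's headers (e.g. #include "ctx.h").
static llvm::Value *
lEmitCallSketch(FunctionEmitContext *ctx, llvm::Value *funcPtr,
                const FunctionType *funcType,
                llvm::Value *arg0, llvm::Value *arg1) {
    std::vector<llvm::Value *> args;
    args.push_back(arg0);
    args.push_back(arg1);
    // For a varying function pointer, funcType must give the full
    // function type; for a uniform function pointer it may be NULL.
    return ctx->CallInst(funcPtr, funcType, args, "call_result");
}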
void SyncInst();
@@ -401,6 +521,12 @@ public:
/** @} */

private:
/** Pointer to the Function for which we're currently generating code. */
Function *function;
/** LLVM function representation for the current function. */
llvm::Function *llvmFunction;
/** The basic block into which we add any alloca instructions that need
to go at the very start of the function. */
llvm::BasicBlock *allocaBlock;
@@ -410,8 +536,16 @@ private:
llvm::BasicBlock *bblock;

/** Pointer to stack-allocated memory that stores the current value of
the full program mask. */
llvm::Value *fullMaskPointer;
/** Pointer to stack-allocated memory that stores the current value of
the program mask representing varying control flow within the
function. */
llvm::Value *internalMaskPointer;
/** Value of the program mask when the function starts execution. */
llvm::Value *functionMaskValue;
/** Current source file position; if debugging information is being
generated, this position is used to set file/line information for
@@ -422,20 +556,14 @@ private:
for error messages and debugging symbols. */
SourcePos funcStartPos;
/** Type of result that the current function returns. */
const Type *returnType;
/** Value of the program mask when the function starts execution. */
llvm::Value *entryMask;
/** If currently in a loop body, the value of the mask at the start of
the loop. */
llvm::Value *loopMask;

/** If currently in a loop body or switch statement, this is a pointer
to memory to store a mask value that represents which of the lanes
have executed a 'break' statement. If we're not in a loop body or
switch, this should be NULL. */
llvm::Value *breakLanesPtr;
/** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
@@ -443,16 +571,49 @@ private:
'continue' statement. */
llvm::Value *continueLanesPtr;
/** If we're inside a loop or switch statement, this gives the basic
block immediately after the current loop or switch, which we will
jump to if all of the lanes have executed a break statement or are
otherwise done with it. */
llvm::BasicBlock *breakTarget;

/** If we're inside a loop, this gives the block to jump to if all of
the running lanes have executed a 'continue' statement. */
llvm::BasicBlock *continueTarget;
/** @name Switch statement state
These variables store various state that's active when we're
generating code for a switch statement. They should all be NULL
outside of a switch.
@{
*/
/** The value of the expression used to determine which case in the
statements after the switch to execute. */
llvm::Value *switchExpr;
/** Map from case label numbers to the basic block that will hold code
for that case. */
const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;
/** The basic block of code to run for the "default" label in the
switch statement. */
llvm::BasicBlock *defaultBlock;
/** For each basic block for the code for cases (and the default label,
if present), this map gives the basic block for the immediately
following case/default label. */
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;
/** Records whether the switch condition was uniform; this is a
distinct notion from whether the switch represents uniform or
varying control flow; we may have varying control flow from a
uniform switch condition if there is a 'break' inside the switch
that's under varying control flow. */
bool switchConditionWasUniform;
/** @} */
/** A pointer to memory that records which of the program instances
have executed a 'return' statement (and are thus really truly done
running any more instructions in this function). */
@@ -490,20 +651,44 @@ private:
tasks launched from the current function. */
llvm::Value *launchGroupHandlePtr;
/** Nesting count of the number of times calling code has disabled (and
not yet reenabled) gather/scatter performance warnings. */
int disableGSWarningCount;
std::map<std::string, llvm::BasicBlock *> labelMap;
static bool initLabelBBlocks(ASTNode *node, void *data);
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
static void addGSMetadata(llvm::Value *inst, SourcePos pos);
bool ifsInCFAllUniform(int cfType) const;
void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);

llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
const Type *ptrType);

void restoreMaskGivenReturns(llvm::Value *oldMask);
void addSwitchMaskCheck(llvm::Value *mask);
bool inSwitchStatement() const;
llvm::Value *getMaskAtSwitchEntry();

CFInfo *popCFState();

void scatter(llvm::Value *value, llvm::Value *ptr, const Type *valueType,
const Type *ptrType, llvm::Value *mask);
void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
llvm::Value *mask);
void storeUniformToSOA(llvm::Value *value, llvm::Value *ptr,
llvm::Value *mask, const Type *valueType,
const PointerType *ptrType);
llvm::Value *loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask,
const PointerType *ptrType, const char *name);
llvm::Value *gather(llvm::Value *ptr, const PointerType *ptrType,
llvm::Value *mask, const char *name);
llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
};

#endif // ISPC_CTX_H

decl.cpp

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
@@ -38,10 +38,81 @@
#include "decl.h" #include "decl.h"
#include "util.h" #include "util.h"
#include "module.h"
#include "sym.h" #include "sym.h"
#include "type.h" #include "type.h"
#include "stmt.h"
#include "expr.h" #include "expr.h"
#include <stdio.h> #include <stdio.h>
#include <string.h>
#include <set>
static void
lPrintTypeQualifiers(int typeQualifiers) {
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
if (typeQualifiers & TYPEQUAL_EXPORT) printf("export ");
}
/** Given a Type and a set of type qualifiers, apply the type qualifiers to
the type, returning the type that is the result.
*/
static const Type *
lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
if (type == NULL)
return NULL;
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
type = type->GetAsConstType();
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) {
if (Type::Equal(type, AtomicType::Void))
Error(pos, "\"uniform\" qualifier is illegal with \"void\" type.");
else
type = type->GetAsUniformType();
}
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0) {
if (Type::Equal(type, AtomicType::Void))
Error(pos, "\"varying\" qualifier is illegal with \"void\" type.");
else
type = type->GetAsVaryingType();
}
else
if (Type::Equal(type, AtomicType::Void) == false)
type = type->GetAsUnboundVariabilityType();
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
"qualifiers.");
const Type *unsignedType = type->GetAsUnsignedType();
if (unsignedType != NULL)
type = unsignedType;
else {
const Type *resolvedType =
type->ResolveUnboundVariability(Variability::Varying);
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
resolvedType->GetString().c_str());
}
}
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false) {
const Type *resolvedType =
type->ResolveUnboundVariability(Variability::Varying);
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
"\"%s\".", resolvedType->GetString().c_str());
}
return type;
}
///////////////////////////////////////////////////////////////////////////
// DeclSpecs
@@ -49,272 +120,604 @@
DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
baseType = t;
storageClass = sc;
typeQualifiers = tq;
soaWidth = 0;
vectorSize = 0;
}
const Type *
DeclSpecs::GetBaseType(SourcePos pos) const {
const Type *retType = baseType;
if (retType == NULL) {
Warning(pos, "No type specified in declaration. Assuming int32.");
retType = AtomicType::UniformInt32->GetAsUnboundVariabilityType();
}
if (vectorSize > 0) {
const AtomicType *atomicType = dynamic_cast<const AtomicType *>(retType);
if (atomicType == NULL) {
Error(pos, "Only atomic types (int, float, ...) are legal for vector "
"types.");
return NULL;
}
retType = new VectorType(atomicType, vectorSize);
}
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
if (soaWidth > 0) {
const StructType *st = dynamic_cast<const StructType *>(retType);
if (st == NULL) {
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
"type \"%s\".", soaWidth, retType->GetString().c_str());
return NULL;
}
else if (soaWidth <= 0 || (soaWidth & (soaWidth - 1)) != 0) {
Error(pos, "soa<%d> width illegal. Value must be positive power "
"of two.", soaWidth);
return NULL;
}
if (st->IsUniformType()) {
Error(pos, "\"uniform\" qualifier and \"soa<%d>\" qualifier can't "
"both be used in a type declaration.", soaWidth);
return NULL;
}
else if (st->IsVaryingType()) {
Error(pos, "\"varying\" qualifier and \"soa<%d>\" qualifier can't "
"both be used in a type declaration.", soaWidth);
return NULL;
}
else
retType = st->GetAsSOAType(soaWidth);
if (soaWidth < g->target.vectorWidth)
PerformanceWarning(pos, "soa<%d> width smaller than gang size %d "
"currently leads to inefficient code to access "
"soa types.", soaWidth, g->target.vectorWidth);
}
return retType;
}
static const char *
lGetStorageClassName(StorageClass storageClass) {
switch (storageClass) {
case SC_NONE: return "";
case SC_EXTERN: return "extern";
case SC_EXTERN_C: return "extern \"C\"";
case SC_STATIC: return "static";
case SC_TYPEDEF: return "typedef";
default: FATAL("Unhandled storage class in lGetStorageClassName");
return "";
}
}
void
DeclSpecs::Print() const {
printf("Declspecs: [%s ", lGetStorageClassName(storageClass));

if (soaWidth > 0) printf("soa<%d> ", soaWidth);
lPrintTypeQualifiers(typeQualifiers);
printf("base type: %s", baseType->GetString().c_str());

if (vectorSize > 0) printf("<%d>", vectorSize);
printf("]");
}
///////////////////////////////////////////////////////////////////////////
// Declarator
Declarator::Declarator(DeclaratorKind dk, SourcePos p)
: pos(p), kind(dk) {
child = NULL;
typeQualifiers = 0;
arraySize = -1;
sym = NULL;
initExpr = NULL;
}
void
Declarator::AddArrayDimension(int size) {
assert(size > 0 || size == -1); // -1 -> unsized
arraySize.push_back(size);
}
void
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
const Type *t = GetType(ds);
if (t == NULL) {
Assert(m->errorCount > 0);
return;
}

Symbol *sym = GetSymbol();
if (sym != NULL) {
sym->type = t;
sym->storageClass = ds->storageClass;
}
}


Symbol *
Declarator::GetSymbol() const {
// The symbol lives at the last child in the chain, so walk down there
// and return the one there.
const Declarator *d = this;
while (d->child != NULL)
d = d->child;
return d->sym;
}
void
Declarator::Print(int indent) const {
printf("%*cdeclarator: [", indent, ' ');
pos.Print();

lPrintTypeQualifiers(typeQualifiers);
Symbol *sym = GetSymbol();
if (sym != NULL)
printf("%s", sym->name.c_str());
else
printf("(null symbol)");

printf(", array size = %d", arraySize);

printf(", kind = ");
switch (kind) {
case DK_BASE: printf("base"); break;
case DK_POINTER: printf("pointer"); break;
case DK_REFERENCE: printf("reference"); break;
case DK_ARRAY: printf("array"); break;
case DK_FUNCTION: printf("function"); break;
default: FATAL("Unhandled declarator kind");
}

if (initExpr != NULL) {
printf(" = (");
initExpr->Print();
printf(")");
}

if (functionParams.size() > 0) {
for (unsigned int i = 0; i < functionParams.size(); ++i) {
printf("\n%*cfunc param %d:\n", indent, ' ', i);
functionParams[i]->Print(indent+4);
}
}

if (child != NULL)
child->Print(indent + 4);

printf("]\n");
}
Symbol *
Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
const FunctionType *type =
dynamic_cast<const FunctionType *>(GetType(ds));
if (type == NULL)
return NULL;

Symbol *declSym = GetSymbol();
Assert(declSym != NULL);

// Get the symbol for the function from the symbol table. (It should
// already have been added to the symbol table by AddGlobal() by the
// time we get here.)
Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
if (funSym == NULL)
// May be NULL due to error earlier in compilation
Assert(m->errorCount > 0);
else
funSym->pos = pos;

// Walk down to the declarator for the function. (We have to get past
// the stuff that specifies the function's return type before we get to
// the function's declarator.)
Declarator *d = this;
while (d != NULL && d->kind != DK_FUNCTION)
d = d->child;
Assert(d != NULL);

for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
Symbol *sym = d->GetSymbolForFunctionParameter(i);
if (sym->type == NULL) {
Assert(m->errorCount > 0);
continue;
}
else
sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);

funArgs->push_back(sym);
}

if (funSym != NULL)
funSym->type = funSym->type->ResolveUnboundVariability(Variability::Varying);

return funSym;
}
const Type *
Declarator::GetType(const Type *base, DeclSpecs *ds) const {
bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
bool isTask = ((typeQualifiers & TYPEQUAL_TASK) != 0);
bool isExported = ((typeQualifiers & TYPEQUAL_EXPORT) != 0);
bool isConst = ((typeQualifiers & TYPEQUAL_CONST) != 0);

if (hasUniformQual && hasVaryingQual) {
Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
return NULL;
}
if (kind != DK_FUNCTION && isTask)
Error(pos, "\"task\" qualifier illegal in variable declaration.");
if (kind != DK_FUNCTION && isExported)
Error(pos, "\"export\" qualifier illegal in variable declaration.");

Variability variability(Variability::Unbound);
if (hasUniformQual)
variability = Variability::Uniform;
else if (hasVaryingQual)
variability = Variability::Varying;

const Type *type = base;
switch (kind) {
case DK_BASE:
// All of the type qualifiers should be in the DeclSpecs for the
// base declarator
Assert(typeQualifiers == 0);
Assert(child == NULL);
return type;

case DK_POINTER:
/* For now, any pointer to an SOA type gets the slice property; if
we add the capability to declare pointers as slices or not,
we'll want to set this based on a type qualifier here. */
type = new PointerType(type, variability, isConst, type->IsSOAType());
if (child != NULL)
return child->GetType(type, ds);
else
return type;
break;

case DK_REFERENCE:
if (hasUniformQual)
Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
if (hasVaryingQual)
Error(pos, "\"varying\" qualifier is illegal to apply to references.");
if (isConst)
Error(pos, "\"const\" qualifier is illegal to apply to references.");

// The parser should disallow this already, but double check.
if (dynamic_cast<const ReferenceType *>(type) != NULL) {
Error(pos, "References to references are illegal.");
return NULL;
}

type = new ReferenceType(type);
if (child != NULL)
return child->GetType(type, ds);
else
return type;
break;

case DK_ARRAY:
if (Type::Equal(type, AtomicType::Void)) {
Error(pos, "Arrays of \"void\" type are illegal.");
return NULL;
}
if (dynamic_cast<const ReferenceType *>(type)) {
Error(pos, "Arrays of references (type \"%s\") are illegal.",
type->GetString().c_str());
return NULL;
}

type = new ArrayType(type, arraySize);
if (child)
return child->GetType(type, ds);
else
return type;
break;

case DK_FUNCTION: {
std::vector<const Type *> args;
std::vector<std::string> argNames;
std::vector<Expr *> argDefaults;
std::vector<SourcePos> argPos;

// Loop over the function arguments and store the names, types,
// default values (if any), and source file positions of each one
// in the corresponding vector.
for (unsigned int i = 0; i < functionParams.size(); ++i) {
Declaration *d = functionParams[i];
Symbol *sym = GetSymbolForFunctionParameter(i);

if (d->declSpecs->storageClass != SC_NONE)
Error(sym->pos, "Storage class \"%s\" is illegal in "
"function parameter declaration for parameter \"%s\".",
lGetStorageClassName(d->declSpecs->storageClass),
sym->name.c_str());
if (Type::Equal(sym->type, AtomicType::Void)) {
Error(sym->pos, "Parameter with type \"void\" illegal in function "
"parameter list.");
sym->type = NULL;
}

const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
if (at != NULL) {
// As in C, arrays are passed to functions as pointers to
// their element type. We'll just immediately make this
// change now. (One shortcoming of losing the fact that
// it was originally an array is that any warnings or
// errors later issued that print the function type will
// report this differently than it was originally declared
// in the function, but it's not clear that this is a
// significant problem.)
if (at->GetElementType() == NULL) {
Assert(m->errorCount > 0);
return NULL;
}
const Type *targetType = at->GetElementType();
targetType =
targetType->ResolveUnboundVariability(Variability::Varying);
sym->type = PointerType::GetUniform(targetType);

// Make sure there are no unsized arrays (other than the
// first dimension) in function parameter lists.
at = dynamic_cast<const ArrayType *>(at->GetElementType());
while (at != NULL) {
if (at->GetElementCount() == 0)
Error(sym->pos, "Arrays with unsized dimensions in "
"dimensions after the first one are illegal in "
"function parameter lists.");
at = dynamic_cast<const ArrayType *>(at->GetElementType());
}
}

args.push_back(sym->type);
argNames.push_back(sym->name);
argPos.push_back(sym->pos);

Expr *init = NULL;
if (d->declarators.size()) {
// Try to find an initializer expression.
Declarator *decl = d->declarators[0];
while (decl != NULL) {
if (decl->initExpr != NULL) {
decl->initExpr = TypeCheck(decl->initExpr);
decl->initExpr = Optimize(decl->initExpr);
if (decl->initExpr != NULL) {
init = dynamic_cast<ConstExpr *>(decl->initExpr);
if (init == NULL)
init = dynamic_cast<NullPointerExpr *>(decl->initExpr);
if (init == NULL)
Error(decl->initExpr->pos, "Default value for parameter "
"\"%s\" must be a compile-time constant.",
sym->name.c_str());
}
break;
}
else
decl = decl->child;
}
}
argDefaults.push_back(init);
}

const Type *returnType = type;
if (returnType == NULL) {
Error(pos, "No return type provided in function declaration.");
return NULL;
}
if (dynamic_cast<const FunctionType *>(returnType) != NULL) {
Error(pos, "Illegal to return function type from function.");
return NULL;
}

bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);

if (isExported && isTask) {
Error(pos, "Function can't have both \"task\" and \"export\" "
"qualifiers");
return NULL;
}
if (isExternC && isTask) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
"qualifiers");
return NULL;
}
if (isExternC && isExported) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
"qualifiers");
return NULL;
}

if (child == NULL) {
Assert(m->errorCount > 0);
return NULL;
}

const FunctionType *functionType =
new FunctionType(returnType, args, argNames, argDefaults,
argPos, isTask, isExported, isExternC);
functionType = functionType->ResolveUnboundVariability(Variability::Varying);

// handle any explicit __declspecs on the function
if (ds != NULL) {
for (int i = 0; i < (int)ds->declSpecList.size(); ++i) {
std::string str = ds->declSpecList[i].first;
SourcePos pos = ds->declSpecList[i].second;

if (str == "safe")
(const_cast<FunctionType *>(functionType))->isSafe = true;
else if (!strncmp(str.c_str(), "cost", 4)) {
int cost = atoi(str.c_str() + 4);
if (cost < 0)
Error(pos, "Negative function cost %d is illegal.",
cost);
(const_cast<FunctionType *>(functionType))->costOverride = cost;
}
else
Error(pos, "__declspec parameter \"%s\" unknown.", str.c_str());
}
}

return child->GetType(functionType, ds);
}

default:
FATAL("Unexpected decl kind");
return NULL;
}
}
const Type *
Declarator::GetType(DeclSpecs *ds) const {
const Type *baseType = ds->GetBaseType(pos);
const Type *type = GetType(baseType, ds);

if (ds->declSpecList.size() > 0 &&
type != NULL &&
dynamic_cast<const FunctionType *>(type) == NULL) {
Error(pos, "__declspec specifiers for non-function type \"%s\" are "
"not used.", type->GetString().c_str());
}

return type;
}


Symbol *
Declarator::GetSymbolForFunctionParameter(int paramNum) const {
Assert(paramNum < (int)functionParams.size());
Declaration *d = functionParams[paramNum];

char buf[32];
Symbol *sym;
if (d->declarators.size() == 0) {
// function declaration like foo(float), w/o a name for
// the parameter
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, pos);
sym->type = d->declSpecs->GetBaseType(pos);
}
else {
Assert(d->declarators.size() == 1);
sym = d->declarators[0]->GetSymbol();
if (sym == NULL) {
// Handle more complex anonymous declarations like
// float (float **).
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, d->declarators[0]->pos);
sym->type = d->declarators[0]->GetType(d->declSpecs);
}
}
return sym;
}
///////////////////////////////////////////////////////////////////////////
// Declaration
Declaration::Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist) {
declSpecs = ds;
if (dlist != NULL)
declarators = *dlist;
for (unsigned int i = 0; i < declarators.size(); ++i)
if (declarators[i] != NULL)
declarators[i]->InitFromDeclSpecs(declSpecs);
}
Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
declSpecs = ds;
if (d != NULL) {
d->InitFromDeclSpecs(ds);
declarators.push_back(d);
}
}
std::vector<VariableDeclaration>
Declaration::GetVariableDeclarations() const {
Assert(declSpecs->storageClass != SC_TYPEDEF);
std::vector<VariableDeclaration> vars;
for (unsigned int i = 0; i < declarators.size(); ++i) {
Declarator *decl = declarators[i];
if (decl == NULL) {
// Ignore earlier errors
Assert(m->errorCount > 0);
continue;
}
Symbol *sym = decl->GetSymbol();
if (sym == NULL || sym->type == NULL) {
// Ignore errors
Assert(m->errorCount > 0);
continue;
}
sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
if (Type::Equal(sym->type, AtomicType::Void))
Error(sym->pos, "\"void\" type variable illegal in declaration.");
else if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
m->symbolTable->AddVariable(sym);
vars.push_back(VariableDeclaration(sym, decl->initExpr));
}
}
return vars;
} }
void
Declaration::DeclareFunctions() {
Assert(declSpecs->storageClass != SC_TYPEDEF);

for (unsigned int i = 0; i < declarators.size(); ++i) {
Declarator *decl = declarators[i];
if (decl == NULL) {
// Ignore earlier errors
Assert(m->errorCount > 0);
continue;
}

Symbol *sym = decl->GetSymbol();
if (sym == NULL || sym->type == NULL) {
// Ignore errors
Assert(m->errorCount > 0);
continue;
}
sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);

if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
continue;

bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
m->AddFunctionDeclaration(sym, isInline);
}
}
void
Declaration::Print(int indent) const {
printf("%*cDeclaration: specs [", indent, ' ');
declSpecs->Print();
printf("], declarators:\n");
for (unsigned int i = 0 ; i < declarators.size(); ++i)
declarators[i]->Print(indent+4);
}
///////////////////////////////////////////////////////////////////////////

void
@@ -322,29 +725,53 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
std::vector<const Type *> *elementTypes,
std::vector<std::string> *elementNames,
std::vector<SourcePos> *elementPositions) {
std::set<std::string> seenNames;
for (unsigned int i = 0; i < sd.size(); ++i) {
const Type *type = sd[i]->type;
if (type == NULL)
continue;
// FIXME: making this fake little DeclSpecs here is really
// disgusting
DeclSpecs ds(type);
if (Type::Equal(type, AtomicType::Void) == false) {
if (type->IsUniformType())
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
else if (type->IsVaryingType())
ds.typeQualifiers |= TYPEQUAL_VARYING;
else if (type->GetSOAWidth() != 0)
ds.soaWidth = type->GetSOAWidth();
// FIXME: ds.vectorSize?
}
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
Declarator *d = (*sd[i]->declarators)[j];
d->InitFromDeclSpecs(&ds);
Symbol *sym = d->GetSymbol();

if (Type::Equal(sym->type, AtomicType::Void))
Error(d->pos, "\"void\" type illegal for struct member.");

elementTypes->push_back(sym->type);

if (seenNames.find(sym->name) != seenNames.end())
Error(d->pos, "Struct member \"%s\" has same name as a "
"previously-declared member.", sym->name.c_str());
else
seenNames.insert(sym->name);

elementNames->push_back(sym->name);
elementPositions->push_back(sym->pos);
}
}
for (int i = 0; i < (int)elementTypes->size() - 1; ++i) {
const ArrayType *arrayType =
dynamic_cast<const ArrayType *>((*elementTypes)[i]);
if (arrayType != NULL && arrayType->GetElementCount() == 0)
Error((*elementPositions)[i], "Unsized arrays aren't allowed except "
"for the last member in a struct definition.");
}
}

decl.h

@@ -56,10 +56,14 @@
#include "ispc.h" #include "ispc.h"
struct VariableDeclaration;
class Declaration;
class Declarator;
enum StorageClass {
SC_NONE,
SC_EXTERN,
SC_EXPORT,
SC_STATIC,
SC_TYPEDEF,
SC_EXTERN_C
@@ -74,9 +78,10 @@ enum StorageClass {
#define TYPEQUAL_UNIFORM (1<<1)
#define TYPEQUAL_VARYING (1<<2)
#define TYPEQUAL_TASK (1<<3)
#define TYPEQUAL_SIGNED (1<<4)
#define TYPEQUAL_UNSIGNED (1<<5)
#define TYPEQUAL_INLINE (1<<6)
#define TYPEQUAL_EXPORT (1<<7)
/** @brief Representation of the declaration specifiers in a declaration.
@@ -85,22 +90,25 @@ enum StorageClass {
*/
class DeclSpecs {
public:
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE,
int tq = TYPEQUAL_NONE);

void Print() const;

StorageClass storageClass;

/** Zero or more of the TYPEQUAL_* values, ORed together. */
int typeQualifiers;
/** The basic type provided in the declaration; this should be an
AtomicType, EnumType, StructType, or VectorType; other types (like
ArrayTypes) will end up being created if a particular declaration
has an array size, etc.
*/
const Type *baseType;
const Type *GetBaseType(SourcePos pos) const;
/** If this is a declaration with a vector type, this gives the vector
width. For non-vector types, this is zero.
*/
@@ -110,9 +118,19 @@ public:
SOA width specified. Otherwise this is zero.
*/
int soaWidth;
std::vector<std::pair<std::string, SourcePos> > declSpecList;
}; };
enum DeclaratorKind {
DK_BASE,
DK_POINTER,
DK_REFERENCE,
DK_ARRAY,
DK_FUNCTION
};
/** @brief Representation of the declaration of a single variable.

In conjunction with an instance of the DeclSpecs, this gives us
@@ -120,13 +138,7 @@ public:
*/
class Declarator {
public:
Declarator(DeclaratorKind dk, SourcePos p);
/** As the parser peels off array dimension declarations after the
symbol name, it calls this method to provide them to the
Declarator.
*/
void AddArrayDimension(int size);
/** Once a DeclSpecs instance is available, this method completes the
initialization of the Symbol, setting its Type accordingly.
@@ -134,21 +146,53 @@ public:
void InitFromDeclSpecs(DeclSpecs *ds);

/** Get the actual type of the combination of Declarator and the given
DeclSpecs. If an explicit base type is provided, the declarator is
applied to that type; otherwise the base type from the DeclSpecs is
used. */
const Type *GetType(DeclSpecs *ds) const;
const Type *GetType(const Type *base, DeclSpecs *ds) const;

/** Returns the symbol corresponding to the function declared by this
declarator and symbols for its arguments in *args. */
Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);

Symbol *GetSymbolForFunctionParameter(int paramNum) const;

/** Returns the symbol associated with the declarator. */
Symbol *GetSymbol() const;

void Print(int indent) const;
/** Position of the declarator in the source program. */
const SourcePos pos;
/** The kind of this declarator; complex declarations are assembled as
a hierarchy of Declarators. (For example, a pointer to an int
would have a root declarator with kind DK_POINTER and with the
Declarator::child member pointing to a DK_BASE declarator for the
int). */
const DeclaratorKind kind;
/** Child pointer if needed; this can only be non-NULL if the
declarator's kind isn't DK_BASE. */
Declarator *child;
/** Type qualifiers provided with the declarator. */
int typeQualifiers;
/** For array declarators, this gives the declared size of the array.
Unsized arrays have arraySize == 0. */
int arraySize;
/** Symbol associated with the declarator. */
Symbol *sym;
/** If this declarator includes an array specification, the sizes of
the array dimensions are represented here.
*/
std::vector<int> arraySize;
/** Initialization expression for the variable. May be NULL. */
Expr *initExpr;
bool isFunction;
/** For function declarations, this holds the Declaration *s for the
function's parameters. */
std::vector<Declaration *> functionParams;
}; };
@@ -157,26 +201,21 @@ public:
*/
class Declaration {
public:
Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
Declaration(DeclSpecs *ds, Declarator *d);

void Print(int indent) const;

/** This method walks through all of the Declarators in a declaration
and returns a fully-initialized Symbol and (possibly) an
initialization expression for each one. (This allows the rest of
the system to not have to worry about the mess of the general
Declarator representation.) */
std::vector<VariableDeclaration> GetVariableDeclarations() const;

/** For any function declarations in the Declaration, add the
declaration to the module. */
void DeclareFunctions();
DeclSpecs *declSpecs;
std::vector<Declarator *> declarators;


@@ -1,3 +1,295 @@
=== v1.2.1 === (6 April 2012)
This release contains only minor new functionality and is mostly for many
small bugfixes and improvements to error handling and error reporting.
The new functionality that is present is:
* Significantly more efficient versions of the float / half conversion
routines are now available in the standard library, thanks to Fabian
Giesen.
* The last member of a struct can now be a zero-length array; this allows
the trick of dynamically allocating enough storage for the struct and
some number of array elements at the end of it (see the sketch below).
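For readers unfamiliar with the idiom, here is a plain C/C++ analogue of that allocation trick (this is not ispc code, and zero-length trailing arrays are likewise a compiler extension in C/C++; the struct and function names are made up for illustration):

#include <cstdlib>

struct FloatBuffer {
    int count;
    float values[0];    // zero-length trailing array
};

// One allocation provides the struct header plus n trailing elements.
static FloatBuffer *AllocFloatBuffer(int n) {
    FloatBuffer *b =
        (FloatBuffer *)malloc(sizeof(FloatBuffer) + n * sizeof(float));
    b->count = n;
    return b;
}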
Significant bugs fixed include:
* Issue #205: When a target ISA isn't specified, use the host system's
capabilities to choose a target for which it will be able to run the
generated code.
* Issues #215 and #217: Don't allocate storage for global variables that
are declared "extern".
* Issue #197: Allow NULL as a default argument value in a function
declaration.
* Issue #223: Fix bugs where taking the address of a function wouldn't work
as expected.
* Issue #224: When there are overloaded variants of a function that take
both reference and const reference parameters, give the non-const
reference preference when matching values of that underlying type.
* Issue #225: An error is issued when a varying lvalue is assigned to a
reference type (rather than crashing).
* Issue #193: Permit conversions from array types to void *, not just the
pointer type of the underlying array element.
* Issue #199: Still evaluate expressions that are cast to (void).
The documentation has also been improved, with FAQs added to clarify some
aspects of the ispc pointer model.
=== v1.2.0 === (20 March 2012)
This is a major new release of ispc, with a number of significant
improvements to functionality, performance, and compiler robustness. It
does, however, include three small changes to language syntax and semantics
that may require changes to existing programs:
* Syntax for the "launch" keyword has been cleaned up; it's now no longer
necessary to bracket the launched function call with angle brackets.
(In other words, now use "launch foo();", rather than "launch < foo() >;".)
* When using pointers, the pointed-to data type is now "uniform" by
default. Use the varying keyword to specify varying pointed-to types when
needed. (i.e. "float *ptr" is a varying pointer to uniform float data,
whereas previously it was a varying pointer to varying float values.)
Use "varying float *" to specify a varying pointer to varying float data,
and so forth.
* The details of "uniform" and "varying" and how they interact with struct
types have been cleaned up. Now, when a struct type is declared, if the
struct elements don't have explicit "uniform" or "varying" qualifiers,
they are said to have "unbound" variability. When a struct type is
instantiated, any unbound variability elements inherit the variability of
the parent struct type. See http://ispc.github.com/ispc.html#struct-types
for more details.
ispc has a new language feature that makes it much easier to use the
efficient "(array of) structure of arrays" (AoSoA, or SoA) memory layout of
data. A new "soa<n>" qualifier can be applied to structure types to
specify an n-wide SoA version of the corresponding type. Array indexing
and pointer operations with arrays of SoA types automatically handle the
two-stage indexing calculation to access the data. See
http://ispc.github.com/ispc.html#structure-of-array-types for more details.
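As a rough picture of what this means for data layout (a plain C++ analogue, not ispc syntax; the type and field names are made up for illustration), an soa<4> version of a struct with float x, y members behaves as if it were laid out and indexed like this:

// soa<4> analogue of "struct Point { float x, y; }".
struct Point_soa4 {
    float x[4];
    float y[4];
};

// Logical element i of an array of soa<4> structs is reached with the
// two-stage calculation that ispc generates automatically: first the
// 4-wide chunk, then the lane within it.
static float GetX(const Point_soa4 *pts, int i) {
    return pts[i / 4].x[i % 4];
}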
For more efficient access of data that is still in "array of structures"
(AoS) format, ispc has a new "memory coalescing" optimization that
automatically detects series of strided loads and/or gathers that can be
transformed into a more efficient set of vector loads and shuffles. A
diagnostic is emitted when this optimization is successfully applied.
Smaller changes in this release:
* The standard library now provides memcpy(), memmove() and memset()
functions, as well as single-precision asin() and acos() functions.
* -I can now be specified on the command-line to specify a search path for
#include files.
* A number of improvements have been made to error reporting from the
parser, and a number of cases where malformed programs could cause the
compiler to crash have been fixed.
* A number of small improvements to the quality and performance of generated
code have been made, including finding more cases where 32-bit addressing
calculations can be safely done on 64-bit systems and generating better
code for initializer expressions.
=== v1.1.4 === (4 February 2012)
There are two major bugfixes for Windows in this release. First, a number
of failures in AVX code generation on Windows have been fixed; AVX on
Windows now has no known issues. Second, a longstanding bug in parsing 64-bit
integer constants on Windows has been fixed.
This release features a new experimental scalar target, contributed by Gabe
Weisz <gweisz@cs.cmu.edu>. This target ("--target=generic-1") compiles
gangs of single program instances (i.e. programCount == 1); it can be
useful for debugging ispc programs.
The compiler now supports dynamic memory allocation in ispc programs (with
"new" and "delete" operators based on C++). See
http://ispc.github.com/ispc.html#dynamic-memory-allocation in the
documentation for more information.
ispc now performs "short circuit" evaluation of the || and && logical
operators and the ? : selection operator. (This represents the correction
of a major incompatibility with C.) Code like "(index < arraySize &&
array[index] == 1)" thus now executes as in C, where "array[index]" won't
be evaluated unless "index" is less than "arraySize".
The standard library now provides "local" atomic operations, which are
atomic across the gang of program instances (but not across other gangs or
other hardware threads). See the updated documentation on atomics for more
information:
http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences.
The standard library now offers a clock() function, which returns a uniform
int64 value that counts processor cycles; it can be used for
fine-resolution timing measurements.
Finally (of limited interest now): ispc now supports the forthcoming AVX2
instruction set, due with Haswell-generation CPUs. All tests and examples
compile and execute correctly with AVX2. (Thanks specifically to Craig
Topper and Nadav Rotem for work on AVX2 support in LLVM, which made this
possible.)
=== v1.1.3 === (20 January 2012)
With this release, the language now supports "switch" statements, with the
same semantics and syntax as in C.
This release includes fixes for two important performance related issues:
the quality of code generated for "foreach" statements has been
substantially improved (https://github.com/ispc/ispc/issues/151), and a
performance regression with code for "gathers" that was introduced in
v1.1.2 has been fixed in this release.
A number of other small bugs were fixed in this release as well, including
one where invalid memory would sometimes be incorrectly accessed
(https://github.com/ispc/ispc/issues/160).
Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.
=== v1.1.2 === (9 January 2012)
The major new feature in this release is support for "generic" C++
vectorized output; in other words, ispc can emit C++ code that corresponds
to the vectorized computation that the ispc program represents. See the
examples/intrinsics directory in the ispc distribution for two example
implementations of the set of functions that must be provided to map the
vector calls generated by ispc to target-specific functions.
ispc now has partial support for 'goto' statements; specifically, goto is
allowed if all enclosing control flow statements (if/for/while/do) have
'uniform' test expressions, but not if they have 'varying' tests.
A number of improvements have been made to the code generated for gathers
and scatters--one of them (better matching x86's "free" scale by 2/4/8 for
addressing calculations) improved the performance of the noise example by
14%.
Many small bugs have been fixed in this release as well, including issue
numbers 138, 129, 135, 127, 149, and 142.
=== v1.1.1 === (15 December 2011)
This release doesn't include any significant new functionality, but does
include small improvements in generated code and a number of bug fixes.
The one user-visible language change is that integer constants may be
specified with 'u' and 'l' suffixes, like in C. For example, "1024llu"
defines the constant with unsigned 64-bit type.
More informative and useful error messages are printed when function
overload resolution fails.
Masking is avoided in additional cases when the mask can be
statically-determined to be all on.
A number of small bugs have been fixed:
- Under some circumstances, incorrect masks were used when assigning a
value to a reference and when doing gathers/scatters.
- Incorrect code could be generated in some cases when some instances
returned part way through a function but others continued executing.
- Type checking wasn't being performed for calls through function pointers;
now an error is issued if the arguments don't match up, etc.
- Incorrect code was being generated for gather/scatter to structs that had
elements with varying short-vector types.
- Typechecking wasn't being performed for "foreach" statements; this led to
problems like function overload resolution not being performed if an
overloaded function call was used to determine the iteration range.
- A number of symbols would be multiply-defined when compiling to multiple
targets and using the sse2-x2 target as one of them (issue #131).
=== v1.1.0 === (5 December 2011)
This is a major new release of the compiler, with significant additions to
language functionality and capabilities. It includes a number of small
language syntax changes that will require modification of existing
programs. These changes should generally be straightforward and all are
steps toward eliminating parts of ispc syntax that are incompatible with
C/C++. See
http://ispc.github.com/ispc.html#updating-ispc-programs-for-changes-in-ispc-1-1
for more information about these changes.
ispc now fully supports pointers, including pointer arithmetic, implicit
conversions of arrays to pointers, and all of the other capabilities of
pointers in C. See http://ispc.github.com/ispc.html#pointer-types for more
information about pointers in ispc and
http://ispc.github.com/ispc.html#function-pointer-types for information
about function pointers in ispc.
Reference types are now declared with C++ syntax (e.g. "const float &foo").
ispc now supports 64-bit addressing. For performance reasons, this
capability is disabled by default (even on 64-bit targets), but can be
enabled with a command-line flag:
http://ispc.github.com/ispc.html#selecting-32-or-64-bit-addressing.
This release features new parallel "foreach" statements, which make it
easier in many instances to map program instances to data for data-parallel
computation than the programIndex/programCount mechanism:
http://ispc.github.com/ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled.
Finally, all of the system's documentation has been significantly revised.
The documentation of ispc's parallel execution model has been rewritten:
http://ispc.github.com/ispc.html#the-ispc-parallel-execution-model, and
there is now a more specific discussion of similarities and differences
between ispc and C/C++:
http://ispc.github.com/ispc.html#relationship-to-the-c-programming-language.
There is now a separate FAQ (http://ispc.github.com/faq.html), and a
Performance Guide (http://ispc.github.com/perfguide.html).
=== v1.0.12 === (20 October 2011)
This release includes a new "double-pumped" 8-wide target for SSE2,
"sse2-x2". Like the sse4-x2 and avx-x2 targets, this target may deliver
higher performance for some workloads than the regular sse2 target. (For
other workloads, it may be slower.)
The ispc language now includes an "assert()" statement. See
http://ispc.github.com/ispc.html#assertions for more information.
The compiler now sets a preprocessor #define based on the target ISA; for
example, ISPC_TARGET_SSE4 is defined for the sse4 targets, and so forth.
The standard library now provides high-performance routines for converting
between some "array of structures" and "structure of arrays" formats.
See
http://ispc.github.com/ispc.html#converting-between-array-of-structures-and-structure-of-arrays-layout
for more information.
Inline functions now have static linkage.
A number of improvements have been made to the optimization passes that
detect when gathers and scatters can be transformed into vector stores and
loads, respectively. In particular, these passes now handle variables that
are used as loop induction variables much better.
=== v1.0.11 === (6 October 2011)
The main new feature in this release is support for generating code for
multiple targets (e.g., SSE2, SSE4, and AVX) and having the compiled code
select the best variant at execution time. For more information, see
http://ispc.github.com/ispc.html#compiling-with-support-for-multiple-instruction-sets.
All of the examples now take advantage of the support for multiple
compilation targets; thus, if one has an AVX system, it's not necessary to
recompile the examples to use the AVX target.
Performance of the built-in task system that is used in the examples has
been improved.
Finally, the print() statement now works on OSX; it had been broken for the
last few releases.
=== v1.0.10 === (30 September 2011)
This release features an extensive new example showing the application of


@@ -1,6 +1,15 @@
#!/bin/bash
for i in ispc perfguide faq; do
    rst2html.py --template=template.txt --link-stylesheet \
        --stylesheet-path=css/style.css $i.rst > $i.html
done
rst2html.py --template=template-news.txt --link-stylesheet \
    --stylesheet-path=css/style.css news.rst > news.html
rst2html.py --template=template-perf.txt --link-stylesheet \
    --stylesheet-path=css/style.css perf.rst > perf.html
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
#pdflatex ispc.tex

docs/faq.rst Normal file

@@ -0,0 +1,695 @@
=====================================
Frequently Asked Questions About ispc
=====================================
This document includes a number of frequently (and not frequently) asked
questions about ispc, the Intel® SPMD Program Compiler. The source to this
document is in the file ``docs/faq.rst`` in the ``ispc`` source
distribution.
* Understanding ispc's Output
+ `How can I see the assembly language generated by ispc?`_
+ `How can I have the assembly output be printed using Intel assembly syntax?`_
+ `Why are there multiple versions of exported ispc functions in the assembly output?`_
+ `How can I more easily see gathers and scatters in generated assembly?`_
* Language Details
+ `What is the difference between "int *foo" and "int foo[]"?`_
+ `Why are pointed-to types "uniform" by default?`_
+ `Why am I getting an error about assigning a varying lvalue to a reference type?`_
* Interoperability
+ `How can I supply an initial execution mask in the call from the application?`_
+ `How can I generate a single binary executable with support for multiple instruction sets?`_
+ `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
+ `Is it possible to inline ispc functions in C/C++ code?`_
+ `Why is it illegal to pass "varying" values from C/C++ to ispc functions?`_
* Programming Techniques
+ `What primitives are there for communicating between SPMD program instances?`_
+ `How can a gang of program instances generate variable amounts of output efficiently?`_
+ `Is it possible to use ispc for explicit vector programming?`_
+ `How can I debug my ispc programs using Valgrind?`_
Understanding ispc's Output
===========================
How can I see the assembly language generated by ispc?
------------------------------------------------------
The ``--emit-asm`` flag causes assembly output to be generated. If the
``-o`` command-line flag is also supplied, the assembly is stored in the
given file, or printed to standard output if ``-`` is specified for the
filename. For example, given the simple ``ispc`` program:
::
export uniform int foo(uniform int a, uniform int b) {
return a+b;
}
If the SSE4 target is used, then the following assembly is printed:
::
_foo:
addl %esi, %edi
movl %edi, %eax
ret
How can I have the assembly output be printed using Intel assembly syntax?
--------------------------------------------------------------------------
The ``ispc`` compiler is currently only able to emit assembly with AT&T
syntax, where the destination operand is the last operand after an
instruction. If you'd prefer Intel assembly output, one option is to use
Agner Fog's ``objconv`` tool: have ``ispc`` emit a native object file and
then use ``objconv`` to disassemble it, specifying the assembler syntax
that you prefer. ``objconv`` `is available for download here`_.
.. _is available for download here: http://www.agner.org/optimize/#objconv
Why are there multiple versions of exported ispc functions in the assembly output?
----------------------------------------------------------------------------------
Two versions of each function qualified with ``export`` are generated:
one of them is to be called by other ``ispc`` functions, and the
other is to be called by the application. The application callable
function has the original function's name, while the ``ispc``-callable
function has a mangled name that encodes the types of the function's
parameters.
The crucial difference between these two functions is that the
application-callable function doesn't take a parameter encoding the current
execution mask, while ``ispc``-callable functions have a hidden mask
parameter. An implication of this difference is that the ``export``
function starts with the execution mask "all on". This allows a number of
improvements in the generated code, particularly on architectures that
don't have support for masked load and store instructions.
As an example, consider this short function, which loads a vector's worth
of values from two arrays in memory, adds them, and writes the result to an
output array.
::
export void foo(uniform float a[], uniform float b[],
uniform float result[]) {
float aa = a[programIndex], bb = b[programIndex];
result[programIndex] = aa+bb;
}
Here is the assembly code for the application-callable instance of the
function.
::
_foo:
movups (%rsi), %xmm1
movups (%rdi), %xmm0
addps %xmm1, %xmm0
movups %xmm0, (%rdx)
ret
And here is the assembly code for the ``ispc``-callable instance of the
function.
::
"_foo___uptr<Uf>uptr<Uf>uptr<Uf>":
movmskps %xmm0, %eax
cmpl $15, %eax
je LBB0_3
testl %eax, %eax
jne LBB0_4
ret
LBB0_3:
movups (%rsi), %xmm1
movups (%rdi), %xmm0
addps %xmm1, %xmm0
movups %xmm0, (%rdx)
ret
LBB0_4:
####
#### Code elided; handle mixed mask case.
####
ret
There are a few things to notice in this code. First, the current program
mask is coming in via the ``%xmm0`` register and the initial few
instructions in the function essentially check to see if the mask is all on
or all off. If the mask is all on, the code at the label LBB0_3 executes;
it's the same as the code that was generated for ``_foo`` above. If the
mask is all off, then there's nothing to be done, and the function can
return immediately.
In the case of a mixed mask, a substantial amount of code is generated to
load from and then store to only the array elements that correspond to
program instances where the mask is on. (This code is elided in the listing
above.) This general pattern of having two code paths for the "all on" and
"mixed" mask cases is used in the code generated for all but the simplest
functions (where the overhead of the test isn't worthwhile).
How can I more easily see gathers and scatters in generated assembly?
---------------------------------------------------------------------
Because CPU vector ISAs don't have native gather and scatter instructions,
these memory operations are turned into a series of individual
instructions in the code that ``ispc`` generates. In some cases, it can be
useful to see where gathers and scatters actually happen in code; there is
an otherwise undocumented command-line flag that provides this information.
Consider this simple program:
::
void set(uniform int a[], int value, int index) {
a[index] = value;
}
When compiled normally to the SSE4 target, this program generates this
extensive code sequence, which makes it more difficult to see what the
program is actually doing.
::
"_set___uptr<Ui>ii":
pmulld LCPI0_0(%rip), %xmm1
movmskps %xmm2, %eax
testb $1, %al
je LBB0_2
movd %xmm1, %ecx
movd %xmm0, (%rcx,%rdi)
LBB0_2:
testb $2, %al
je LBB0_4
pextrd $1, %xmm1, %ecx
pextrd $1, %xmm0, (%rcx,%rdi)
LBB0_4:
testb $4, %al
je LBB0_6
pextrd $2, %xmm1, %ecx
pextrd $2, %xmm0, (%rcx,%rdi)
LBB0_6:
testb $8, %al
je LBB0_8
pextrd $3, %xmm1, %eax
pextrd $3, %xmm0, (%rax,%rdi)
LBB0_8:
ret
If this program is compiled with the
``--opt=disable-handle-pseudo-memory-ops`` command-line flag, then the
scatter is left as an unresolved function call. The resulting program
won't link, due to the unresolved symbol, but the assembly output is much
easier to understand:
::
"_set___uptr<Ui>ii":
movaps %xmm0, %xmm3
pmulld LCPI0_0(%rip), %xmm1
movdqa %xmm1, %xmm0
movaps %xmm3, %xmm1
jmp ___pseudo_scatter_base_offsets32_32 ## TAILCALL
Language Details
================
What is the difference between "int \*foo" and "int foo[]"?
-----------------------------------------------------------
In C and C++, declaring a function to take a parameter ``int *foo`` and
``int foo[]`` results in the same type for the parameter. Both are
pointers to integers. In ``ispc``, these are different types. The first
one is a varying pointer to a uniform integer value in memory, while the
second results in a uniform pointer to the start of an array of varying
integer values in memory.
To understand why the first is a varying pointer to a uniform integer,
first recall that types without explicit rate qualifiers (``uniform``,
``varying``, or ``soa<>``) are ``varying`` by default. Second, recall from
the `discussion of pointer types in the ispc User's Guide`_ that pointed-to
types without rate qualifiers are ``uniform`` by default. (This second
rule is discussed further below, in `Why are pointed-to types "uniform" by
default?`_.) The type of ``int *foo`` follows from these.
.. _discussion of pointer types in the ispc User's Guide: ispc.html#pointer-types
Conversely, in a function body, ``int foo[10]`` represents a declaration of
a 10-element array of varying ``int`` values. Since we'd certainly like
to be able to pass such an array to a function that takes an ``int []``
parameter, the natural type for an ``int []`` parameter is a uniform
pointer to varying integer values.
In terms of compatibility with C/C++, it's unfortunate that this
distinction exists, though any other set of rules seems to introduce more
awkwardness than this one. (We're interested to hear ideas to improve
these rules!)
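As a brief, hedged illustration of the two parameter types (the function and
variable names below are hypothetical):
::
    void takes_pointer(int *p) {
        // p is a varying pointer to uniform int: each program instance
        // may point at a different single int in memory.
        int v = *p;      // in general, a gather
    }
    void takes_array(int a[]) {
        // a is a uniform pointer to varying int: one pointer to the start
        // of an array of varying int values.
        int v = a[0];    // all instances read the varying element a[0]
    }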
Why are pointed-to types "uniform" by default?
----------------------------------------------
In ``ispc``, types without rate qualifiers are "varying" by default, but
types pointed to by pointers without rate qualifiers are "uniform" by
default. Why this difference?
::
int foo; // no rate qualifier, "varying int".
uniform int *foo; // pointer type has no rate qualifier, pointed-to does.
// "varying pointer to uniform int".
int *foo; // neither pointer type nor pointed-to type ("int") have
// rate qualifiers. Pointer type is varying by default,
// pointed-to is uniform. "varying pointer to uniform int".
varying int *foo; // varying pointer to varying int
The first rule, having types without rate qualifiers be varying by default,
is a default that keeps the number of "uniform" or "varying" qualifiers in
``ispc`` programs low. Most ``ispc`` programs use mostly "varying"
variables, so this rule allows most variables to be declared without also
requiring rate qualifiers.
On a related note, this rule allows many C/C++ functions to be used to
define equivalent functions in the SPMD execution model that ``ispc``
provides with little or no modification:
::
// scalar add in C/C++, SPMD/vector add in ispc
int add(int a, int b) { return a + b; }
This motivation also explains why ``uniform int *foo`` represents a varying
pointer; having pointers be varying by default if they don't have rate
qualifiers similarly helps with porting code from C/C++ to ``ispc``.
The trickier issue is why pointed-to types are "uniform" by default. In our
experience, data in memory that is accessed via pointers is most often
uniform; this generally includes all data that has been allocated and
initialized by the C/C++ application code. In practice, "varying" types are
more generally (but not exclusively) used for local data in ``ispc``
functions. Thus, making the pointed-to type uniform by default leads to
more concise code for the most common cases.
Why am I getting an error about assigning a varying lvalue to a reference type?
--------------------------------------------------------------------------------
Given code like the following:
::
uniform float a[...];
int index = ...;
float &r = a[index];
``ispc`` issues the error "Initializer for reference-type variable "r" must
have a uniform lvalue type." The underlying issue stems from how
references are represented in the code generated by ``ispc``. Recall that
``ispc`` supports both uniform and varying pointer types--a uniform pointer
points to the same location in memory for all program instances in the
gang, while a varying pointer allows each program instance to have its own
pointer value.
References are represented as a pointer in the code generated by ``ispc``,
though this is generally opaque to the user; in ``ispc``, they are
specifically uniform pointers. This design decision was made so that given
code like this:
::
extern void func(float &val);
float foo = ...;
func(foo);
Then the reference would be handled efficiently as a single pointer, rather
than unnecessarily being turned into a gang's worth of pointers.
However, an implication of this decision is that it's not possible for
references to refer to completely different things for each of the program
instances. (And hence the error that is issued). In cases where a unique
per-program-instance pointer is needed, a varying pointer should be used
instead of a reference.
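A minimal sketch of that workaround, reusing the array and index from the
example above:
::
    uniform float a[...];
    int index = ...;
    // Instead of:  float &r = a[index];   // error: varying lvalue
    uniform float *ptr = &a[index];   // varying pointer to uniform float
    *ptr = 0;                         // each program instance writes its own element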
Interoperability
================
How can I supply an initial execution mask in the call from the application?
----------------------------------------------------------------------------
Recall that when execution transitions from the application code to an
``ispc`` function, all of the program instances are initially executing.
In some cases, it may be desired that only some of them are running, based on
a data-dependent condition computed in the application program. This
situation can easily be handled via an additional parameter from the
application.
As a simple example, consider a case where the application code has an
array of ``float`` values and we'd like the ``ispc`` code to update
just specific values in that array, where the application has determined
which of the values should be updated. In C++ code, we might
have:
::
int count = ...;
float *array = new float[count];
bool *shouldUpdate = new bool[count];
// initialize array and shouldUpdate
ispc_func(array, shouldUpdate, count);
Then, the ``ispc`` code could process this update as:
::
export void ispc_func(uniform float array[], uniform bool update[],
uniform int count) {
foreach (i = 0 ... count) {
cif (update[i] == true)
// update array[i]...
}
}
(In this case a "coherent" if statement is likely to be worthwhile if the
``update`` array will tend to have sections that are either all-true or
all-false.)
How can I generate a single binary executable with support for multiple instruction sets?
-----------------------------------------------------------------------------------------
``ispc`` can generate output that supports multiple target instruction
sets, along with code that chooses the most appropriate one at runtime,
if multiple targets are specified with the ``--target`` command-line
argument.
For example, if you run the command:
::
ispc foo.ispc -o foo.o --target=sse2,sse4-x2,avx-x2
Then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
``foo_avx.o``, and ``foo.o``.[#]_ Link all of these into your executable, and
when you call a function in ``foo.ispc`` from your application code,
``ispc`` will determine which instruction sets are supported by the CPU the
code is running on and will call the most appropriate version of the
function available.
.. [#] Similarly, if you choose to generate assembly language output or
LLVM bitcode output, multiple versions of those files will be created.
In general, the version of the function that runs will be the one for the
most capable instruction set that is supported by the system. If you only
compile SSE2 and SSE4 variants and run on a system that supports AVX, for
example, then the SSE4 variant will be executed. If the system
is not able to run any of the available variants of the function (for
example, trying to run a function that only has SSE4 and AVX variants on a
system that only supports SSE2), then the standard library ``abort()``
function will be called.
One subtlety is that all non-static global variables (if any) must have the
same size and layout across all of the targets used. For example, if you
have the global variables:
::
uniform int foo[2*programCount];
int bar;
and compile to both SSE2 and AVX targets, both of these variables will have
different sizes (the first due to ``programCount`` having the value 4 for SSE2
and 8 for AVX, and the second due to ``varying`` types having different
numbers of elements with the two targets--essentially the same issue as the
first.) ``ispc`` issues an error in this case.
How can I determine at run-time which vector instruction set's instructions were selected to execute?
-----------------------------------------------------------------------------------------------------
``ispc`` doesn't provide any API that allows querying which vector ISA's
instructions are running when multi-target compilation was used. However,
this can be solved in "user space" by writing a small helper function.
Specifically, if you implement a function like this
::
export uniform int isa() {
#if defined(ISPC_TARGET_SSE2)
return 0;
#elif defined(ISPC_TARGET_SSE4)
return 1;
#elif defined(ISPC_TARGET_AVX)
return 2;
#else
return -1;
#endif
}
And then call it from your application code at runtime, it will return 0,
1, or 2, depending on which target's instructions are running.
The way this works is a little surprising, but it's a useful trick. Of
course the preprocessor ``#if`` checks are all compile-time only
operations. What's actually happening is that the function is compiled
multiple times, once for each target, with the appropriate ``ISPC_TARGET``
preprocessor symbol set. Then, a small dispatch function is generated for
the application to actually call. This dispatch function in turn calls the
appropriate version of the function based on the CPU of the system it's
executing on, which in turn returns the appropriate value.
In a similar fashion, it's possible to find out at run-time the value of
``programCount`` for the target that's actually being used.
::
export uniform int width() { return programCount; }
Is it possible to inline ispc functions in C/C++ code?
------------------------------------------------------
If you're willing to use the ``clang`` C/C++ compiler that's part of the
LLVM tool suite, then it is possible to inline ``ispc`` code with C/C++
(and conversely, to inline C/C++ calls in ``ispc``). Doing so can provide
performance advantages when calling out to short functions written in the
"other" language. Note that you don't need to use ``clang`` to compile all
of your C/C++ code, but only for the files where you want to be able to
inline. In order to do this, you must have a full installation of LLVM
version 3.0 or later, including the ``clang`` compiler.
The basic approach is to have the various compilers emit LLVM intermediate
representation (IR) code and to then use tools from LLVM to link together
the IR from the compilers and then re-optimize it, which gives the LLVM
optimizer the opportunity to do additional inlining and cross-function
optimizations. If you have source files ``foo.ispc`` and ``foo.cpp``,
first emit LLVM IR:
::
ispc --emit-llvm -o foo_ispc.bc foo.ispc
clang -O2 -c -emit-llvm -o foo_cpp.bc foo.cpp
Next, link the two IR files into a single file and run the LLVM optimizer
on the result:
::
llvm-link foo_ispc.bc foo_cpp.bc -o - | opt -O3 -o foo_opt.bc
And finally, generate a native object file:
::
llc -filetype=obj foo_opt.bc -o foo.o
This file can in turn be linked in with the rest of your object files when
linking your application.
(Note that if you're using the AVX instruction set, you must provide the
``-mattr=+avx`` flag to ``llc``.)
Why is it illegal to pass "varying" values from C/C++ to ispc functions?
------------------------------------------------------------------------
If any of the types in the parameter list to an exported function is
"varying" (including recursively, and members of structure types, etc.),
then ``ispc`` will issue an error and refuse to compile the function:
::
% echo "export int foo(int x) { return ++x; }" | ispc
<stdin>:1:12: Error: Illegal to return a "varying" type from exported function "foo"
<stdin>:1:20: Error: Varying parameter "x" is illegal in an exported function.
While there's no fundamental reason why this isn't possible, recall the
definition of "varying" variables: they have one value for each program
instance in the gang. As such, the number of values and amount of storage
required to represent a varying variable depends on the gang size
(i.e. ``programCount``), which can have different values depending on the
compilation target.
``ispc`` therefore prohibits passing "varying" values between the
application and the ``ispc`` program in order to prevent the
application-side code from depending on a particular gang size, which
encourages portability to different gang sizes. (A generally desirable
programming practice.)
For cases where the size of data is actually fixed from the application
side, the value can be passed via a pointer to a short ``uniform`` array,
as follows:
::
export void add4(uniform int ptr[4]) {
foreach (i = 0 ... 4)
ptr[i]++;
}
On the 4-wide SSE instruction set, this compiles to a single vector add
instruction (and associated move instructions), while it still also
efficiently computes the correct result on 8-wide AVX targets.
Programming Techniques
======================
What primitives are there for communicating between SPMD program instances?
---------------------------------------------------------------------------
The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library
routines provide a variety of mechanisms for the running program instances
to communicate values to each other during execution. Note that there's no
need to synchronize the program instances before communicating between
them, due to the synchronized execution model of gangs of program instances
in ``ispc``.
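As a brief, hedged sketch of these routines (the comments assume a 4-wide
gang; see the standard library documentation for the exact semantics):
::
    int x = programIndex;                   // (0, 1, 2, 3) on a 4-wide gang
    int b = broadcast(x, 2);                // every instance gets instance 2's value
    int r = rotate(x, 1);                   // each instance reads a neighboring instance's value
    int s = shuffle(x, 3 - programIndex);   // instance i gets the value from instance 3-i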
How can a gang of program instances generate variable amounts of output efficiently?
------------------------------------------------------------------------------------
It's not unusual to have a gang of program instances where each program
instance generates a variable amount of output (perhaps some generate no
output, some generate one output value, some generate many output values
and so forth), and where one would like to have the output densely packed
in an output array. The ``exclusive_scan_add()`` function from the
standard library is quite useful in this situation.
Consider the following function:
::
uniform int func(uniform float outArray[], ...) {
int numOut = ...; // figure out how many to be output
float outLocal[MAX_OUT]; // staging area
// each program instance in the gang puts its results in
// outLocal[0], ..., outLocal[numOut-1]
int startOffset = exclusive_scan_add(numOut);
for (int i = 0; i < numOut; ++i)
outArray[startOffset + i] = outLocal[i];
return reduce_add(numOut);
}
Here, each program instance has computed a number, ``numOut``, of values to
output, and has stored them in the ``outLocal`` array. Assume that four
program instances are running and that the first one wants to output one
value, the second two values, and the third and fourth three values each.
In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
to the four program instances, respectively.
The first program instance will then write its one result to
``outArray[0]``, the second will write its two values to ``outArray[1]``
and ``outArray[2]``, and so forth. The ``reduce_add()`` call at the end
returns the total number of values that all of the program instances have
written to the array.
FIXME: add discussion of foreach_active as an option here once that's in
Is it possible to use ispc for explicit vector programming?
-----------------------------------------------------------
The typical model for programming in ``ispc`` is an *implicit* parallel
model, where one writes a program that is apparently doing scalar
computation on values and the program is then vectorized to run in parallel
across the SIMD lanes of a processor. However, ``ispc`` also has some
support for explicit vector unit programming, where the vectorization is
explicit. Some computations may be more effectively described in the
explicit model rather than the implicit model.
This support is provided via ``uniform`` instances of short vector types.
Specifically, if this short program
::
export uniform float<8> madd(uniform float<8> a, uniform float<8> b,
uniform float<8> c) {
return a + b * c;
}
is compiled with the AVX target, ``ispc`` generates the following assembly:
::
_madd:
vmulps %ymm2, %ymm1, %ymm1
vaddps %ymm0, %ymm1, %ymm0
ret
(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
``addps`` instructions are generated, and so forth.)
Note that ``ispc`` doesn't currently support control-flow based on
``uniform`` short vector types; it is thus not possible to write code like:
::
export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
uniform int<8> sum = 0;
while (a++ < b)
++sum;
}
How can I debug my ispc programs using Valgrind?
------------------------------------------------
The `valgrind`_ tool is an extremely useful memory checker for
Linux and OSX; it detects a range of memory errors, including accessing
memory after it has been freed, accessing memory beyond the end of an
array, accessing uninitialized stack variables, and so forth.
In general, applications that use ``ispc`` code run with ``valgrind``
without modification and ``valgrind`` will detect the same range of memory
errors in ``ispc`` code that it does in C/C++ code.
.. _valgrind: http://valgrind.org
One issue to be aware of is that until recently, ``valgrind`` only
supported the SSE2 vector instructions; if you are using a version of
``valgrind`` older than the 3.7.0 release (5 November 2011), you should
compile your ``ispc`` programs with ``--target=sse2`` before running them
through ``valgrind``. (Note that if no target is specified, then ``ispc``
chooses a target based on the capabilities of the system you're running
``ispc`` on.) If you run an ``ispc`` program that uses instructions that
``valgrind`` doesn't support, you'll see an error message like:
::
vex amd64->IR: unhandled instruction bytes: 0xC5 0xFA 0x10 0x0 0xC5 0xFA 0x11 0x84
==46059== valgrind: Unrecognised instruction at address 0x100002707.
The just-released valgrind 3.7.0 adds support for the SSE4.2 instruction
set; if you're using that version (and your system supports SSE4.2), then
you can use ``--target=sse4`` when compiling to run with ``valgrind``.
Note that ``valgrind`` does not yet support programs that use the AVX
instruction set.

docs/ispc.rst Normal file

File diff suppressed because it is too large

docs/news.rst Normal file

@@ -0,0 +1,64 @@
=========
ispc News
=========
ispc 1.2.1 is Released
----------------------
This is a bugfix release, fixing approximately 20 bugs in the system and
improving error handling and error reporting. New functionality includes
very efficient float/half conversion routines thanks to Fabian
Giesen. See the `1.2.1 release notes`_ for details.
.. _1.2.1 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
ispc 1.2.0 is Released
-----------------------
A new major release was posted on March 20, 2012. This release includes
significant new functionality for cleanly handling "structure of arrays"
(SoA) data layout and a new model for how uniform and varying are handled
with structure types.
Paper on ispc To Appear in InPar 2012
-------------------------------------
A technical paper on ``ispc``, `ispc: A SPMD Compiler for High-Performance
CPU Programming`_, by Matt Pharr and William R. Mark, has been accepted to
the `InPar 2012`_ conference. This paper describes a number of the design
features and key characteristics of the ``ispc`` implementation.
(© 2012 IEEE. Personal use of this material is permitted. Permission from
IEEE must be obtained for all other uses, in any current or future media,
including reprinting/republishing this material for advertising or
promotional purposes, creating new collective works, for resale or
redistribution to servers or lists, or reuse of any copyrighted component
of this work in other works.).
.. _ispc\: A SPMD Compiler for High-Performance CPU Programming: https://github.com/downloads/ispc/ispc/ispc_inpar_2012.pdf
.. _InPar 2012: http://innovativeparallel.org/
ispc 1.1.4 is Released
----------------------
On February 4, 2012, the 1.1.4 release of ``ispc`` was posted; new features
include ``new`` and ``delete`` for dynamic memory allocation in ``ispc``
programs, "local" atomic operations in the standard library, and a new
scalar compilation target. See the `1.1.4 release notes`_ for details.
.. _1.1.4 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
ispc 1.1.3 is Released
----------------------
With this release, the language now supports "switch" statements, with the same semantics and syntax as in C.
This release includes fixes for two important performance related issues:
the quality of code generated for "foreach" statements has been
substantially improved, and a performance regression with code for "gathers"
that was introduced in v1.1.2 has been fixed in this release.
Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.

docs/perf.rst Normal file

@@ -0,0 +1,85 @@
===========
Performance
===========
The SPMD programming model that ``ispc`` provides makes it easy to harness the
computational power available in SIMD vector units on modern CPUs, while
its basis in C makes it easy for programmers to adopt and use
productively. This page summarizes the performance of ``ispc`` with the
workloads in the ``examples/`` directory of the ``ispc`` distribution.
These results were measured on an Apple iMac with a 4-core 3.4GHz
Intel® Core-i7 processor using the Intel® AVX instruction set. The basis
for comparison is a reference C++ implementation compiled with gcc 4.2.1,
the version distributed with OS X 10.7.2. (The reference implementation is
also included in the ``examples/`` directory.)
.. list-table:: Performance of ``ispc`` with a variety of the workloads
from the ``examples/`` directory of the ``ispc`` distribution, compared
to a reference C++ implementation compiled with gcc 4.2.1.
* - Workload
- ``ispc``, 1 core
- ``ispc``, 4 cores
* - `AOBench`_ (512 x 512 resolution)
- 6.19x
- 28.06x
* - `Binomial Options`_ (128k options)
- 7.94x
- 33.43x
* - `Black-Scholes Options`_ (128k options)
- 8.45x
- 32.48x
* - `Deferred Shading`_ (1280p)
- 5.02x
- 23.06x
* - `Mandelbrot Set`_
- 6.21x
- 20.28x
* - `Perlin Noise Function`_
- 5.37x
- n/a
* - `Ray Tracer`_ (Sponza dataset)
- 4.31x
- 20.29x
* - `3D Stencil`_
- 4.05x
- 15.53x
* - `Volume Rendering`_
- 3.60x
- 17.53x
.. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench
.. _Binomial Options: https://github.com/ispc/ispc/tree/master/examples/options
.. _Black-Scholes Options: https://github.com/ispc/ispc/tree/master/examples/options
.. _Deferred Shading: https://github.com/ispc/ispc/tree/master/examples/deferred
.. _Mandelbrot Set: https://github.com/ispc/ispc/tree/master/examples/mandelbrot_tasks
.. _Ray Tracer: https://github.com/ispc/ispc/tree/master/examples/rt
.. _Perlin Noise Function: https://github.com/ispc/ispc/tree/master/examples/noise
.. _3D Stencil: https://github.com/ispc/ispc/tree/master/examples/stencil
.. _Volume Rendering: https://github.com/ispc/ispc/tree/master/examples/volume_rendering
The following table shows speedups for a number of the examples on a
2.40GHz, 40-core Intel® Xeon E7-8870 system with the Intel® SSE4
instruction set, running Microsoft Windows Server 2008 Enterprise. Here,
the serial C/C++ baseline code was compiled with MSVC 2010.
.. list-table:: Performance of ``ispc`` with a variety of the workloads
from the ``examples/`` directory of the ``ispc`` distribution, on
a system with 40 CPU cores.
* - Workload
- ``ispc``, 40 cores
* - AOBench (2048 x 2048 resolution)
- 182.36x
* - Binomial Options (2m options)
- 63.85x
* - Black-Scholes Options (2m options)
- 83.97x
* - Ray Tracer (Sponza dataset)
- 195.67x
* - Volume Rendering
- 243.18x

docs/perfguide.rst Normal file

@@ -0,0 +1,791 @@
==============================================
Intel® SPMD Program Compiler Performance Guide
==============================================
The SPMD programming model provided by ``ispc`` naturally delivers
excellent performance for many workloads thanks to efficient use of CPU
SIMD vector hardware. This guide provides more details about how to get
the most out of ``ispc`` in practice.
* `Key Concepts`_
+ `Efficient Iteration With "foreach"`_
+ `Improving Control Flow Coherence With "foreach_tiled"`_
+ `Using Coherent Control Flow Constructs`_
+ `Use "uniform" Whenever Appropriate`_
+ `Use "Structure of Arrays" Layout When Possible`_
* `Tips and Techniques`_
+ `Understanding Gather and Scatter`_
+ `Avoid 64-bit Addressing Calculations When Possible`_
+ `Avoid Computation With 8 and 16-bit Integer Types`_
+ `Implementing Reductions Efficiently`_
+ `Using Low-level Vector Tricks`_
+ `The "Fast math" Option`_
+ `"inline" Aggressively`_
+ `Avoid The System Math Library`_
+ `Declare Variables In The Scope Where They're Used`_
+ `Instrumenting ISPC Programs To Understand Runtime Behavior`_
+ `Choosing A Target Vector Width`_
* `Disclaimer and Legal Information`_
* `Optimization Notice`_
Key Concepts
============
This section describes the four most important concepts to understand and
keep in mind when writing high-performance ``ispc`` programs. It assumes
good familiarity with the topics covered in the ``ispc`` `Users Guide`_.
.. _Users Guide: ispc.html
Efficient Iteration With "foreach"
----------------------------------
The ``foreach`` parallel iteration construct is semantically equivalent to
a regular ``for()`` loop, though it offers meaningful performance benefits.
(See the `documentation on "foreach" in the Users Guide`_ for a review of
its syntax and semantics.) As an example, consider this simple function
that iterates over some number of elements in an array, doing computation
on each one:
.. _documentation on "foreach" in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
::
export void foo(uniform int a[], uniform int count) {
for (int i = programIndex; i < count; i += programCount) {
// do some computation on a[i]
}
}
Depending on the specifics of the computation being performed, the code
generated for this function could likely be improved by modifying the code
so that the loop only iterates over as much of the data as can fully pack
an entire gang of program instances with computation each time through the
loop. Doing so enables the ``ispc`` compiler to generate more efficient
code for cases where it knows that the execution mask is "all on". Then,
an ``if`` statement at the end handles processing the ragged extra bits of
data that didn't fully fill a gang.
::
export void foo(uniform int a[], uniform int count) {
// First, just loop up to the point where all program instances
// in the gang will be active at the loop iteration start
uniform int countBase = count & ~(programCount-1);
for (uniform int i = 0; i < countBase; i += programCount) {
int index = i + programIndex;
// do some computation on a[index]
}
// Now handle the ragged extra bits at the end
if (countBase < count) {
int index = countBase + programIndex;
// do some computation on a[index]
}
}
While the performance of the above code will likely be better than the
first version of the function, the loop body code has been duplicated (or
has been forced to move into a separate utility function).
Using the ``foreach`` looping construct as below provides all of the
performance benefits of the second version of this function, with the
compactness of the first.
::
export void foo(uniform int a[], uniform int count) {
foreach (i = 0 ... count) {
// do some computation on a[i]
}
}
Improving Control Flow Coherence With "foreach_tiled"
-----------------------------------------------------
Depending on the computation being performed, ``foreach_tiled`` may give
better performance than ``foreach``. (See the `documentation in the Users
Guide`_ for the syntax and semantics of ``foreach_tiled``.) Given a
multi-dimensional iteration like:
.. _documentation in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
::
foreach (i = 0 ... width, j = 0 ... height) {
// do computation on element (i,j)
}
if the ``foreach`` statement is used, elements in the gang of program
instances will be mapped to values of ``i`` and ``j`` by taking spans of
``programCount`` elements across ``i`` with a single value of ``j``. For
example, the ``foreach`` statement above roughly corresponds to:
::
for (uniform int j = 0; j < height; ++j)
for (int i = 0; i < width; i += programCount) {
// do computation
}
When a multi-dimensional domain is being iterated over, the ``foreach_tiled``
statement maps program instances to data in a way that tries to select
square n-dimensional segments of the domain. For example, on a compilation
target with 8-wide gangs of program instances, it generates code that
iterates over the domain the same way as the following code (though more
efficiently):
::
for (int j = programIndex/4; j < height; j += 2)
for (int i = programIndex%4; i < width; i += 4) {
// do computation
}
Thus, each gang of program instances operates on a 2x4 tile of the domain.
With higher-dimensional iteration and different gang sizes, a similar
mapping is performed--e.g. for 2D iteration with a 16-wide gang size, 4x4
tiles are iterated over; for 4D iteration with an 8-wide gang, 1x2x2x2 tiles are
processed, and so forth.
Performance benefit can come from using ``foreach_tiled`` in that it
optimizes for iterating over *compact* regions
of the domain (while ``foreach`` iterates over the domain in a way that
generally allows linear memory access.) There are two benefits from
processing compact regions of the domain.
First, it's often the case that the control flow coherence of the program
instances in the gang is improved; if data-dependent control flow decisions
are related to the values of the data in the domain being processed, and if
the data values have some coherence, iterating with compact regions will
improve control flow coherence.
Second, processing compact regions may mean that the data accessed by
program instances in the gang is more coherent, leading to performance
benefits from better cache hit rates.
As a concrete example, for the ray tracer example in the ``ispc``
distribution (in the ``examples/rt`` directory), performance is 20% better
when the pixels are iterated over using ``foreach_tiled`` than ``foreach``,
because more coherent regions of the scene are accessed by the set of rays
in the gang of program instances.
Using Coherent Control Flow Constructs
--------------------------------------
Recall from the ``ispc`` Users Guide, in the `SPMD-on-SIMD Execution Model
section`_, that ``if`` statements with a ``uniform`` test compile to more
efficient code than ``if`` statements with varying tests. The coherent ``cif``
statement can provide many benefits of ``if`` with a uniform test in the
case where the test is actually varying.
.. _SPMD-on-SIMD Execution Model section: ispc.html#the-spmd-on-simd-execution-model
In this case, the code the compiler generates for the ``if``
test is along the lines of the following pseudo-code:
::
bool expr = /* evaluate cif condition */
if (all(expr)) {
// run "true" case of if test only
} else if (!any(expr)) {
// run "false" case of if test only
} else {
// run both true and false cases, updating mask appropriately
}
For ``if`` statements where the different running SPMD program instances
don't have coherent values for the boolean ``if`` test, using ``cif``
introduces some additional overhead from the ``all`` and ``any`` tests as
well as the corresponding branches. For cases where the program
instances often do compute the same boolean value, this overhead is
worthwhile. If the control flow is in fact usually incoherent, this
overhead only costs performance.
In a similar fashion, ``ispc`` provides ``cfor``, ``cwhile``, and ``cdo``
statements. These statements are semantically the same as the
corresponding non-"c"-prefixed statements.
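As a brief, hedged sketch of the user-facing syntax (the function and the
threshold value here are hypothetical):
::
    export void clamp_to_one(uniform float a[], uniform int count) {
        foreach (i = 0 ... count) {
            // worthwhile if most gangs see an all-true or all-false test here
            cif (a[i] > 1.0)
                a[i] = 1.0;
        }
    }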
Use "uniform" Whenever Appropriate
----------------------------------
For any variable that will always have the same value across all of the
program instances in a gang, declare the variable with the ``uniform``
qualifier. Doing so enables the ``ispc`` compiler to emit better code in
many different ways.
As a simple example, consider a ``for`` loop that always does the same
number of iterations:
::
for (int i = 0; i < 10; ++i)
// do something ten times
If this is written with ``i`` as a ``varying`` variable, as above, there's
additional overhead in the code generated for the loop as the compiler
emits instructions to handle the possibility of not all program instances
following the same control flow path (as might be the case if the loop
limit, 10, was itself a ``varying`` value.)
If the above loop is instead written with ``i`` ``uniform``, as:
::
for (uniform int i = 0; i < 10; ++i)
// do something ten times
Then better code can be generated (and the loop possibly unrolled).
In some cases, the compiler may be able to detect simple cases like these,
but it's always best to provide the compiler with as much help as possible
to understand the actual form of your computation.
Use "Structure of Arrays" Layout When Possible
----------------------------------------------
In general, memory access performance (for both reads and writes) is best
when the running program instances access a contiguous region of memory; in
this case efficient vector load and store instructions can often be used
rather than gathers and scatters. As an example of this issue, consider an
array of a simple point datatype laid out and accessed in conventional
"array of structures" (AOS) layout:
::
struct Point { float x, y, z; };
uniform Point pts[...];
float v = pts[programIndex].x;
In the above code, the access to ``pts[programIndex].x`` accesses
non-sequential memory locations, due to the ``y`` and ``z`` values between
the desired ``x`` values in memory. A "gather" is required to get the
value of ``v``, with a corresponding decrease in performance.
If ``Point`` was defined as a "structure of arrays" (SOA) type, the access
can be much more efficient:
::
struct Point8 { float x[8], y[8], z[8]; };
uniform Point8 pts8[...];
int majorIndex = programIndex / 8;
int minorIndex = programIndex % 8;
float v = pts8[majorIndex].x[minorIndex];
In this case, each ``Point8`` has 8 ``x`` values contiguous in memory
before 8 ``y`` values and then 8 ``z`` values. If the gang size is 8 or
less, the access for ``v`` will have the same value of ``majorIndex`` for
all program instances and will access consecutive elements of the ``x[8]``
array with a vector load. (For larger gang sizes, two 8-wide vector loads
would be issued, which is also quite efficient.)
However, the syntax in the above code is messy; accessing SOA data in this
fashion is much less elegant than the corresponding code for accessing the
data with AOS layout. The ``soa`` qualifier in ``ispc`` can be used to
cause the corresponding transformation to be made to the ``Point`` type,
while preserving the clean syntax for data access that comes with AOS
layout:
::
soa<8> Point pts[...];
float v = pts[programIndex].x;
Thanks to having SOA layout as a first-class concept in the language's type
system, it's easy to write functions that convert data between the
layouts. For example, the ``aos_to_soa`` function below converts ``count``
elements of the given ``Point`` type from AOS to 8-wide SOA layout. (It
assumes that the caller has pre-allocated sufficient space in the
``pts_soa`` output array.)
::
void aos_to_soa(uniform Point pts_aos[], uniform int count,
soa<8> pts_soa[]) {
foreach (i = 0 ... count)
pts_soa[i] = pts_aos[i];
}
Analogously, a function could be written to convert back from SOA to AOS if
needed.
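A minimal sketch of that reverse conversion, under the same assumptions as
``aos_to_soa()`` above (the caller pre-allocates the ``pts_aos`` output array):
::
    void soa_to_aos(soa<8> Point pts_soa[], uniform int count,
                    uniform Point pts_aos[]) {
        foreach (i = 0 ... count)
            pts_aos[i] = pts_soa[i];
    }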
Tips and Techniques
===================
This section introduces a number of additional techniques that are worth
keeping in mind when writing ``ispc`` programs.
Understanding Gather and Scatter
--------------------------------
Memory reads and writes from the program instances in a gang that access
irregular memory locations (rather than a consecutive set of locations, or
a single location) can be relatively inefficient. As an example, consider
the "simple" array indexing calculation below:
::
int i = ....;
uniform float x[10] = { ... };
float f = x[i];
Since the index ``i`` is a varying value, the program instances in the gang
will in general be reading different locations in the array ``x``. Because
current CPUs have a "gather" instruction, the ``ispc`` compiler has to
serialize these memory reads, performing a separate memory load for each
running program instance, packing the result into ``f``. (The analogous
case happens for a write into ``x[i]``.)
In many cases, gathers like these are unavoidable; the program instances
just need to access incoherent memory locations. However, if the array
index ``i`` actually has the same value for all of the program instances or
if it represents an access to a consecutive set of array locations, much
more efficient load and store instructions can be generated instead of
gathers and scatters, respectively.
In many cases, the ``ispc`` compiler is able to deduce that the memory
locations accessed by a varying index are either all the same or are
consecutive. For example, given:
::
uniform int x = ...;
int y = x;
return array[y];
The compiler is able to determine that all of the program instances are
loading from the same location, even though ``y`` is not a ``uniform``
variable. In this case, the compiler will transform this load to a regular
vector load, rather than a general gather.
Sometimes the running program instances will access a linear sequence of
memory locations; this happens most frequently when array indexing is done
based on the built-in ``programIndex`` variable. In many of these cases,
the compiler is able to detect the pattern and issue a vector load or
store. For example, given:
::
for (int i = programIndex; i < count; i += programCount)
// process array[i];
Regular vector loads and stores are issued for accesses to ``array[i]``.
Both of these are cases where the compiler can determine the access
pattern statically, at compile time. Often this determination can't be
made at compile time, even though the indices frequently do turn out to be
equal at run time. The ``reduce_equal()`` function from the standard
library can be used in this case; it checks whether the given value is the
same across all of the running program instances, returning ``true`` and
the corresponding ``uniform`` value if so.
The following function shows the use of ``reduce_equal()`` to check for an
equal index at execution time and then either do a scalar load and
broadcast or a general gather.
::
uniform float array[..] = { ... };
float value;
int i = ...;
uniform int ui;
if (reduce_equal(i, &ui) == true)
    value = array[ui];   // scalar load + broadcast
else
    value = array[i];    // gather
For a simple case like the one above, the overhead of doing the
``reduce_equal()`` check is likely not worthwhile compared to just always
doing a gather. In more complex cases, where a number of accesses are done
based on the index, it can be worth doing. See the example
``examples/volume_rendering`` in the ``ispc`` distribution for the use of
this technique in an instance where it is beneficial to performance.
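For instance, here is a sketch of how one ``reduce_equal()`` check can be
amortized over several accesses that share the same index; the function and
array names are hypothetical.
::
float sumPointCoords(uniform float xArray[], uniform float yArray[],
                     uniform float zArray[], int index) {
    float x, y, z;
    uniform int ui;
    if (reduce_equal(index, &ui) == true) {
        // All program instances share the same index: scalar loads + broadcasts
        x = xArray[ui];
        y = yArray[ui];
        z = zArray[ui];
    } else {
        // Indices differ across the gang: general gathers
        x = xArray[index];
        y = yArray[index];
        z = zArray[index];
    }
    return x + y + z;
}
The single uniform test covers all three accesses, so its cost is easier to
recoup than in the one-access case above.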
Understanding Memory Read Coalescing
------------------------------------
XXXX todo
Avoid 64-bit Addressing Calculations When Possible
--------------------------------------------------
Even when compiling to a 64-bit architecture target, ``ispc`` does many of
the addressing calculations in 32-bit precision by default--this behavior
can be overridden with the ``--addressing=64`` command-line argument. This
option should only be used if it's necessary to be able to address over 4GB
of memory in the ``ispc`` code, as it essentially doubles the cost of
memory addressing calculations in the generated code.
Avoid Computation With 8 and 16-bit Integer Types
-------------------------------------------------
The code generated for 8 and 16-bit integer types is generally not as
efficient as the code generated for 32-bit integer types. It is generally
worthwhile to use 32-bit integer types for intermediate computations, even
if the final result will be stored in a smaller integer type.
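As an illustration, here is a small sketch of this pattern;
``average_arrays`` is a hypothetical helper, not part of the standard
library.
::
void average_arrays(uniform int8 a[], uniform int8 b[],
                    uniform int8 result[], uniform int count) {
    foreach (i = 0 ... count) {
        // Do the intermediate arithmetic in 32-bit...
        int sum = (int)a[i] + (int)b[i];
        // ...and only narrow back to 8 bits when storing the result.
        result[i] = (int8)(sum / 2);
    }
}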
Implementing Reductions Efficiently
-----------------------------------
It's often necessary to compute a reduction over a data set--for example,
one might want to add all of the values in an array, compute their minimum,
etc. ``ispc`` provides a few capabilities that make it easy to efficiently
compute reductions like these. However, it's important to use these
capabilities appropriately for best results.
As an example, consider the task of computing the sum of all of the values
in an array. In C code, we might have:
::
/* C implementation of a sum reduction */
float sum(const float array[], int count) {
    float sum = 0;
    for (int i = 0; i < count; ++i)
        sum += array[i];
    return sum;
}
Exactly this computation could also be expressed as a purely uniform
computation in ``ispc``, though without any benefit from vectorization:
::
/* inefficient ispc implementation of a sum reduction */
uniform float sum(const uniform float array[], uniform int count) {
    uniform float sum = 0;
    for (uniform int i = 0; i < count; ++i)
        sum += array[i];
    return sum;
}
As a first attempt, one might use the ``reduce_add()`` function from the
``ispc`` standard library; it takes a ``varying`` value and returns the sum
of that value across all of the active program instances.
::
/* inefficient ispc implementation of a sum reduction */
uniform float sum(const uniform float array[], uniform int count) {
    uniform float sum = 0;
    foreach (i = 0 ... count)
        sum += reduce_add(array[i]);
    return sum;
}
This implementation loads a gang's worth of values from the array, one for
each of the program instances, and then uses ``reduce_add()`` to sum across
the program instances and update the running total. Unfortunately, this
approach loses most of the benefit of vectorization, as it spends more work
on the cross-program-instance ``reduce_add()`` call than it saves from the
vector load of values.
The most efficient approach is to do the reduction in two phases: rather
than using a ``uniform`` variable to store the sum, we maintain a varying
value, such that each program instance is effectively computing a local
partial sum on the subset of array values that it has loaded from the
array. When the loop over array elements concludes, a single call to
``reduce_add()`` computes the final reduction across each of the program
instances' elements of ``sum``. This approach effectively compiles to a
single vector load and a single vector add for each loop iteration's worth
of values--very efficient code in the end.
::
/* good ispc implementation of a sum reduction */
uniform float sum(const uniform float array[], uniform int count) {
    float sum = 0;
    foreach (i = 0 ... count)
        sum += array[i];
    return reduce_add(sum);
}
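The same two-phase structure applies to other reductions as well. For
example, the following sketch computes the minimum of an array's values
with the standard library's ``reduce_min()``; the function name and the
large initial sentinel value are illustrative choices.
::
uniform float array_min(const uniform float array[], uniform int count) {
    float m = 1e30;   // per-program-instance partial minimum
    foreach (i = 0 ... count)
        m = min(m, array[i]);
    // Reduce the per-instance partial minimums to a single value.
    return reduce_min(m);
}
As with the sum, the per-instance partial results keep the loop body fully
vectorized; the cross-program-instance work happens only once, after the
loop.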
Using Low-level Vector Tricks
-----------------------------
Many low-level Intel® SSE and AVX coding constructs can be implemented in
``ispc`` code. The ``ispc`` standard library functions ``intbits()`` and
``floatbits()`` are often useful in this context. Recall that
``intbits()`` takes a ``float`` value and returns it as an integer where
the bits of the integer are the same as the bit representation in memory of
the ``float``. (In other words, it does *not* perform a floating-point to
integer value conversion.) ``floatbits()``, then, performs the inverse
computation.
As an example of the use of these functions, the following code efficiently
reverses the sign of the given values.
::
float flipsign(float a) {
    unsigned int i = intbits(a);
    i ^= 0x80000000;
    return floatbits(i);
}
This code compiles down to a single XOR instruction.
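A similar trick computes the absolute value by clearing the sign bit rather
than flipping it. (This is just another illustration of the
``intbits()``/``floatbits()`` idiom; the standard library's ``abs()`` can
of course be used directly.)
::
float fastabs(float a) {
    unsigned int i = intbits(a);
    i &= 0x7fffffff;   // clear the sign bit
    return floatbits(i);
}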
The "Fast math" Option
----------------------
``ispc`` has a ``--opt=fast-math`` command-line flag that enables a number of
optimizations that may be undesirable in code where numerical precision is
critically important. For many graphics applications, however, the
approximations introduced may be acceptable. The following two
optimizations are performed when ``--opt=fast-math`` is used. By default, the
``--opt=fast-math`` flag is off.
* Expressions like ``x / y``, where ``y`` is a compile-time constant, are
transformed to ``x * (1./y)``, where the inverse value of ``y`` is
precomputed at compile time.
* Expressions like ``x / y``, where ``y`` is not a compile-time constant,
are transformed to ``x * rcp(y)``, where ``rcp()`` maps to the
approximate reciprocal instruction from the ``ispc`` standard library.
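As a rough illustration of the effect, the comments in the following sketch
show these transformations conceptually; they happen inside the compiler
and aren't something to write by hand.
::
float scale(float x, float y) {
    float a = x / 4.;   // with --opt=fast-math: x * 0.25, folded at compile time
    float b = x / y;    // with --opt=fast-math: x * rcp(y), approximate reciprocal
    return a + b;
}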
"inline" Aggressively
---------------------
Inlining functions aggressively is generally beneficial for performance
with ``ispc``. Definitely use the ``inline`` qualifier for any short
functions (a few lines long), and experiment with it for longer functions.
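For example, a short helper along these lines (``lerp`` here is a
hypothetical utility, not a standard library function) is an obvious
candidate for ``inline``:
::
inline float lerp(float t, float a, float b) {
    return (1. - t) * a + t * b;
}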
Avoid The System Math Library
-----------------------------
The default math library that ``ispc`` uses for transcendentals and the
like has higher error than the system's math library, though it is much
more efficient, both because it is vectorized across the program instances
and because its functions can be inlined into the final code. (It
generally has errors in the range of 10 ulps, while the system math library
generally has no more than 1 ulp of error for transcendentals.)
If the ``--math-lib=system`` command-line option is used when compiling an
``ispc`` program, then calls to the system math library will be generated
instead. This option should only be used if the higher precision is
absolutely required, as the performance impact of using it can be
significant.
Declare Variables In The Scope Where They're Used
-------------------------------------------------
Performance is slightly improved by declaring variables at the same block
scope where they are first used. For example, in code like the
following, if the lifetime of ``foo`` is only within the scope of the
``if`` clause, write the code like this:
::
float func() {
    ....
    if (x < y) {
        float foo;
        ... use foo ...
    }
}
Try not to write code as:
::
float func() {
    float foo;
    ....
    if (x < y) {
        ... use foo ...
    }
}
Declaring variables in the narrower scope can reduce the number of masked
store instructions that the compiler needs to generate.
Instrumenting ISPC Programs To Understand Runtime Behavior
----------------------------------------------------------
``ispc`` has an optional instrumentation feature that can help you
understand performance issues. If a program is compiled using the
``--instrument`` flag, the compiler emits calls to a function with the
following signature at various points in the program (for example, at
interesting points in the control flow, or when scatters or gathers
happen).
::
extern "C" {
void ISPCInstrument(const char *fn, const char *note,
int line, int mask);
}
This function is passed the file name of the ``ispc`` file running, a short
note indicating what is happening, the line number in the source file, and
the current mask of active program instances in the gang. You must provide an
implementation of this function and link it in with your application.
For example, when the ``ispc`` program runs, this function might be called
as follows:
::
ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
This call indicates that the currently executing program has just entered
the function defined at line 55 of the file ``foo.ispc``, with all lanes
currently executing (a mask of ``0xf`` on a four-wide gang target).
For a fuller example of the utility of this functionality, see
``examples/aobench_instrumented`` in the ``ispc`` distribution. This
example includes an implementation of the ``ISPCInstrument()`` function
that collects aggregate data about the program's execution behavior.
When running this example, you will want to direct the ``ao`` executable
to generate a low-resolution image, because the instrumentation adds
substantial execution overhead. For example:
::
% ./ao 1 32 32
After the ``ao`` program exits, a summary report along the following lines
will be printed. In the first few lines, you can see how many times a few
functions were called, and the average percentage of SIMD lanes that were
active upon function entry.
::
ao.ispc(0067) - function entry: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
ao.ispc(0067) - return: uniform control flow: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
ao.ispc(0071) - function entry: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
ao.ispc(0075) - return: uniform control flow: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
ao.ispc(0079) - function entry: 10072 calls (0 / 0.00% all off!), 45.09% active lanes
ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
...
Choosing A Target Vector Width
------------------------------
By default, ``ispc`` compiles to the natural vector width of the target
instruction set. For example, for SSE2 and SSE4, it compiles four-wide,
and for AVX, it compiles 8-wide. For some programs, higher performance may
be seen if the program is compiled to a doubled vector width--8-wide for
SSE and 16-wide for AVX.
For workloads that don't require many registers, this approach can lead to
significantly more efficient execution thanks to greater instruction-level
parallelism and amortization of various overheads over more program
instances. For other workloads, it may lead to a slowdown due to higher
register pressure; trying both approaches for key kernels may be
worthwhile.
This option is only available for each of the SSE2, SSE4 and AVX targets.
It is selected with the ``--target=sse2-x2``, ``--target=sse4-x2`` and
``--target=avx-x2`` options, respectively.
Disclaimer and Legal Information
================================
INFORMATION IN THIS DOCUMENT IS PROVIDED IN CONNECTION WITH INTEL(R) PRODUCTS.
NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL
PROPERTY RIGHTS IS GRANTED BY THIS DOCUMENT. EXCEPT AS PROVIDED IN INTEL'S TERMS
AND CONDITIONS OF SALE FOR SUCH PRODUCTS, INTEL ASSUMES NO LIABILITY WHATSOEVER,
AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY, RELATING TO SALE AND/OR USE
OF INTEL PRODUCTS INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A
PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT
OR OTHER INTELLECTUAL PROPERTY RIGHT.
UNLESS OTHERWISE AGREED IN WRITING BY INTEL, THE INTEL PRODUCTS ARE NOT DESIGNED
NOR INTENDED FOR ANY APPLICATION IN WHICH THE FAILURE OF THE INTEL PRODUCT COULD
CREATE A SITUATION WHERE PERSONAL INJURY OR DEATH MAY OCCUR.
Intel may make changes to specifications and product descriptions at any time,
without notice. Designers must not rely on the absence or characteristics of any
features or instructions marked "reserved" or "undefined." Intel reserves these
for future definition and shall have no responsibility whatsoever for conflicts
or incompatibilities arising from future changes to them. The information here
is subject to change without notice. Do not finalize a design with this
information.
The products described in this document may contain design defects or errors
known as errata which may cause the product to deviate from published
specifications. Current characterized errata are available on request.
Contact your local Intel sales office or your distributor to obtain the latest
specifications and before placing your product order.
Copies of documents which have an order number and are referenced in this
document, or other Intel literature, may be obtained by calling 1-800-548-4725,
or by visiting Intel's Web Site.
Intel processor numbers are not a measure of performance. Processor numbers
differentiate features within each processor family, not across different
processor families. See http://www.intel.com/products/processor_number for
details.
BunnyPeople, Celeron, Celeron Inside, Centrino, Centrino Atom,
Centrino Atom Inside, Centrino Inside, Centrino logo, Core Inside, FlashFile,
i960, InstantIP, Intel, Intel logo, Intel386, Intel486, IntelDX2, IntelDX4,
IntelSX2, Intel Atom, Intel Atom Inside, Intel Core, Intel Inside,
Intel Inside logo, Intel. Leap ahead., Intel. Leap ahead. logo, Intel NetBurst,
Intel NetMerge, Intel NetStructure, Intel SingleDriver, Intel SpeedStep,
Intel StrataFlash, Intel Viiv, Intel vPro, Intel XScale, Itanium,
Itanium Inside, MCS, MMX, Oplus, OverDrive, PDCharm, Pentium, Pentium Inside,
skoool, Sound Mark, The Journey Inside, Viiv Inside, vPro Inside, VTune, Xeon,
and Xeon Inside are trademarks of Intel Corporation in the U.S. and other
countries.
* Other names and brands may be claimed as the property of others.
Copyright(C) 2011, Intel Corporation. All rights reserved.
Optimization Notice
===================
Intel compilers, associated libraries and associated development tools may
include or utilize options that optimize for instruction sets that are
available in both Intel and non-Intel microprocessors (for example SIMD
instruction sets), but do not optimize equally for non-Intel
microprocessors. In addition, certain compiler options for Intel
compilers, including some that are not specific to Intel
micro-architecture, are reserved for Intel microprocessors. For a detailed
description of Intel compiler options, including the instruction sets and
specific microprocessors they implicate, please refer to the "Intel
Compiler User and Reference Guides" under "Compiler Options." Many library
routines that are part of Intel compiler products are more highly optimized
for Intel microprocessors than for other microprocessors. While the
compilers and libraries in Intel compiler products offer optimizations for
both Intel and Intel-compatible microprocessors, depending on the options
you select, your code and other factors, you likely will get extra
performance on Intel microprocessors.
Intel compilers, associated libraries and associated development tools may
or may not optimize to the same degree for non-Intel microprocessors for
optimizations that are not unique to Intel microprocessors. These
optimizations include Intel® Streaming SIMD Extensions 2 (Intel® SSE2),
Intel® Streaming SIMD Extensions 3 (Intel® SSE3), and Supplemental
Streaming SIMD Extensions 3 (Intel SSSE3) instruction sets and other
optimizations. Intel does not guarantee the availability, functionality,
or effectiveness of any optimization on microprocessors not manufactured by
Intel. Microprocessor-dependent optimizations in this product are intended
for use with Intel microprocessors.
While Intel believes our compilers and libraries are excellent choices to
assist in obtaining the best performance on Intel and non-Intel
microprocessors, Intel recommends that you evaluate other compilers and
libraries to determine which best meet your requirements. We hope to win
your business by striving to offer the best performance of any compiler or
library; please let us know if you find we do not.

docs/template-news.txt (new file)

@@ -0,0 +1,65 @@
%(head_prefix)s
%(head)s
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-1486404-4']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
%(stylesheet)s
%(body_prefix)s
<div id="wrap">
<div id="wrap2">
<div id="header">
<h1 id="logo">Intel SPMD Program Compiler</h1>
<div id="slogan">An open-source compiler for high-performance SIMD programming on
the CPU</div>
</div>
<div id="nav">
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li id="selected"><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li><a href="documentation.html">Documentation</a></li>
<li><a href="perf.html">Performance</a></li>
</ul>
</div>
</div>
<div id="content-wrap">
<div id="sidebar">
<div class="widgetspace">
<h1>Resources</h1>
<ul class="menu">
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
<li><a href="http://groups.google.com/group/ispc-users/">ispc
users mailing list</a></li>
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>
%(body_pre_docinfo)s
%(docinfo)s
<div id="content">
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>
</div>
</div>
%(body_suffix)s

docs/template-perf.txt (new file)

@@ -0,0 +1,65 @@
%(head_prefix)s
%(head)s
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-1486404-4']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
%(stylesheet)s
%(body_prefix)s
<div id="wrap">
<div id="wrap2">
<div id="header">
<h1 id="logo">Intel SPMD Program Compiler</h1>
<div id="slogan">An open-source compiler for high-performance SIMD programming on
the CPU</div>
</div>
<div id="nav">
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li><a href="documentation.html">Documentation</a></li>
<li id="selected"><a href="perf.html">Performance</a></li>
</ul>
</div>
</div>
<div id="content-wrap">
<div id="sidebar">
<div class="widgetspace">
<h1>Resources</h1>
<ul class="menu">
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
<li><a href="http://groups.google.com/group/ispc-users/">ispc
users mailing list</a></li>
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>
%(body_pre_docinfo)s
%(docinfo)s
<div id="content">
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>
</div>
</div>
%(body_suffix)s

docs/template.txt (new file)

@@ -0,0 +1,65 @@
%(head_prefix)s
%(head)s
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-1486404-4']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
%(stylesheet)s
%(body_prefix)s
<div id="wrap">
<div id="wrap2">
<div id="header">
<h1 id="logo">Intel SPMD Program Compiler</h1>
<div id="slogan">An open-source compiler for high-performance SIMD programming on
the CPU</div>
</div>
<div id="nav">
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li id="selected"><a href="documentation.html">Documentation</a></li>
<li><a href="perf.html">Performance</a></li>
</ul>
</div>
</div>
<div id="content-wrap">
<div id="sidebar">
<div class="widgetspace">
<h1>Resources</h1>
<ul class="menu">
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
<li><a href="http://groups.google.com/group/ispc-users/">ispc
users mailing list</a></li>
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>
%(body_pre_docinfo)s
%(docinfo)s
<div id="content">
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>
</div>
</div>
%(body_suffix)s


@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or # This could be handy for archiving the generated documentation or
# if some version control system is used. # if some version control system is used.
PROJECT_NUMBER = 1.0.10 PROJECT_NUMBER = 1.2.1
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put. # base path where the generated documentation will be put.
@@ -585,7 +585,6 @@ INPUT = builtins.h \
ctx.h \ ctx.h \
decl.h \ decl.h \
expr.h \ expr.h \
gatherbuf.h \
ispc.h \ ispc.h \
llvmutil.h \ llvmutil.h \
module.h \ module.h \
@@ -598,7 +597,6 @@ INPUT = builtins.h \
ctx.cpp \ ctx.cpp \
decl.cpp \ decl.cpp \
expr.cpp \ expr.cpp \
gatherbuf.cpp \
ispc.cpp \ ispc.cpp \
llvmutil.cpp \ llvmutil.cpp \
main.cpp \ main.cpp \


@@ -39,9 +39,6 @@ example implementation of this function that counts the number of times the
callback is made and records some statistics about control flow coherence callback is made and records some statistics about control flow coherence
is provided in the instrument.cpp file. is provided in the instrument.cpp file.
*** Note: on Linux, this example currently hits an assertion in LLVM during
*** compilation
Deferred Deferred
======== ========
@@ -110,6 +107,13 @@ This program implements both the Black-Scholes and Binomial options pricing
models in both ispc and regular serial C++ code. models in both ispc and regular serial C++ code.
Perfbench
=========
This runs a number of microbenchmarks to measure system performance and
code generation quality.
RT RT
== ==


@@ -1,35 +1,7 @@
ARCH = $(shell uname) EXAMPLE=ao
CPP_SRC=ao.cpp ao_serial.cpp
ISPC_SRC=ao.ispc
ISPC_TARGETS=sse2,sse4,avx
TASK_CXX=../tasksys.cpp include ../common.mk
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
default: ao
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ ao
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/ao.o: objs/ao_ispc.h
objs/%_ispc.h objs/%_ispc.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h


@@ -55,7 +55,6 @@
using namespace ispc; using namespace ispc;
#include "../timing.h" #include "../timing.h"
#include "../cpuid.h"
#define NSUBSAMPLES 2 #define NSUBSAMPLES 2
@@ -105,38 +104,6 @@ savePPM(const char *fname, int w, int h)
} }
// Make sure that the vector ISA used during compilation is supported by
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
// header file that we include above.
static void
ensureTargetISAIsSupported() {
#if defined(ISPC_TARGET_SSE2)
bool isaSupported = CPUSupportsSSE2();
const char *target = "SSE2";
#elif defined(ISPC_TARGET_SSE4)
bool isaSupported = CPUSupportsSSE4();
const char *target = "SSE4";
#elif defined(ISPC_TARGET_AVX)
bool isaSupported = CPUSupportsAVX();
const char *target = "AVX";
#else
#error "Unknown ISPC_TARGET_* value"
#endif
if (!isaSupported) {
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
"set, which isn't\n*** supported by this computer's CPU!\n", target);
fprintf(stderr, "***\n*** Please modify the "
#ifdef _MSC_VER
"MSVC project file "
#else
"Makefile "
#endif
"to select another target (e.g. sse2)\n***\n");
exit(1);
}
}
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
if (argc != 4) { if (argc != 4) {
@@ -151,8 +118,6 @@ int main(int argc, char **argv)
height = atoi (argv[3]); height = atoi (argv[3]);
} }
ensureTargetISAIsSupported();
// Allocate space for output images // Allocate space for output images
img = new unsigned char[width * height * 3]; img = new unsigned char[width * height * 3];
fimg = new float[width * height * 3]; fimg = new float[width * height * 3];


@@ -50,7 +50,6 @@ struct Isect {
struct Sphere { struct Sphere {
vec center; vec center;
float radius; float radius;
}; };
struct Plane { struct Plane {
@@ -75,16 +74,15 @@ static inline vec vcross(vec v0, vec v1) {
return ret; return ret;
} }
static inline void vnormalize(reference vec v) { static inline void vnormalize(vec &v) {
float len2 = dot(v, v); float len2 = dot(v, v);
float invlen = rsqrt(len2); float invlen = rsqrt(len2);
v *= invlen; v *= invlen;
} }
static inline void static void
ray_plane_intersect(reference Isect isect, reference Ray ray, ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
reference Plane plane) {
float d = -dot(plane.p, plane.n); float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n); float v = dot(ray.dir, plane.n);
@@ -104,8 +102,7 @@ ray_plane_intersect(reference Isect isect, reference Ray ray,
static inline void static inline void
ray_sphere_intersect(reference Isect isect, reference Ray ray, ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
reference Sphere sphere) {
vec rs = ray.org - sphere.center; vec rs = ray.org - sphere.center;
float B = dot(rs, ray.dir); float B = dot(rs, ray.dir);
@@ -126,8 +123,8 @@ ray_sphere_intersect(reference Isect isect, reference Ray ray,
} }
static inline void static void
orthoBasis(reference vec basis[3], vec n) { orthoBasis(vec basis[3], vec n) {
basis[2] = n; basis[2] = n;
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
@@ -149,9 +146,9 @@ orthoBasis(reference vec basis[3], vec n) {
} }
static inline float static float
ambient_occlusion(reference Isect isect, reference Plane plane, ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
reference Sphere spheres[3], reference RNGState rngstate) { RNGState &rngstate) {
float eps = 0.0001f; float eps = 0.0001f;
vec p, n; vec p, n;
vec basis[3]; vec basis[3];
@@ -168,8 +165,8 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
Ray ray; Ray ray;
Isect occIsect; Isect occIsect;
float theta = sqrt(frandom(rngstate)); float theta = sqrt(frandom(&rngstate));
float phi = 2.0f * M_PI * frandom(rngstate); float phi = 2.0f * M_PI * frandom(&rngstate);
float x = cos(phi) * theta; float x = cos(phi) * theta;
float y = sin(phi) * theta; float y = sin(phi) * theta;
float z = sqrt(1.0 - theta * theta); float z = sqrt(1.0 - theta * theta);
@@ -205,113 +202,53 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
*/ */
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
uniform int h, uniform int nsubsamples, uniform int h, uniform int nsubsamples,
reference uniform float image[]) { uniform float image[]) {
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static Sphere spheres[3] = { static uniform Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f }, { { -2.0f, 0.0f, -3.5f }, 0.5f },
{ { -0.5f, 0.0f, -3.0f }, 0.5f }, { { -0.5f, 0.0f, -3.0f }, 0.5f },
{ { 1.0f, 0.0f, -2.2f }, 0.5f } }; { { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate; RNGState rngstate;
seed_rng(rngstate, y0); seed_rng(&rngstate, y0);
float invSamples = 1.f / nsubsamples;
// Compute the mapping between the 'programCount'-wide program foreach_tiled(y = y0 ... y1, x = 0 ... w,
// instances running in parallel and samples in the image. u = 0 ... nsubsamples, v = 0 ... nsubsamples) {
// float du = (float)u * invSamples, dv = (float)v * invSamples;
// For now, we'll always take four samples per pixel, so start by
// initializing du and dv with offsets into subpixel samples. We'll
// take care of further updating du and dv for the case where we're
// doing more than 4 program instances in parallel shortly.
uniform float uSteps[4] = { 0, 1, 0, 1 };
uniform float vSteps[4] = { 0, 0, 1, 1 };
float du = uSteps[programIndex % 4] / nsubsamples;
float dv = vSteps[programIndex % 4] / nsubsamples;
// Now handle the case where we are able to do more than one pixel's // Figure out x,y pixel in NDC
// worth of work at once. nx records the number of pixels in the x float px = (x + du - (w / 2.0f)) / (w / 2.0f);
// direction we do per iteration and ny the number in y. float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
uniform int nx = 1, ny = 1; float ret = 0.f;
Ray ray;
Isect isect;
// FIXME: We actually need ny to be 1 regardless of the decomposition, ray.org = 0.f;
// since the task decomposition is one scanline high.
if (programCount == 8) { // Poor man's perspective projection
// Do two pixels at once in the x direction ray.dir.x = px;
nx = 2; ray.dir.y = py;
if (programIndex >= 4) ray.dir.z = -1.0;
// And shift the offsets for the second pixel's worth of work vnormalize(ray.dir);
++du;
}
else if (programCount == 16) {
nx = 4;
ny = 1;
if (programIndex >= 4 && programIndex < 8)
++du;
if (programIndex >= 8 && programIndex < 12)
du += 2;
if (programIndex >= 12)
du += 3;
}
// Now loop over all of the pixels, stepping in x and y as calculated isect.t = 1.0e+17;
// above. (Assumes that ny divides y and nx divides x...) isect.hit = 0;
for (uniform int y = y0; y < y1; y += ny) {
for (uniform int x = 0; x < w; x += nx) {
// Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f;
Ray ray;
Isect isect;
ray.org = 0.f; for (uniform int snum = 0; snum < 3; ++snum)
ray_sphere_intersect(isect, ray, spheres[snum]);
ray_plane_intersect(isect, ray, plane);
// Poor man's perspective projection // Note use of 'coherent' if statement; the set of rays we
ray.dir.x = px; // trace will often all hit or all miss the scene
ray.dir.y = py; cif (isect.hit) {
ray.dir.z = -1.0; ret = ambient_occlusion(isect, plane, spheres, rngstate);
vnormalize(ray.dir); ret *= invSamples * invSamples;
isect.t = 1.0e+17; int offset = 3 * (y * w + x);
isect.hit = 0; atomic_add_local(&image[offset], ret);
atomic_add_local(&image[offset+1], ret);
for (uniform int snum = 0; snum < 3; ++snum) atomic_add_local(&image[offset+2], ret);
ray_sphere_intersect(isect, ray, spheres[snum]);
ray_plane_intersect(isect, ray, plane);
// Note use of 'coherent' if statement; the set of rays we
// trace will often all hit or all miss the scene
cif (isect.hit)
ret = ambient_occlusion(isect, plane, spheres, rngstate);
// This is a little grungy; we have results for
// programCount-worth of values. Because we're doing 2x2
// subsamples, we need to peel them off in groups of four,
// average the four values for each pixel, and update the
// output image.
//
// Store the varying value to a uniform array of the same size.
// See the discussion about communication among program
// instances in the ispc user's manual for more discussion on
// this idiom.
uniform float retArray[programCount];
retArray[programIndex] = ret;
// offset to the first pixel in the image
uniform int offset = 3 * (y * w + x);
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
// Get the four sample values for this pixel
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
retArray[p+3];
// Normalize by number of samples taken
sumret /= nsubsamples * nsubsamples;
// Store result in the image
image[offset+0] = sumret;
image[offset+1] = sumret;
image[offset+2] = sumret;
}
} }
} }
} }
@@ -331,5 +268,5 @@ static void task ao_task(uniform int width, uniform int height,
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) { uniform float image[]) {
launch[h] < ao_task(w, h, nsubsamples, image) >; launch[h] ao_task(w, h, nsubsamples, image);
} }


@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations"> <ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32"> <ProjectConfiguration Include="Debug|Win32">
@@ -26,18 +26,18 @@
<ItemGroup> <ItemGroup>
<CustomBuild Include="ao.ispc"> <CustomBuild Include="ao.ispc">
<FileType>Document</FileType> <FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild> </CustomBuild>
</ItemGroup> </ItemGroup>
<PropertyGroup Label="Globals"> <PropertyGroup Label="Globals">
@@ -86,15 +86,19 @@
<PropertyGroup Label="UserMacros" /> <PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ExecutablePath);$(ProjectDir)..\..</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile> <ClCompile>
@@ -103,6 +107,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -118,6 +123,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -135,6 +141,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
<Link> <Link>
@@ -153,6 +160,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
<Link> <Link>
@@ -165,4 +173,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@@ -2,7 +2,7 @@
CXX=g++ -m64 CXX=g++ -m64
CXXFLAGS=-Iobjs/ -g3 -Wall CXXFLAGS=-Iobjs/ -g3 -Wall
ISPC=ispc ISPC=ispc
ISPCFLAGS=-O2 --instrument --arch=x86-64 ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2
default: ao default: ao
@@ -14,13 +14,13 @@ dirs:
clean: clean:
/bin/rm -rf objs *~ ao /bin/rm -rf objs *~ ao
ao: dirs objs/ao.o objs/instrument.o objs/ao_ispc.o ao: objs/ao.o objs/instrument.o objs/ao_ispc.o ../tasksys.cpp
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread $(CXX) $(CXXFLAGS) -o $@ $^ -lm -lpthread
objs/%.o: %.cpp objs/%.o: %.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@ $(CXX) $< $(CXXFLAGS) -c -o $@
objs/ao.o: objs/ao_ispc.h objs/ao.o: objs/ao_ispc.h
objs/%_ispc.h objs/%_ispc.o: %.ispc objs/%_ispc.h objs/%_ispc.o: %.ispc dirs
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_instrumented_ispc.h


@@ -32,7 +32,6 @@
*/ */
#ifdef _MSC_VER #ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX #define NOMINMAX
#pragma warning (disable: 4244) #pragma warning (disable: 4244)
#pragma warning (disable: 4305) #pragma warning (disable: 4305)
@@ -51,12 +50,11 @@
#include <algorithm> #include <algorithm>
#include <sys/types.h> #include <sys/types.h>
#include "ao_ispc.h" #include "ao_instrumented_ispc.h"
using namespace ispc; using namespace ispc;
#include "instrument.h" #include "instrument.h"
#include "../timing.h" #include "../timing.h"
#include "../cpuid.h"
#define NSUBSAMPLES 2 #define NSUBSAMPLES 2
@@ -104,37 +102,6 @@ savePPM(const char *fname, int w, int h)
} }
// Make sure that the vector ISA used during compilation is supported by
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
// header file that we include above.
static void
ensureTargetISAIsSupported() {
#if defined(ISPC_TARGET_SSE2)
bool isaSupported = CPUSupportsSSE2();
const char *target = "SSE2";
#elif defined(ISPC_TARGET_SSE4)
bool isaSupported = CPUSupportsSSE4();
const char *target = "SSE4";
#elif defined(ISPC_TARGET_AVX)
bool isaSupported = CPUSupportsAVX();
const char *target = "AVX";
#else
#error "Unknown ISPC_TARGET_* value"
#endif
if (!isaSupported) {
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
"set, which isn't\n*** supported by this computer's CPU!\n", target);
fprintf(stderr, "***\n*** Please modify the "
#ifdef _MSC_VER
"MSVC project file "
#else
"Makefile "
#endif
"to select another target (e.g. sse2)\n***\n");
exit(1);
}
}
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
@@ -150,8 +117,6 @@ int main(int argc, char **argv)
height = atoi (argv[3]); height = atoi (argv[3]);
} }
ensureTargetISAIsSupported();
// Allocate space for output images // Allocate space for output images
img = new unsigned char[width * height * 3]; img = new unsigned char[width * height * 3];
fimg = new float[width * height * 3]; fimg = new float[width * height * 3];


@@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) {
return ret; return ret;
} }
static inline void vnormalize(reference vec v) { static inline void vnormalize(vec &v) {
float len2 = dot(v, v); float len2 = dot(v, v);
float invlen = rsqrt(len2); float invlen = rsqrt(len2);
v *= invlen; v *= invlen;
@@ -83,8 +83,7 @@ static inline void vnormalize(reference vec v) {
static inline void static inline void
ray_plane_intersect(reference Isect isect, reference Ray ray, ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
reference Plane plane) {
float d = -dot(plane.p, plane.n); float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n); float v = dot(ray.dir, plane.n);
@@ -104,8 +103,7 @@ ray_plane_intersect(reference Isect isect, reference Ray ray,
static inline void static inline void
ray_sphere_intersect(reference Isect isect, reference Ray ray, ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
reference Sphere sphere) {
vec rs = ray.org - sphere.center; vec rs = ray.org - sphere.center;
float B = dot(rs, ray.dir); float B = dot(rs, ray.dir);
@@ -127,7 +125,7 @@ ray_sphere_intersect(reference Isect isect, reference Ray ray,
static inline void static inline void
orthoBasis(reference vec basis[3], vec n) { orthoBasis(vec basis[3], vec n) {
basis[2] = n; basis[2] = n;
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
@@ -150,8 +148,8 @@ orthoBasis(reference vec basis[3], vec n) {
static inline float static inline float
ambient_occlusion(reference Isect isect, reference Plane plane, ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
reference Sphere spheres[3], reference RNGState rngstate) { RNGState &rngstate) {
float eps = 0.0001f; float eps = 0.0001f;
vec p, n; vec p, n;
vec basis[3]; vec basis[3];
@@ -168,8 +166,8 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
Ray ray; Ray ray;
Isect occIsect; Isect occIsect;
float theta = sqrt(frandom(rngstate)); float theta = sqrt(frandom(&rngstate));
float phi = 2.0f * M_PI * frandom(rngstate); float phi = 2.0f * M_PI * frandom(&rngstate);
float x = cos(phi) * theta; float x = cos(phi) * theta;
float y = sin(phi) * theta; float y = sin(phi) * theta;
float z = sqrt(1.0 - theta * theta); float z = sqrt(1.0 - theta * theta);
@@ -203,8 +201,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
/* Compute the image for the scanlines from [y0,y1), for an overall image /* Compute the image for the scanlines from [y0,y1), for an overall image
of width w and height h. of width w and height h.
*/ */
void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
uniform int nsubsamples, reference uniform float image[]) { uniform int h, uniform int nsubsamples,
uniform float image[]) {
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static Sphere spheres[3] = { static Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f }, { { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -212,7 +211,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
{ { 1.0f, 0.0f, -2.2f }, 0.5f } }; { { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate; RNGState rngstate;
seed_rng(rngstate, y0); seed_rng(&rngstate, y0);
// Compute the mapping between the 'programCount'-wide program // Compute the mapping between the 'programCount'-wide program
// instances running in parallel and samples in the image. // instances running in parallel and samples in the image.
@@ -231,6 +230,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
// direction we do per iteration and ny the number in y. // direction we do per iteration and ny the number in y.
uniform int nx = 1, ny = 1; uniform int nx = 1, ny = 1;
// FIXME: We actually need ny to be 1 regardless of the decomposition,
// since the task decomposition is one scanline high.
if (programCount == 8) { if (programCount == 8) {
// Do two pixels at once in the x direction // Do two pixels at once in the x direction
nx = 2; nx = 2;
@@ -239,19 +241,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
++du; ++du;
} }
else if (programCount == 16) { else if (programCount == 16) {
// Two at once in both x and y nx = 4;
nx = ny = 2; ny = 1;
if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12) if (programIndex >= 4 && programIndex < 8)
++du; ++du;
if (programIndex >= 8) if (programIndex >= 8 && programIndex < 12)
++dv; du += 2;
if (programIndex >= 12)
du += 3;
} }
// Now loop over all of the pixels, stepping in x and y as calculated // Now loop over all of the pixels, stepping in x and y as calculated
// above. (Assumes that ny divides y and nx divides x...) // above. (Assumes that ny divides y and nx divides x...)
for (uniform int y = y0; y < y1; y += ny) { for (uniform int y = y0; y < y1; y += ny) {
for (uniform int x = 0; x < w; x += nx) { for (uniform int x = 0; x < w; x += nx) {
// Figur out x,y pixel in NDC // Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f); float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f; float ret = 0.f;
@@ -293,7 +297,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
// offset to the first pixel in the image // offset to the first pixel in the image
uniform int offset = 3 * (y * w + x); uniform int offset = 3 * (y * w + x);
for (uniform int p = 0; p < programCount; p += 4, ++offset) { for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
// Get the four sample values for this pixel // Get the four sample values for this pixel
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] + uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
retArray[p+3]; retArray[p+3];
@@ -315,3 +319,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) { uniform float image[]) {
ao_scanlines(0, h, w, h, nsubsamples, image); ao_scanlines(0, h, w, h, nsubsamples, image);
} }
static void task ao_task(uniform int width, uniform int height,
uniform int nsubsamples, uniform float image[]) {
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
}
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
launch[h] ao_task(w, h, nsubsamples, image);
}


@@ -21,22 +21,23 @@
<ItemGroup> <ItemGroup>
<ClCompile Include="ao.cpp" /> <ClCompile Include="ao.cpp" />
<ClCompile Include="instrument.cpp" /> <ClCompile Include="instrument.cpp" />
<ClCompile Include="../tasksys.cpp" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<CustomBuild Include="ao.ispc"> <CustomBuild Include="ao.ispc">
<FileType>Document</FileType> <FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
</CustomBuild> </CustomBuild>
</ItemGroup> </ItemGroup>
<PropertyGroup Label="Globals"> <PropertyGroup Label="Globals">
@@ -85,15 +86,23 @@
<PropertyGroup Label="UserMacros" /> <PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
</PropertyGroup> </PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile> <ClCompile>
@@ -101,7 +110,8 @@
</PrecompiledHeader> </PrecompiledHeader>
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
</ClCompile> </ClCompile>
<Link> <Link>
<SubSystem>Console</SubSystem> <SubSystem>Console</SubSystem>
@@ -114,7 +124,8 @@
</PrecompiledHeader> </PrecompiledHeader>
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
</ClCompile> </ClCompile>
<Link> <Link>
<SubSystem>Console</SubSystem> <SubSystem>Console</SubSystem>
@@ -129,7 +140,8 @@
<Optimization>MaxSpeed</Optimization> <Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
</ClCompile> </ClCompile>
<Link> <Link>
<SubSystem>Console</SubSystem> <SubSystem>Console</SubSystem>
@@ -146,7 +158,8 @@
<Optimization>MaxSpeed</Optimization> <Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
</ClCompile> </ClCompile>
<Link> <Link>
<SubSystem>Console</SubSystem> <SubSystem>Console</SubSystem>
@@ -158,4 +171,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
</ImportGroup> </ImportGroup>
</Project> </Project>

examples/common.mk Normal file

@@ -0,0 +1,65 @@
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=tasksys.o
CXX=g++
CXXFLAGS=-Iobjs/ -O2 -m64
LIBS=-lm $(TASK_LIB) -lstdc++
ISPC=ispc -O2 --arch=x86-64 $(ISPC_FLAGS)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ))
default: $(EXAMPLE)
all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
objs/%.cpp objs/%.o objs/%.h: dirs
clean:
/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16
$(EXAMPLE): $(CPP_OBJS) $(ISPC_OBJS)
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/%.o: %.cpp dirs $(ISPC_HEADER)
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)
$(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h
objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp
$(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@
$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC)
$(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h
objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
$(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@
$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
$(ISPC) $< -o $@ --target=generic-1
$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
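The new common.mk drives a multi-target ispc build: the `--target=$(ISPC_TARGETS)` rule emits one object per ISA plus a dispatch object, which is why ISPC_OBJS lists both `*_ispc.o` and the per-ISA `*_ispc_sse2.o`, `*_ispc_sse4.o`, and `*_ispc_avx.o`. A minimal sketch of the kind of kernel such a build expects — the kernel and file names here are illustrative, not part of the examples:

// simple.ispc -- hypothetical kernel; any exported function behaves the same way.
// Compiling it the way common.mk does, e.g.
//   ispc -O2 --arch=x86-64 --target=sse2,sse4-x2,avx-x2 simple.ispc \
//        -o objs/simple_ispc.o -h objs/simple_ispc.h
// is expected to produce objs/simple_ispc.o (runtime dispatch) alongside the
// per-ISA objects objs/simple_ispc_sse2.o, _sse4.o, and _avx.o that ISPC_OBJS links.
export void scale(uniform float vals[], uniform int count, uniform float factor) {
    foreach (i = 0 ... count)
        vals[i] *= factor;
}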


@@ -1,42 +1,8 @@
ARCH = $(shell uname) EXAMPLE=deferred_shading
CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
ISPC_SRC=kernels.ispc
ISPC_TARGETS=sse2,sse4-x2,avx-x2
ISPC_FLAGS=--opt=fast-math
TASK_CXX=../tasks_pthreads.cpp include ../common.mk
TASK_LIB=-lpthread
ifeq ($(ARCH), Darwin)
TASK_CXX=../tasks_gcd.cpp
TASK_LIB=
endif
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64 --math-lib=fast
OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/dynamic_c.o objs/dynamic_cilk.o
default: deferred_shading
.PHONY: dirs clean
.PRECIOUS: objs/kernels_ispc.h
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ deferred_shading
deferred_shading: dirs $(OBJS) $(TASK_OBJ)
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%_ispc.h objs/%_ispc.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h


@@ -64,7 +64,7 @@
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
static void * static void *
lAlignedMalloc(int64_t size, int32_t alignment) { lAlignedMalloc(size_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS #ifdef ISPC_IS_WINDOWS
return _aligned_malloc(size, alignment); return _aligned_malloc(size, alignment);
#endif #endif
@@ -118,6 +118,7 @@ Framebuffer::clear() {
memset(b, 0, nPixels); memset(b, 0, nPixels);
} }
InputData * InputData *
CreateInputDataFromFile(const char *path) { CreateInputDataFromFile(const char *path) {
FILE *in = fopen(path, "rb"); FILE *in = fopen(path, "rb");
@@ -177,8 +178,7 @@ CreateInputDataFromFile(const char *path) {
} }
void DeleteInputData(InputData *input) void DeleteInputData(InputData *input) {
{
lAlignedFree(input->chunk); lAlignedFree(input->chunk);
} }
@@ -204,6 +204,7 @@ void WriteFrame(const char *filename, const InputData *input,
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth, fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
input->header.framebufferHeight); input->header.framebufferHeight);
fwrite(framebufferAOS, imageBytes, 1, out); fwrite(framebufferAOS, imageBytes, 1, out);
fclose(out);
lAlignedFree(framebufferAOS); lAlignedFree(framebufferAOS);
} }


@@ -64,15 +64,19 @@
<PropertyGroup Label="UserMacros" /> <PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile> <ClCompile>
@@ -81,6 +85,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -96,6 +101,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -113,6 +119,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
<Link> <Link>
@@ -131,6 +138,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
<Link> <Link>
@@ -145,23 +153,23 @@
<ClCompile Include="dynamic_c.cpp" /> <ClCompile Include="dynamic_c.cpp" />
<ClCompile Include="dynamic_cilk.cpp" /> <ClCompile Include="dynamic_cilk.cpp" />
<ClCompile Include="main.cpp" /> <ClCompile Include="main.cpp" />
<ClCompile Include="../tasks_concrt.cpp" /> <ClCompile Include="../tasksys.cpp" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<CustomBuild Include="kernels.ispc"> <CustomBuild Include="kernels.ispc">
<FileType>Document</FileType> <FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild> </CustomBuild>
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />


@@ -60,7 +60,7 @@
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1 #define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
static void * static void *
lAlignedMalloc(int64_t size, int32_t alignment) { lAlignedMalloc(size_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS #ifdef ISPC_IS_WINDOWS
return _aligned_malloc(size, alignment); return _aligned_malloc(size, alignment);
#endif #endif
@@ -141,12 +141,10 @@ ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
{ {
for (int tileX = 0; tileX < numTilesX; ++tileX) { for (int tileX = 0; tileX < numTilesX; ++tileX) {
float minZ, maxZ; float minZ, maxZ;
ComputeZBounds( ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
tileX * tileWidth, tileX * tileWidth + tileWidth, tileY * tileHeight, tileY * tileHeight + tileHeight,
tileY * tileHeight, tileY * tileHeight + tileHeight, zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
zBuffer, gBufferWidth, cameraNear, cameraFar, &minZ, &maxZ);
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
&minZ, &maxZ);
minZArray[tileX] = minZ; minZArray[tileX] = minZ;
maxZArray[tileX] = maxZ; maxZArray[tileX] = maxZ;
} }
@@ -282,8 +280,8 @@ void InitDynamicC(InputData *input) {
} }
// numLights need not be a multiple of programCount here, but the input and output arrays /* We're going to split a tile into 4 sub-tiles. This function
// should be able to handle programCount-sized load/stores. reclassifies the tile's lights with respect to the sub-tiles. */
static void static void
SplitTileMinMax( SplitTileMinMax(
int tileMidX, int tileMidY, int tileMidX, int tileMidY,
@@ -339,7 +337,7 @@ SplitTileMinMax(
float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
float light_attenuationEndNeg = -light_attenuationEnd; float light_attenuationEndNeg = -light_attenuationEnd;
// Test lights again subtile z bounds // Test lights again against subtile z bounds
bool inFrustum[4]; bool inFrustum[4];
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) && inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg); (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
@@ -414,7 +412,8 @@ Float32ToUnorm8(float f) {
} }
static inline float half_to_float_fast(uint16_t h) { static inline float
half_to_float_fast(uint16_t h) {
uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit
uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits
uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits


@@ -31,7 +31,7 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#ifdef __cilkplusplus #ifdef __cilk
#include "deferred.h" #include "deferred.h"
#include "kernels_ispc.h" #include "kernels_ispc.h"
@@ -60,7 +60,7 @@
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1 #define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
static void * static void *
lAlignedMalloc(int64_t size, int32_t alignment) { lAlignedMalloc(size_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS #ifdef ISPC_IS_WINDOWS
return _aligned_malloc(size, alignment); return _aligned_malloc(size, alignment);
#endif #endif
@@ -395,4 +395,4 @@ DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
} }
} }
#endif // __cilkplusplus #endif // __cilk


@@ -35,39 +35,37 @@
struct InputDataArrays struct InputDataArrays
{ {
uniform float zBuffer[]; float *zBuffer;
uniform unsigned int16 normalEncoded_x[]; // half float unsigned int16 *normalEncoded_x; // half float
uniform unsigned int16 normalEncoded_y[]; // half float unsigned int16 *normalEncoded_y; // half float
uniform unsigned int16 specularAmount[]; // half float unsigned int16 *specularAmount; // half float
uniform unsigned int16 specularPower[]; // half float unsigned int16 *specularPower; // half float
uniform unsigned int8 albedo_x[]; // unorm8 unsigned int8 *albedo_x; // unorm8
uniform unsigned int8 albedo_y[]; // unorm8 unsigned int8 *albedo_y; // unorm8
uniform unsigned int8 albedo_z[]; // unorm8 unsigned int8 *albedo_z; // unorm8
uniform float lightPositionView_x[]; float *lightPositionView_x;
uniform float lightPositionView_y[]; float *lightPositionView_y;
uniform float lightPositionView_z[]; float *lightPositionView_z;
uniform float lightAttenuationBegin[]; float *lightAttenuationBegin;
uniform float lightColor_x[]; float *lightColor_x;
uniform float lightColor_y[]; float *lightColor_y;
uniform float lightColor_z[]; float *lightColor_z;
uniform float lightAttenuationEnd[]; float *lightAttenuationEnd;
}; };
struct InputHeader struct InputHeader
{ {
uniform float cameraProj[4][4]; float cameraProj[4][4];
uniform float cameraNear; float cameraNear;
uniform float cameraFar; float cameraFar;
uniform int32 framebufferWidth; int32 framebufferWidth;
uniform int32 framebufferHeight; int32 framebufferHeight;
uniform int32 numLights; int32 numLights;
uniform int32 inputDataChunkSize; int32 inputDataChunkSize;
uniform int32 inputDataArrayOffsets[idaNum]; int32 inputDataArrayOffsets[idaNum];
}; };
export void foo(reference InputHeader h) { }
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// Common utility routines // Common utility routines
@@ -79,8 +77,7 @@ dot3(float x, float y, float z, float a, float b, float c) {
static inline void static inline void
normalize3(float x, float y, float z, reference float ox, normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
reference float oy, reference float oz) {
float n = rsqrt(x*x + y*y + z*z); float n = rsqrt(x*x + y*y + z*z);
ox = x * n; ox = x * n;
oy = y * n; oy = y * n;
@@ -100,7 +97,6 @@ Float32ToUnorm8(float f) {
} }
// tile width must be a multiple of programCount (SIMD size)
static void static void
ComputeZBounds( ComputeZBounds(
uniform int32 tileStartX, uniform int32 tileEndX, uniform int32 tileStartX, uniform int32 tileEndX,
@@ -112,17 +108,17 @@ ComputeZBounds(
uniform float cameraProj_33, uniform float cameraProj_43, uniform float cameraProj_33, uniform float cameraProj_43,
uniform float cameraNear, uniform float cameraFar, uniform float cameraNear, uniform float cameraFar,
// Output // Output
reference uniform float minZ, uniform float &minZ,
reference uniform float maxZ uniform float &maxZ
) )
{ {
// Find Z bounds // Find Z bounds
float laneMinZ = cameraFar; float laneMinZ = cameraFar;
float laneMaxZ = cameraNear; float laneMaxZ = cameraNear;
for (uniform int32 y = tileStartY; y < tileEndY; ++y) { for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) { foreach (x = tileStartX ... tileEndX) {
// Unproject depth buffer Z value into view space // Unproject depth buffer Z value into view space
float z = zBuffer[(y * gBufferWidth + x) + programIndex]; float z = zBuffer[y * gBufferWidth + x];
float viewSpaceZ = cameraProj_43 / (z - cameraProj_33); float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
// Work out Z bounds for our samples // Work out Z bounds for our samples
@@ -138,8 +134,6 @@ ComputeZBounds(
} }
// tile width must be a multiple of programCount (SIMD size)
// numLights must currently be a multiple of programCount (SIMD size)
export uniform int32 export uniform int32
IntersectLightsWithTileMinMax( IntersectLightsWithTileMinMax(
uniform int32 tileStartX, uniform int32 tileEndX, uniform int32 tileStartX, uniform int32 tileEndX,
@@ -158,51 +152,33 @@ IntersectLightsWithTileMinMax(
uniform float light_positionView_z_array[], uniform float light_positionView_z_array[],
uniform float light_attenuationEnd_array[], uniform float light_attenuationEnd_array[],
// Output // Output
reference uniform int32 tileLightIndices[] uniform int32 tileLightIndices[]
) )
{ {
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
// Parallize across frustum planes. uniform float frustumPlanes_xy[4] = {
// We really only have four side planes here, but write the code to -(cameraProj_11 * gBufferScale_x),
// handle programCount > 4 robustly (cameraProj_11 * gBufferScale_x),
uniform float frustumPlanes_xy[programCount]; (cameraProj_22 * gBufferScale_y),
uniform float frustumPlanes_z[programCount]; -(cameraProj_22 * gBufferScale_y) };
uniform float frustumPlanes_z[4] = {
tileEndX - gBufferScale_x,
-tileStartX + gBufferScale_x,
tileEndY - gBufferScale_y,
-tileStartY + gBufferScale_y };
// TODO: If programIndex < 4 here? Don't care about masking off the for (uniform int i = 0; i < 4; ++i) {
// rest but if interleaving ("x2" modes) the other lanes should ideally uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
// not be emitted... frustumPlanes_z[i] * frustumPlanes_z[i]);
{ frustumPlanes_xy[i] *= norm;
// This one is totally constant over the whole screen... worth pulling it up at all? frustumPlanes_z[i] *= norm;
float frustumPlanes_xy_v;
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_11 * gBufferScale_x));
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2, (cameraProj_22 * gBufferScale_y));
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));
float frustumPlanes_z_v;
frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileEndX - gBufferScale_x);
frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
frustumPlanes_z_v = insert(frustumPlanes_z_v, 2, tileEndY - gBufferScale_y);
frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);
// Normalize
float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
frustumPlanes_z_v * frustumPlanes_z_v);
frustumPlanes_xy_v *= norm;
frustumPlanes_z_v *= norm;
// Save out for uniform use later
frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
frustumPlanes_z[programIndex] = frustumPlanes_z_v;
} }
uniform int32 tileNumLights = 0; uniform int32 tileNumLights = 0;
for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights; foreach (lightIndex = 0 ... numLights) {
baseLightIndex += programCount) {
int32 lightIndex = baseLightIndex + programIndex;
float light_positionView_z = light_positionView_z_array[lightIndex]; float light_positionView_z = light_positionView_z_array[lightIndex];
float light_attenuationEnd = light_attenuationEnd_array[lightIndex]; float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
float light_attenuationEndNeg = -light_attenuationEnd; float light_attenuationEndNeg = -light_attenuationEnd;
@@ -217,32 +193,31 @@ IntersectLightsWithTileMinMax(
// don't actually need to mask the rest of this function - this is // don't actually need to mask the rest of this function - this is
// just a greedy early-out. Could also structure all of this as // just a greedy early-out. Could also structure all of this as
// nested if() statements, but this a bit easier to read // nested if() statements, but this a bit easier to read
if (!any(inFrustum)) if (any(inFrustum)) {
continue; float light_positionView_x = light_positionView_x_array[lightIndex];
float light_positionView_y = light_positionView_y_array[lightIndex];
float light_positionView_x = light_positionView_x_array[lightIndex]; d = light_positionView_z * frustumPlanes_z[0] +
float light_positionView_y = light_positionView_y_array[lightIndex]; light_positionView_x * frustumPlanes_xy[0];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[0] + d = light_positionView_z * frustumPlanes_z[1] +
light_positionView_x * frustumPlanes_xy[0]; light_positionView_x * frustumPlanes_xy[1];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[1] + d = light_positionView_z * frustumPlanes_z[2] +
light_positionView_x * frustumPlanes_xy[1]; light_positionView_y * frustumPlanes_xy[2];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[2] + d = light_positionView_z * frustumPlanes_z[3] +
light_positionView_y * frustumPlanes_xy[2]; light_positionView_y * frustumPlanes_xy[3];
inFrustum = inFrustum && (d >= light_attenuationEndNeg); inFrustum = inFrustum && (d >= light_attenuationEndNeg);
d = light_positionView_z * frustumPlanes_z[3] +
light_positionView_y * frustumPlanes_xy[3];
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
// Pack and store intersecting lights // Pack and store intersecting lights
cif (inFrustum) { cif (inFrustum) {
tileNumLights += packed_store_active(tileLightIndices, tileNumLights, tileNumLights += packed_store_active(&tileLightIndices[tileNumLights],
lightIndex); lightIndex);
}
} }
} }
@@ -250,8 +225,6 @@ IntersectLightsWithTileMinMax(
} }
// tile width must be a multiple of programCount (SIMD size)
// numLights must currently be a multiple of programCount (SIMD size)
static uniform int32 static uniform int32
IntersectLightsWithTile( IntersectLightsWithTile(
uniform int32 tileStartX, uniform int32 tileEndX, uniform int32 tileStartX, uniform int32 tileEndX,
@@ -270,7 +243,7 @@ IntersectLightsWithTile(
uniform float light_positionView_z_array[], uniform float light_positionView_z_array[],
uniform float light_attenuationEnd_array[], uniform float light_attenuationEnd_array[],
// Output // Output
reference uniform int32 tileLightIndices[] uniform int32 tileLightIndices[]
) )
{ {
uniform float minZ, maxZ; uniform float minZ, maxZ;
@@ -289,32 +262,31 @@ IntersectLightsWithTile(
} }
// tile width must be a multiple of programCount (SIMD size)
export void export void
ShadeTile( ShadeTile(
uniform int32 tileStartX, uniform int32 tileEndX, uniform int32 tileStartX, uniform int32 tileEndX,
uniform int32 tileStartY, uniform int32 tileEndY, uniform int32 tileStartY, uniform int32 tileEndY,
uniform int32 gBufferWidth, uniform int32 gBufferHeight, uniform int32 gBufferWidth, uniform int32 gBufferHeight,
reference uniform InputDataArrays inputData, uniform InputDataArrays &inputData,
// Camera data // Camera data
uniform float cameraProj_11, uniform float cameraProj_22, uniform float cameraProj_11, uniform float cameraProj_22,
uniform float cameraProj_33, uniform float cameraProj_43, uniform float cameraProj_33, uniform float cameraProj_43,
// Light list // Light list
reference uniform int32 tileLightIndices[], uniform int32 tileLightIndices[],
uniform int32 tileNumLights, uniform int32 tileNumLights,
// UI // UI
uniform bool visualizeLightCount, uniform bool visualizeLightCount,
// Output // Output
reference uniform unsigned int8 framebuffer_r[], uniform unsigned int8 framebuffer_r[],
reference uniform unsigned int8 framebuffer_g[], uniform unsigned int8 framebuffer_g[],
reference uniform unsigned int8 framebuffer_b[] uniform unsigned int8 framebuffer_b[]
) )
{ {
if (tileNumLights == 0 || visualizeLightCount) { if (tileNumLights == 0 || visualizeLightCount) {
uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255)); uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
for (uniform int32 y = tileStartY; y < tileEndY; ++y) { for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) { foreach (x = tileStartX ... tileEndX) {
int32 framebufferIndex = (y * gBufferWidth + x) + programIndex; int32 framebufferIndex = (y * gBufferWidth + x);
framebuffer_r[framebufferIndex] = c; framebuffer_r[framebufferIndex] = c;
framebuffer_g[framebufferIndex] = c; framebuffer_g[framebufferIndex] = c;
framebuffer_b[framebufferIndex] = c; framebuffer_b[framebufferIndex] = c;
@@ -327,9 +299,8 @@ ShadeTile(
for (uniform int32 y = tileStartY; y < tileEndY; ++y) { for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f); uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) { foreach (x = tileStartX ... tileEndX) {
uniform int32 gBufferOffsetBase = y * gBufferWidth + x; int32 gBufferOffset = y * gBufferWidth + x;
int32 gBufferOffset = gBufferOffsetBase + programIndex;
// Reconstruct position and (negative) view vector from G-buffer // Reconstruct position and (negative) view vector from G-buffer
float surface_positionView_x, surface_positionView_y, surface_positionView_z; float surface_positionView_x, surface_positionView_y, surface_positionView_z;
@@ -339,7 +310,7 @@ ShadeTile(
// Compute screen/clip-space position // Compute screen/clip-space position
// NOTE: Mind DX11 viewport transform and pixel center! // NOTE: Mind DX11 viewport transform and pixel center!
float positionScreen_x = (0.5f + (float)(x + programIndex)) * float positionScreen_x = (0.5f + (float)(x)) *
twoOverGBufferWidth - 1.0f; twoOverGBufferWidth - 1.0f;
// Unproject depth buffer Z value into view space // Unproject depth buffer Z value into view space
@@ -356,8 +327,8 @@ ShadeTile(
// Reconstruct normal from G-buffer // Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z; float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]); float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]); float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y); float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrt(4.0f * f - 1.0f); float m = sqrt(4.0f * f - 1.0f);
@@ -368,9 +339,9 @@ ShadeTile(
// Load other G-buffer parameters // Load other G-buffer parameters
float surface_specularAmount = float surface_specularAmount =
half_to_float_fast(inputData.specularAmount[gBufferOffset]); half_to_float(inputData.specularAmount[gBufferOffset]);
float surface_specularPower = float surface_specularPower =
half_to_float_fast(inputData.specularPower[gBufferOffset]); half_to_float(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]); float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]); float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]); float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
@@ -479,24 +450,21 @@ ShadeTile(
// Static decomposition // Static decomposition
task void task void
RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y, RenderTile(uniform int num_groups_x, uniform int num_groups_y,
reference uniform InputHeader inputHeader, uniform InputHeader &inputHeader,
reference uniform InputDataArrays inputData, uniform InputDataArrays &inputData,
uniform int visualizeLightCount, uniform int visualizeLightCount,
// Output // Output
reference uniform unsigned int8 framebuffer_r[], uniform unsigned int8 framebuffer_r[],
reference uniform unsigned int8 framebuffer_g[], uniform unsigned int8 framebuffer_g[],
reference uniform unsigned int8 framebuffer_b[]) { uniform unsigned int8 framebuffer_b[]) {
uniform int32 group_y = g / num_groups_x; uniform int32 group_y = taskIndex / num_groups_x;
uniform int32 group_x = g % num_groups_x; uniform int32 group_x = taskIndex % num_groups_x;
uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH; uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT; uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH; uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT; uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
uniform int sTileNumLights = 0;
uniform int sTileLightIndices[MAX_LIGHTS]; // Light list for the tile
uniform int framebufferWidth = inputHeader.framebufferWidth; uniform int framebufferWidth = inputHeader.framebufferWidth;
uniform int framebufferHeight = inputHeader.framebufferHeight; uniform int framebufferHeight = inputHeader.framebufferHeight;
uniform float cameraProj_00 = inputHeader.cameraProj[0][0]; uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
@@ -504,8 +472,9 @@ RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
uniform float cameraProj_22 = inputHeader.cameraProj[2][2]; uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
uniform float cameraProj_32 = inputHeader.cameraProj[3][2]; uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
// Light intersection // Light intersection: figure out which lights illuminate this tile.
sTileNumLights = uniform int tileLightIndices[MAX_LIGHTS]; // Light list for the tile
uniform int numTileLights =
IntersectLightsWithTile(tile_start_x, tile_end_x, IntersectLightsWithTile(tile_start_x, tile_end_x,
tile_start_y, tile_end_y, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, framebufferWidth, framebufferHeight,
@@ -518,41 +487,43 @@ RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
inputData.lightPositionView_y, inputData.lightPositionView_y,
inputData.lightPositionView_z, inputData.lightPositionView_z,
inputData.lightAttenuationEnd, inputData.lightAttenuationEnd,
sTileLightIndices); tileLightIndices);
// And now shade the tile, using the lights in tileLightIndices
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y, ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
framebufferWidth, framebufferHeight, inputData, framebufferWidth, framebufferHeight, inputData,
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32, cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
sTileLightIndices, sTileNumLights, visualizeLightCount, tileLightIndices, numTileLights, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b); framebuffer_r, framebuffer_g, framebuffer_b);
} }
export void export void
RenderStatic(reference uniform InputHeader inputHeader, RenderStatic(uniform InputHeader &inputHeader,
reference uniform InputDataArrays inputData, uniform InputDataArrays &inputData,
uniform int visualizeLightCount, uniform int visualizeLightCount,
// Output // Output
reference uniform unsigned int8 framebuffer_r[], uniform unsigned int8 framebuffer_r[],
reference uniform unsigned int8 framebuffer_g[], uniform unsigned int8 framebuffer_g[],
reference uniform unsigned int8 framebuffer_b[]) { uniform unsigned int8 framebuffer_b[]) {
uniform int num_groups_x = (inputHeader.framebufferWidth + uniform int num_groups_x = (inputHeader.framebufferWidth +
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
uniform int num_groups_y = (inputHeader.framebufferHeight + uniform int num_groups_y = (inputHeader.framebufferHeight +
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT; MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
uniform int num_groups = num_groups_x * num_groups_y; uniform int num_groups = num_groups_x * num_groups_y;
for (uniform int g = 0; g < num_groups; ++g) // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
launch < RenderTile(g, num_groups_x, num_groups_y, // by MIN_TILE_HEIGHT pixels.
inputHeader, inputData, visualizeLightCount, launch[num_groups] RenderTile(num_groups_x, num_groups_y,
framebuffer_r, framebuffer_g, framebuffer_b) >; inputHeader, inputData, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
} }
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// Routines for dynamic decomposition path // Routines for dynamic decomposition path
// tile width must be a multiple of programCount (SIMD size) // This computes the z min/max range for a whole row worth of tiles.
export void export void
ComputeZBoundsRow( ComputeZBoundsRow(
uniform int32 tileY, uniform int32 tileY,
@@ -565,8 +536,8 @@ ComputeZBoundsRow(
uniform float cameraProj_33, uniform float cameraProj_43, uniform float cameraProj_33, uniform float cameraProj_43,
uniform float cameraNear, uniform float cameraFar, uniform float cameraNear, uniform float cameraFar,
// Output // Output
reference uniform float minZArray[], uniform float minZArray[],
reference uniform float maxZArray[] uniform float maxZArray[]
) )
{ {
for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) { for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
@@ -583,6 +554,7 @@ ComputeZBoundsRow(
} }
// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
// numLights need not be a multiple of programCount here, but the input and output arrays // numLights need not be a multiple of programCount here, but the input and output arrays
// should be able to handle programCount-sized load/stores. // should be able to handle programCount-sized load/stores.
export void export void
@@ -596,47 +568,35 @@ SplitTileMinMax(
// Camera data // Camera data
uniform float cameraProj_11, uniform float cameraProj_22, uniform float cameraProj_11, uniform float cameraProj_22,
// Light Data // Light Data
reference uniform int32 lightIndices[], uniform int32 lightIndices[],
uniform int32 numLights, uniform int32 numLights,
uniform float light_positionView_x_array[], uniform float light_positionView_x_array[],
uniform float light_positionView_y_array[], uniform float light_positionView_y_array[],
uniform float light_positionView_z_array[], uniform float light_positionView_z_array[],
uniform float light_attenuationEnd_array[], uniform float light_attenuationEnd_array[],
// Outputs // Outputs
// TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the uniform int32 subtileIndices[],
// indexing math ourselves
reference uniform int32 subtileIndices[],
uniform int32 subtileIndicesPitch, uniform int32 subtileIndicesPitch,
reference uniform int32 subtileNumLights[] uniform int32 subtileNumLights[]
) )
{ {
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
// Parallize across frustum planes uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
// Only have 2 frustum split planes here so may not be worth it, but (cameraProj_22 * gBufferScale_y) };
// we'll do it for now for consistency uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
uniform float frustumPlanes_xy[programCount]; tileMidY - gBufferScale_y };
uniform float frustumPlanes_z[programCount];
// This one is totally constant over the whole screen... worth pulling it up at all?
float frustumPlanes_xy_v;
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_22 * gBufferScale_y));
float frustumPlanes_z_v;
frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
// Normalize // Normalize
float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
frustumPlanes_z_v * frustumPlanes_z_v); frustumPlanes_z[0] * frustumPlanes_z[0]),
frustumPlanes_xy_v *= norm; rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
frustumPlanes_z_v *= norm; frustumPlanes_z[1] * frustumPlanes_z[1]) };
frustumPlanes_xy[0] *= norm[0];
// Save out for uniform use later frustumPlanes_xy[1] *= norm[1];
frustumPlanes_xy[programIndex] = frustumPlanes_xy_v; frustumPlanes_z[0] *= norm[0];
frustumPlanes_z[programIndex] = frustumPlanes_z_v; frustumPlanes_z[1] *= norm[1];
// Initialize // Initialize
uniform int32 subtileLightOffset[4]; uniform int32 subtileLightOffset[4];
@@ -645,12 +605,7 @@ SplitTileMinMax(
subtileLightOffset[2] = 2 * subtileIndicesPitch; subtileLightOffset[2] = 2 * subtileIndicesPitch;
subtileLightOffset[3] = 3 * subtileIndicesPitch; subtileLightOffset[3] = 3 * subtileIndicesPitch;
for (int32 i = programIndex; i < numLights; i += programCount) { foreach (i = 0 ... numLights) {
// TODO: ISPC says gather required here when it actually
// isn't... this could be fixed this by nesting an if() within a
// uniform loop, but I'm not totally sure if that's a win
// overall. For now we'll just eat the perf cost for cleanliness
// since the below are real gathers anyways.
int32 lightIndex = lightIndices[i]; int32 lightIndex = lightIndices[i];
float light_positionView_x = light_positionView_x_array[lightIndex]; float light_positionView_x = light_positionView_x_array[lightIndex];
@@ -693,21 +648,21 @@ SplitTileMinMax(
// Pack and store intersecting lights // Pack and store intersecting lights
// TODO: Experiment with a loop here instead // TODO: Experiment with a loop here instead
cif (inFrustum[0]) cif (inFrustum[0])
subtileLightOffset[0] += packed_store_active(subtileIndices, subtileLightOffset[0] +=
subtileLightOffset[0], packed_store_active(&subtileIndices[subtileLightOffset[0]],
lightIndex); lightIndex);
cif (inFrustum[1]) cif (inFrustum[1])
subtileLightOffset[1] += packed_store_active(subtileIndices, subtileLightOffset[1] +=
subtileLightOffset[1], packed_store_active(&subtileIndices[subtileLightOffset[1]],
lightIndex); lightIndex);
cif (inFrustum[2]) cif (inFrustum[2])
subtileLightOffset[2] += packed_store_active(subtileIndices, subtileLightOffset[2] +=
subtileLightOffset[2], packed_store_active(&subtileIndices[subtileLightOffset[2]],
lightIndex); lightIndex);
cif (inFrustum[3]) cif (inFrustum[3])
subtileLightOffset[3] += packed_store_active(subtileIndices, subtileLightOffset[3] +=
subtileLightOffset[3], packed_store_active(&subtileIndices[subtileLightOffset[3]],
lightIndex); lightIndex);
} }
subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch; subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
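Besides the foreach conversions, the RenderTile/RenderStatic hunk above replaces the loop that launched tasks one at a time with an explicit index argument (`launch < RenderTile(g, ...) >`) by a single `launch[num_groups]` statement, with each task recovering its tile from the built-in `taskIndex`. A stripped-down sketch of that idiom, with hypothetical names and made-up work:

// Illustrative only -- the launch[]/taskIndex pattern mirrors the
// RenderTile/RenderStatic change above; the tile work is omitted.
task void ProcessTile(uniform int numTilesX, uniform float image[]) {
    // taskIndex tells each launched task instance which tile it owns.
    uniform int tileY = taskIndex / numTilesX;
    uniform int tileX = taskIndex % numTilesX;
    // ... operate on tile (tileX, tileY) of image ...
}

export void ProcessAll(uniform int numTilesX, uniform int numTilesY,
                       uniform float image[]) {
    uniform int numTasks = numTilesX * numTilesY;
    // One statement launches numTasks tasks; no explicit loop or
    // hand-passed task index is needed any more.
    launch[numTasks] ProcessTile(numTilesX, image);
}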


@@ -63,7 +63,7 @@
int main(int argc, char** argv) { int main(int argc, char** argv) {
if (argc != 2) { if (argc != 2) {
printf("usage: deferred_shading <input_file>\n"); printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
return 1; return 1;
} }
@@ -77,9 +77,9 @@ int main(int argc, char** argv) {
input->header.framebufferHeight); input->header.framebufferHeight);
InitDynamicC(input); InitDynamicC(input);
#ifdef __cilkplusplus #ifdef __cilk
InitDynamicCilk(input); InitDynamicCilk(input);
#endif // __cilkplusplus #endif // __cilk
int nframes = 5; int nframes = 5;
double ispcCycles = 1e30; double ispcCycles = 1e30;
@@ -98,20 +98,7 @@ int main(int argc, char** argv) {
input->header.framebufferWidth, input->header.framebufferHeight); input->header.framebufferWidth, input->header.framebufferHeight);
WriteFrame("deferred-ispc-static.ppm", input, framebuffer); WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
double serialCycles = 1e30; #ifdef __cilk
for (int i = 0; i < 5; ++i) {
framebuffer.clear();
reset_and_start_timer();
for (int j = 0; j < nframes; ++j)
DispatchDynamicC(input, &framebuffer);
double mcycles = get_elapsed_mcycles() / nframes;
serialCycles = std::min(serialCycles, mcycles);
}
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n",
serialCycles);
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
#ifdef __cilkplusplus
double dynamicCilkCycles = 1e30; double dynamicCilkCycles = 1e30;
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
framebuffer.clear(); framebuffer.clear();
@@ -121,15 +108,30 @@ int main(int argc, char** argv) {
double mcycles = get_elapsed_mcycles() / nframes; double mcycles = get_elapsed_mcycles() / nframes;
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles); dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
} }
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n", printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n",
dynamicCilkCycles); dynamicCilkCycles);
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer); WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
#endif // __cilk
double serialCycles = 1e30;
for (int i = 0; i < 5; ++i) {
framebuffer.clear();
reset_and_start_timer();
for (int j = 0; j < nframes; ++j)
DispatchDynamicC(input, &framebuffer);
double mcycles = get_elapsed_mcycles() / nframes;
serialCycles = std::min(serialCycles, mcycles);
}
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n",
serialCycles);
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
#ifdef __cilk
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles); serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
#else #else
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles); printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
#endif // __cilkplusplus #endif // __cilk
DeleteInputData(input); DeleteInputData(input);


@@ -23,6 +23,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.
EndProject EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}" Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
EndProject EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "perfbench", "perfbench\perfbench.vcxproj", "{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}"
EndProject
Global Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32 Debug|Win32 = Debug|Win32
@@ -119,6 +121,14 @@ Global
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32 {87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64 {87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64 {87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.ActiveCfg = Debug|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.Build.0 = Debug|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.ActiveCfg = Debug|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.Build.0 = Debug|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.ActiveCfg = Release|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.Build.0 = Release|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.ActiveCfg = Release|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.Build.0 = Release|x64
EndGlobalSection EndGlobalSection
GlobalSection(SolutionProperties) = preSolution GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE HideSolutionNode = FALSE

File diff suppressed because it is too large

examples/intrinsics/sse4.h Normal file

File diff suppressed because it is too large


@@ -1,26 +1,7 @@
CXX=g++ -m64 EXAMPLE=mandelbrot
CXXFLAGS=-Iobjs/ -O3 -Wall CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
ISPC=ispc ISPC_SRC=mandelbrot.ispc
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64 ISPC_TARGETS=sse2,sse4-x2,avx-x2
default: mandelbrot include ../common.mk
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ mandelbrot
mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o
$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o -lm
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/mandelbrot.o: objs/mandelbrot_ispc.h
objs/%_ispc.h objs/%_ispc.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h


@@ -41,7 +41,6 @@
#include <stdio.h> #include <stdio.h>
#include <algorithm> #include <algorithm>
#include "../timing.h" #include "../timing.h"
#include "../cpuid.h"
#include "mandelbrot_ispc.h" #include "mandelbrot_ispc.h"
using namespace ispc; using namespace ispc;
@@ -68,38 +67,6 @@ writePPM(int *buf, int width, int height, const char *fn) {
} }
// Make sure that the vector ISA used during compilation is supported by
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
// header file that we include above.
static void
ensureTargetISAIsSupported() {
#if defined(ISPC_TARGET_SSE2)
bool isaSupported = CPUSupportsSSE2();
const char *target = "SSE2";
#elif defined(ISPC_TARGET_SSE4)
bool isaSupported = CPUSupportsSSE4();
const char *target = "SSE4";
#elif defined(ISPC_TARGET_AVX)
bool isaSupported = CPUSupportsAVX();
const char *target = "AVX";
#else
#error "Unknown ISPC_TARGET_* value"
#endif
if (!isaSupported) {
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
"set, which isn't\n*** supported by this computer's CPU!\n", target);
fprintf(stderr, "***\n*** Please modify the "
#ifdef _MSC_VER
"MSVC project file "
#else
"Makefile "
#endif
"to select another target (e.g. sse2)\n***\n");
exit(1);
}
}
int main() { int main() {
unsigned int width = 768; unsigned int width = 768;
unsigned int height = 512; unsigned int height = 512;
@@ -111,8 +78,6 @@ int main() {
int maxIterations = 256; int maxIterations = 256;
int *buf = new int[width*height]; int *buf = new int[width*height];
ensureTargetISAIsSupported();
// //
// Compute the image using the ispc implementation; report the minimum // Compute the image using the ispc implementation; report the minimum
// time of three runs. // time of three runs.


@@ -51,7 +51,7 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
uniform float x1, uniform float y1, uniform float x1, uniform float y1,
uniform int width, uniform int height, uniform int width, uniform int height,
uniform int maxIterations, uniform int maxIterations,
reference uniform int output[]) uniform int output[])
{ {
float dx = (x1 - x0) / width; float dx = (x1 - x0) / width;
float dy = (y1 - y0) / height; float dy = (y1 - y0) / height;
@@ -60,16 +60,16 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
// Note that we'll be doing programCount computations in parallel, // Note that we'll be doing programCount computations in parallel,
// so increment i by that much. This assumes that width evenly // so increment i by that much. This assumes that width evenly
// divides programCount. // divides programCount.
for (uniform int i = 0; i < width; i += programCount) { foreach (i = 0 ... width) {
// Figure out the position on the complex plane to compute the // Figure out the position on the complex plane to compute the
// number of iterations at. Note that the x values are // number of iterations at. Note that the x values are
// different across different program instances, since its // different across different program instances, since its
// initializer incorporates the value of the programIndex // initializer incorporates the value of the programIndex
// variable. // variable.
float x = x0 + (programIndex + i) * dx; float x = x0 + i * dx;
float y = y0 + j * dy; float y = y0 + j * dy;
int index = j * width + i + programIndex; int index = j * width + i;
output[index] = mandel(x, y, maxIterations); output[index] = mandel(x, y, maxIterations);
} }
} }
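The mandelbrot kernel change above is the same pattern applied throughout these diffs: the `reference uniform int output[]` parameter becomes a plain `uniform int output[]`, and the loop that manually strode by programCount and added programIndex becomes a `foreach` loop, which also drops the old assumption that width is a multiple of the gang size. A minimal standalone sketch of the before/after shape — a hypothetical kernel, not the mandelbrot code:

// Old style (the form the diffs remove):
//   for (uniform int i = 0; i < count; i += programCount) {
//       int index = i + programIndex;
//       dst[index] = src[index] * 2;
//   }
// New style: foreach hands a varying range of indices to the gang and masks
// the final iteration, so count need not be a multiple of programCount.
export void double_all(uniform float src[], uniform float dst[], uniform int count) {
    foreach (i = 0 ... count) {
        dst[i] = src[i] * 2;   // 'i' is varying; each program instance gets its own value
    }
}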


@@ -64,15 +64,19 @@
<PropertyGroup Label="UserMacros" /> <PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile> <ClCompile>
@@ -81,6 +85,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -96,6 +101,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -113,6 +119,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
<Link> <Link>
@@ -131,6 +138,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
<Link> <Link>
@@ -147,18 +155,18 @@
<ItemGroup> <ItemGroup>
<CustomBuild Include="mandelbrot.ispc"> <CustomBuild Include="mandelbrot.ispc">
<FileType>Document</FileType> <FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild> </CustomBuild>
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

View File

@@ -1,35 +1,7 @@
ARCH = $(shell uname) EXAMPLE=mandelbrot
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
ISPC_SRC=mandelbrot.ispc
ISPC_TARGETS=sse2,sse4-x2,avx-x2
TASK_CXX=../tasksys.cpp include ../common.mk
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
default: mandelbrot
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ mandelbrot
mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o $(TASK_OBJ)
$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/mandelbrot.o: objs/mandelbrot_ispc.h
objs/%_ispc.h objs/%_ispc.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h

View File

@@ -42,7 +42,6 @@
#include <algorithm> #include <algorithm>
#include <string.h> #include <string.h>
#include "../timing.h" #include "../timing.h"
#include "../cpuid.h"
#include "mandelbrot_ispc.h" #include "mandelbrot_ispc.h"
using namespace ispc; using namespace ispc;
@@ -69,37 +68,6 @@ writePPM(int *buf, int width, int height, const char *fn) {
} }
// Make sure that the vector ISA used during compilation is supported by
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
// header file that we include above.
static void
ensureTargetISAIsSupported() {
#if defined(ISPC_TARGET_SSE2)
bool isaSupported = CPUSupportsSSE2();
const char *target = "SSE2";
#elif defined(ISPC_TARGET_SSE4)
bool isaSupported = CPUSupportsSSE4();
const char *target = "SSE4";
#elif defined(ISPC_TARGET_AVX)
bool isaSupported = CPUSupportsAVX();
const char *target = "AVX";
#else
#error "Unknown ISPC_TARGET_* value"
#endif
if (!isaSupported) {
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
"set, which isn't\n*** supported by this computer's CPU!\n", target);
fprintf(stderr, "***\n*** Please modify the "
#ifdef _MSC_VER
"MSVC project file "
#else
"Makefile "
#endif
"to select another target (e.g. sse2)\n***\n");
exit(1);
}
}
static void usage() { static void usage() {
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n"); fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
exit(1); exit(1);
@@ -132,8 +100,6 @@ int main(int argc, char *argv[]) {
else else
usage(); usage();
ensureTargetISAIsSupported();
int maxIterations = 512; int maxIterations = 512;
int *buf = new int[width*height]; int *buf = new int[width*height];

View File

@@ -49,52 +49,36 @@ mandel(float c_re, float c_im, int count) {
} }
/* Task to compute the Mandelbrot iterations for a span of scanlines from /* Task to compute the Mandelbrot iterations for a single scanline.
[ystart,yend).
*/ */
task void task void
mandelbrot_scanlines(uniform int ybase, uniform int span, mandelbrot_scanline(uniform float x0, uniform float dx,
uniform float x0, uniform float dx, uniform float y0, uniform float dy,
uniform float y0, uniform float dy, uniform int width, uniform int height,
uniform int width, uniform int maxIterations, uniform int span,
reference uniform int output[]) { uniform int maxIterations, uniform int output[]) {
uniform int ystart = ybase + taskIndex * span; uniform int ystart = taskIndex * span;
uniform int yend = ystart + span; uniform int yend = min((taskIndex+1) * span, (unsigned int)height);
for (uniform int j = ystart; j < yend; ++j) { foreach (yi = ystart ... yend, xi = 0 ... width) {
for (uniform int i = 0; i < width; i += programCount) { float x = x0 + xi * dx;
float x = x0 + (programIndex + i) * dx; float y = y0 + yi * dy;
float y = y0 + j * dy;
int index = j * width + i + programIndex; int index = yi * width + xi;
output[index] = mandel(x, y, maxIterations); output[index] = mandel(x, y, maxIterations);
}
} }
} }
task void
mandelbrot_chunk(uniform float x0, uniform float dx,
uniform float y0, uniform float dy,
uniform int width, uniform int height,
uniform int maxIterations, reference uniform int output[]) {
uniform int ystart = taskIndex * (height/taskCount);
uniform int yend = (taskIndex+1) * (height/taskCount);
uniform int span = 1;
launch[(yend-ystart)/span] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
width, maxIterations, output) >;
}
export void export void
mandelbrot_ispc(uniform float x0, uniform float y0, mandelbrot_ispc(uniform float x0, uniform float y0,
uniform float x1, uniform float y1, uniform float x1, uniform float y1,
uniform int width, uniform int height, uniform int width, uniform int height,
uniform int maxIterations, reference uniform int output[]) { uniform int maxIterations, uniform int output[]) {
uniform float dx = (x1 - x0) / width; uniform float dx = (x1 - x0) / width;
uniform float dy = (y1 - y0) / height; uniform float dy = (y1 - y0) / height;
uniform int span = 4;
launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height, launch[height/span] mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
maxIterations, output) >; maxIterations, output);
} }
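
Editor's note: the restructuring above flattens the old two-level chunk/scanline launch into a single launch over scanline groups, using the newer launch[n] kernel(args); form (the angle-bracketed launch[n] < ... >; syntax is gone) and letting each task derive its own row range from taskIndex and taskCount. A minimal standalone sketch of that pattern, with made-up names and a made-up grain size:

    task void scale_task(uniform float a[], uniform int count,
                         uniform float s) {
        // Each task claims one contiguous slice of the array.
        uniform int perTask = (count + taskCount - 1) / taskCount;
        uniform int start = taskIndex * perTask;
        uniform int end = min(start + perTask, count);
        foreach (i = start ... end)
            a[i] *= s;
    }

    export void scale(uniform float a[], uniform int count, uniform float s) {
        // Launch enough tasks to keep the cores busy; the exported
        // function implicitly syncs on its launched tasks before returning.
        launch[max(1, count / 16384)] scale_task(a, count, s);
    }
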

View File

@@ -64,15 +64,19 @@
<PropertyGroup Label="UserMacros" /> <PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile> <ClCompile>
@@ -81,6 +85,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -96,6 +101,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -113,6 +119,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
<Link> <Link>
@@ -131,6 +138,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
<Link> <Link>
@@ -148,18 +156,18 @@
<ItemGroup> <ItemGroup>
<CustomBuild Include="mandelbrot.ispc"> <CustomBuild Include="mandelbrot.ispc">
<FileType>Document</FileType> <FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild> </CustomBuild>
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

View File

@@ -1,26 +1,7 @@
CXX=g++ -m64 EXAMPLE=noise
CXXFLAGS=-Iobjs/ -O3 -Wall CPP_SRC=$(EXAMPLE).cpp $(EXAMPLE)_serial.cpp
ISPC=ispc ISPC_SRC=noise.ispc
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64 ISPC_TARGETS=sse2,sse4,avx-x2
default: noise include ../common.mk
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ noise
noise: dirs objs/noise.o objs/noise_serial.o objs/noise_ispc.o
$(CXX) $(CXXFLAGS) -o $@ objs/noise.o objs/noise_ispc.o objs/noise_serial.o -lm
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/noise.o: objs/noise_ispc.h
objs/%_ispc.h objs/%_ispc.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h

View File

@@ -41,7 +41,6 @@
#include <stdio.h> #include <stdio.h>
#include <algorithm> #include <algorithm>
#include "../timing.h" #include "../timing.h"
#include "../cpuid.h"
#include "noise_ispc.h" #include "noise_ispc.h"
using namespace ispc; using namespace ispc;
@@ -66,38 +65,6 @@ writePPM(float *buf, int width, int height, const char *fn) {
} }
// Make sure that the vector ISA used during compilation is supported by
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
// header file that we include above.
static void
ensureTargetISAIsSupported() {
#if defined(ISPC_TARGET_SSE2)
bool isaSupported = CPUSupportsSSE2();
const char *target = "SSE2";
#elif defined(ISPC_TARGET_SSE4)
bool isaSupported = CPUSupportsSSE4();
const char *target = "SSE4";
#elif defined(ISPC_TARGET_AVX)
bool isaSupported = CPUSupportsAVX();
const char *target = "AVX";
#else
#error "Unknown ISPC_TARGET_* value"
#endif
if (!isaSupported) {
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
"set, which isn't\n*** supported by this computer's CPU!\n", target);
fprintf(stderr, "***\n*** Please modify the "
#ifdef _MSC_VER
"MSVC project file "
#else
"Makefile "
#endif
"to select another target (e.g. sse2)\n***\n");
exit(1);
}
}
int main() { int main() {
unsigned int width = 768; unsigned int width = 768;
unsigned int height = 768; unsigned int height = 768;
@@ -108,8 +75,6 @@ int main() {
float *buf = new float[width*height]; float *buf = new float[width*height];
ensureTargetISAIsSupported();
// //
// Compute the image using the ispc implementation; report the minimum // Compute the image using the ispc implementation; report the minimum
// time of three runs. // time of three runs.

View File

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations"> <ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32"> <ProjectConfiguration Include="Debug|Win32">
@@ -64,15 +64,19 @@
<PropertyGroup Label="UserMacros" /> <PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile> <ClCompile>
@@ -81,6 +85,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -96,6 +101,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -113,6 +119,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
<Link> <Link>
@@ -131,6 +138,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
<Link> <Link>
@@ -147,18 +155,18 @@
<ItemGroup> <ItemGroup>
<CustomBuild Include="noise.ispc"> <CustomBuild Include="noise.ispc">
<FileType>Document</FileType> <FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4 <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4 <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild> </CustomBuild>
</ItemGroup> </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

View File

@@ -1,26 +1,7 @@
CXX=g++ -m64 EXAMPLE=options
CXXFLAGS=-Iobjs/ -g -Wall CPP_SRC=options.cpp options_serial.cpp
ISPC=ispc ISPC_SRC=options.ispc
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64 ISPC_TARGETS=sse2,sse4-x2,avx-x2
default: options include ../common.mk
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ options
options: dirs objs/options.o objs/options_serial.o objs/options_ispc.o
$(CXX) $(CXXFLAGS) -o $@ objs/options.o objs/options_ispc.o objs/options_serial.o -lm
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/options.o: objs/options_ispc.h options_defs.h
objs/%_ispc.h objs/%_ispc.o: %.ispc options_defs.h
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h

View File

@@ -31,6 +31,8 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#define NOMINMAX
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
@@ -41,7 +43,6 @@ using std::max;
#include "options_defs.h" #include "options_defs.h"
#include "../timing.h" #include "../timing.h"
#include "../cpuid.h"
#include "options_ispc.h" #include "options_ispc.h"
using namespace ispc; using namespace ispc;
@@ -54,49 +55,32 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
float ra[], float va[], float ra[], float va[],
float result[], int count); float result[], int count);
// Make sure that the vector ISA used during compilation is supported by static void usage() {
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated printf("usage: options [--count=<num options>]\n");
// header file that we include above.
static void
ensureTargetISAIsSupported() {
#if defined(ISPC_TARGET_SSE2)
bool isaSupported = CPUSupportsSSE2();
const char *target = "SSE2";
#elif defined(ISPC_TARGET_SSE4)
bool isaSupported = CPUSupportsSSE4();
const char *target = "SSE4";
#elif defined(ISPC_TARGET_AVX)
bool isaSupported = CPUSupportsAVX();
const char *target = "AVX";
#else
#error "Unknown ISPC_TARGET_* value"
#endif
if (!isaSupported) {
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
"set, which isn't\n*** supported by this computer's CPU!\n", target);
fprintf(stderr, "***\n*** Please modify the "
#ifdef _MSC_VER
"MSVC project file "
#else
"Makefile "
#endif
"to select another target (e.g. sse2)\n***\n");
exit(1);
}
} }
int main() { int main(int argc, char *argv[]) {
ensureTargetISAIsSupported(); int nOptions = 128*1024;
float *S = new float[N_OPTIONS];
float *X = new float[N_OPTIONS];
float *T = new float[N_OPTIONS];
float *r = new float[N_OPTIONS];
float *v = new float[N_OPTIONS];
float *result = new float[N_OPTIONS];
for (int i = 0; i < N_OPTIONS; ++i) { for (int i = 1; i < argc; ++i) {
if (strncmp(argv[i], "--count=", 8) == 0) {
nOptions = atoi(argv[i] + 8);
if (nOptions <= 0) {
usage();
exit(1);
}
}
}
float *S = new float[nOptions];
float *X = new float[nOptions];
float *T = new float[nOptions];
float *r = new float[nOptions];
float *v = new float[nOptions];
float *result = new float[nOptions];
for (int i = 0; i < nOptions; ++i) {
S[i] = 100; // stock price S[i] = 100; // stock price
X[i] = 98; // option strike price X[i] = 98; // option strike price
T[i] = 2; // time (years) T[i] = 2; // time (years)
@@ -104,61 +88,109 @@ int main() {
v[i] = 5; // volatility v[i] = 5; // volatility
} }
double sum;
// //
// Binomial options pricing model, ispc implementation // Binomial options pricing model, ispc implementation
// //
reset_and_start_timer(); double binomial_ispc = 1e30;
binomial_put_ispc(S, X, T, r, v, result, N_OPTIONS); for (int i = 0; i < 3; ++i) {
double binomial_ispc = get_elapsed_mcycles(); reset_and_start_timer();
float sum = 0.f; binomial_put_ispc(S, X, T, r, v, result, nOptions);
for (int i = 0; i < N_OPTIONS; ++i) double dt = get_elapsed_mcycles();
sum += result[i]; sum = 0.;
printf("[binomial ispc]:\t\t[%.3f] million cycles (avg %f)\n", for (int i = 0; i < nOptions; ++i)
binomial_ispc, sum / N_OPTIONS); sum += result[i];
binomial_ispc = std::min(binomial_ispc, dt);
}
printf("[binomial ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n",
binomial_ispc, sum / nOptions);
//
// Binomial options pricing model, ispc implementation, tasks
//
double binomial_tasks = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
binomial_put_ispc_tasks(S, X, T, r, v, result, nOptions);
double dt = get_elapsed_mcycles();
sum = 0.;
for (int i = 0; i < nOptions; ++i)
sum += result[i];
binomial_tasks = std::min(binomial_tasks, dt);
}
printf("[binomial ispc, tasks]:\t\t[%.3f] million cycles (avg %f)\n",
binomial_tasks, sum / nOptions);
// //
// Binomial options, serial implementation // Binomial options, serial implementation
// //
reset_and_start_timer(); double binomial_serial = 1e30;
binomial_put_serial(S, X, T, r, v, result, N_OPTIONS); for (int i = 0; i < 3; ++i) {
double binomial_serial = get_elapsed_mcycles(); reset_and_start_timer();
sum = 0.f; binomial_put_serial(S, X, T, r, v, result, nOptions);
for (int i = 0; i < N_OPTIONS; ++i) double dt = get_elapsed_mcycles();
sum += result[i]; sum = 0.;
printf("[binomial serial]:\t\t[%.3f] million cycles (avg %f)\n", for (int i = 0; i < nOptions; ++i)
binomial_serial, sum / N_OPTIONS);
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", binomial_serial / binomial_ispc);
//
// Black-Scholes options pricing model, ispc implementation
//
sum = 0.f;
reset_and_start_timer();
for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) {
black_scholes_ispc(S, X, T, r, v, result, N_OPTIONS);
for (int i = 0; i < N_OPTIONS; ++i)
sum += result[i]; sum += result[i];
binomial_serial = std::min(binomial_serial, dt);
} }
double bs_ispc = get_elapsed_mcycles(); printf("[binomial serial]:\t\t[%.3f] million cycles (avg %f)\n",
printf("[black-scholes ispc]:\t\t[%.3f] million cycles (avg %f)\n", binomial_serial, sum / nOptions);
bs_ispc, sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS));
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
binomial_serial / binomial_ispc, binomial_serial / binomial_tasks);
//
// Black-Scholes options pricing model, ispc implementation, 1 thread
//
double bs_ispc = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
black_scholes_ispc(S, X, T, r, v, result, nOptions);
double dt = get_elapsed_mcycles();
sum = 0.;
for (int i = 0; i < nOptions; ++i)
sum += result[i];
bs_ispc = std::min(bs_ispc, dt);
}
printf("[black-scholes ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n",
bs_ispc, sum / nOptions);
//
// Black-Scholes options pricing model, ispc implementation, tasks
//
double bs_ispc_tasks = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
black_scholes_ispc_tasks(S, X, T, r, v, result, nOptions);
double dt = get_elapsed_mcycles();
sum = 0.;
for (int i = 0; i < nOptions; ++i)
sum += result[i];
bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
}
printf("[black-scholes ispc, tasks]:\t[%.3f] million cycles (avg %f)\n",
bs_ispc_tasks, sum / nOptions);
// //
// Black-Scholes options pricing model, serial implementation // Black-Scholes options pricing model, serial implementation
// //
sum = 0.f; double bs_serial = 1e30;
reset_and_start_timer(); for (int i = 0; i < 3; ++i) {
for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) { reset_and_start_timer();
black_scholes_serial(S, X, T, r, v, result, N_OPTIONS); black_scholes_serial(S, X, T, r, v, result, nOptions);
for (int i = 0; i < N_OPTIONS; ++i) double dt = get_elapsed_mcycles();
sum = 0.;
for (int i = 0; i < nOptions; ++i)
sum += result[i]; sum += result[i];
bs_serial = std::min(bs_serial, dt);
} }
double bs_serial = get_elapsed_mcycles();
printf("[black-scholes serial]:\t\t[%.3f] million cycles (avg %f)\n", bs_serial, printf("[black-scholes serial]:\t\t[%.3f] million cycles (avg %f)\n", bs_serial,
sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS)); sum / nOptions);
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", bs_serial / bs_ispc); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
bs_serial / bs_ispc, bs_serial / bs_ispc_tasks);
return 0; return 0;
} }

View File

@@ -55,49 +55,100 @@ CND(float X) {
return w; return w;
} }
export void task void
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[], uniform float ra[], uniform float va[],
uniform float result[], uniform int count) { uniform float result[], uniform int count) {
for (uniform int i = 0; i < count; i += programCount) { uniform int first = taskIndex * (count/taskCount);
float S = Sa[i + programIndex], X = Xa[i + programIndex]; uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
float T = Ta[i + programIndex], r = ra[i + programIndex];
float v = va[i + programIndex]; foreach (i = first ... last) {
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T)); float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
float d2 = d1 - v * sqrt(T); float d2 = d1 - v * sqrt(T);
result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2); result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
} }
} }
export void
black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[],
uniform float result[], uniform int count) {
uniform int nTasks = max((int)64, (int)count/16384);
launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
}
export void
black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[],
uniform float result[], uniform int count) {
foreach (i = 0 ... count) {
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
float d2 = d1 - v * sqrt(T);
result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
}
}
static inline float
binomial_put(float S, float X, float T, float r, float v) {
float V[BINOMIAL_NUM];
float dt = T / BINOMIAL_NUM;
float u = exp(v * sqrt(dt));
float d = 1. / u;
float disc = exp(r * dt);
float Pu = (disc - d) / (u - d);
for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
V[j] = max(0., X - S * upow);
}
for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
for (uniform int k = 0; k < j; ++k)
V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
return V[0];
}
export void export void
binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
uniform float ra[], uniform float va[], uniform float ra[], uniform float va[],
uniform float result[], uniform int count) { uniform float result[], uniform int count) {
float V[BINOMIAL_NUM]; foreach (i = 0 ... count) {
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
for (uniform int i = 0; i < count; i += programCount) { result[i] = binomial_put(S, X, T, r, v);
float S = Sa[i + programIndex], X = Xa[i + programIndex];
float T = Ta[i + programIndex], r = ra[i + programIndex];
float v = va[i + programIndex];
float dt = T / BINOMIAL_NUM;
float u = exp(v * sqrt(dt));
float d = 1. / u;
float disc = exp(r * dt);
float Pu = (disc - d) / (u - d);
for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
V[j] = max(0., X - S * upow);
}
for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
for (uniform int k = 0; k < j; ++k)
V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
result[i + programIndex] = V[0];
} }
} }
task void
binomial_task(uniform float Sa[], uniform float Xa[],
uniform float Ta[], uniform float ra[],
uniform float va[], uniform float result[],
uniform int count) {
uniform int first = taskIndex * (count/taskCount);
uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
foreach (i = first ... last) {
float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
result[i] = binomial_put(S, X, T, r, v);
}
}
export void
binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
uniform float Ta[], uniform float ra[],
uniform float va[], uniform float result[],
uniform int count) {
uniform int nTasks = max((int)64, (int)count/16384);
launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
}
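
Editor's note: the reorganization above pulls the per-lane pricing work into a static inline routine with varying parameters (binomial_put), then layers two entry points on top of the same body: a plain export kernel that foreaches over the whole array, and a task-based variant that launches chunked workers sized at roughly 16K options per task. A schematic of that layering with placeholder names (the body of f() here is invented):

    static inline float f(float x) {      // per-lane work, varying in/out
        return x * x + 1;
    }

    export void apply(uniform float in[], uniform float out[], uniform int n) {
        foreach (i = 0 ... n)
            out[i] = f(in[i]);
    }

    task void apply_task(uniform float in[], uniform float out[], uniform int n) {
        // Chunk by task; the last task also picks up any remainder.
        uniform int chunk = n / taskCount;
        uniform int first = taskIndex * chunk;
        uniform int last = (taskIndex == taskCount - 1) ? n : first + chunk;
        foreach (i = first ... last)
            out[i] = f(in[i]);
    }

    export void apply_tasks(uniform float in[], uniform float out[], uniform int n) {
        launch[max(1, n / 16384)] apply_task(in, out, n);
    }
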

View File

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations"> <ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32"> <ProjectConfiguration Include="Debug|Win32">
@@ -64,15 +64,19 @@
<PropertyGroup Label="UserMacros" /> <PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental> <LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental> <LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup> </PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile> <ClCompile>
@@ -81,6 +85,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4305</DisableSpecificWarnings> <DisableSpecificWarnings>4305</DisableSpecificWarnings>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
@@ -97,6 +102,7 @@
<WarningLevel>Level3</WarningLevel> <WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization> <Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4305</DisableSpecificWarnings> <DisableSpecificWarnings>4305</DisableSpecificWarnings>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
@@ -115,6 +121,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4305</DisableSpecificWarnings> <DisableSpecificWarnings>4305</DisableSpecificWarnings>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -134,6 +141,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking> <FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions> <IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4305</DisableSpecificWarnings> <DisableSpecificWarnings>4305</DisableSpecificWarnings>
<FloatingPointModel>Fast</FloatingPointModel> <FloatingPointModel>Fast</FloatingPointModel>
</ClCompile> </ClCompile>
@@ -147,22 +155,23 @@
<ItemGroup> <ItemGroup>
<ClCompile Include="options.cpp" /> <ClCompile Include="options.cpp" />
<ClCompile Include="options_serial.cpp" /> <ClCompile Include="options_serial.cpp" />
<ClCompile Include="../tasksys.cpp" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<CustomBuild Include="options.ispc"> <CustomBuild Include="options.ispc">
<FileType>Document</FileType> <FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command> </Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs> <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild> </CustomBuild>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>

View File

@@ -35,8 +35,6 @@
#define OPTIONS_DEFS_H 1 #define OPTIONS_DEFS_H 1
#define BINOMIAL_NUM 64 #define BINOMIAL_NUM 64
#define N_OPTIONS 65536
#define N_BLACK_SCHOLES_ROUNDS 20
#endif // OPTIONS_DEFS_H #endif // OPTIONS_DEFS_H

View File

@@ -0,0 +1,7 @@
EXAMPLE=perfbench
CPP_SRC=perfbench.cpp perfbench_serial.cpp
ISPC_SRC=perfbench.ispc
ISPC_TARGETS=sse2,sse4,avx
include ../common.mk

View File

@@ -0,0 +1,108 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <stdio.h>
#include <algorithm>
#include "../timing.h"
#include "perfbench_ispc.h"
typedef void (FuncType)(float *, int, float *, float *);
struct PerfTest {
FuncType *aFunc;
const char *aName;
FuncType *bFunc;
const char *bName;
const char *testName;
};
extern void xyzSumAOS(float *a, int count, float *zeros, float *result);
extern void xyzSumSOA(float *a, int count, float *zeros, float *result);
static void
lInitData(float *ptr, int count) {
for (int i = 0; i < count; ++i)
ptr[i] = float(i) / (1024.f * 1024.f);
}
static PerfTest tests[] = {
{ xyzSumAOS, "serial", ispc::xyzSumAOS, "ispc", "AOS vector element sum (with coalescing)" },
{ xyzSumAOS, "serial", ispc::xyzSumAOSStdlib, "ispc", "AOS vector element sum (stdlib swizzle)" },
{ xyzSumAOS, "serial", ispc::xyzSumAOSNoCoalesce, "ispc", "AOS vector element sum (no coalescing)" },
{ xyzSumSOA, "serial", ispc::xyzSumSOA, "ispc", "SOA vector element sum" },
{ ispc::gathers, "gather", ispc::loads, "vector load", "Memory reads" },
{ ispc::scatters, "scatter", ispc::stores, "vector store", "Memory writes" },
};
int main() {
int count = 3*64*1024;
float *a = new float[count];
float zeros[32] = { 0 };
int nTests = sizeof(tests) / sizeof(tests[0]);
for (int i = 0; i < nTests; ++i) {
lInitData(a, count);
reset_and_start_timer();
float resultA[3] = { 0, 0, 0 };
for (int j = 0; j < 100; ++j)
tests[i].aFunc(a, count, zeros, resultA);
double aTime = get_elapsed_mcycles();
lInitData(a, count);
reset_and_start_timer();
float resultB[3] = { 0, 0, 0 };
for (int j = 0; j < 100; ++j)
tests[i].bFunc(a, count, zeros, resultB);
double bTime = get_elapsed_mcycles();
printf("%-40s: [%.2f] M cycles %s, [%.2f] M cycles %s (%.2fx speedup).\n",
tests[i].testName, aTime, tests[i].aName, bTime, tests[i].bName,
aTime/bTime);
#if 0
printf("\t(%f %f %f) - (%f %f %f)\n", resultSerial[0], resultSerial[1],
resultSerial[2], resultISPC[0], resultISPC[1], resultISPC[2]);
#endif
}
return 0;
}

View File

@@ -0,0 +1,170 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
export void xyzSumAOS(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float xsum = 0, ysum = 0, zsum = 0;
foreach (i = 0 ... count/3) {
float x = array[3*i];
float y = array[3*i+1];
float z = array[3*i+2];
xsum += x;
ysum += y;
zsum += z;
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void xyzSumAOSStdlib(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float xsum = 0, ysum = 0, zsum = 0;
for (uniform int i = 0; i < 64*1024 /*count/3*/; i += programCount) {
float x, y, z;
aos_to_soa3(&array[3*i], &x, &y, &z);
xsum += x;
ysum += y;
zsum += z;
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void xyzSumAOSNoCoalesce(uniform float array[], uniform int count,
uniform float zerosArray[], uniform float result[]) {
int zeros = zerosArray[programIndex];
float xsum = 0, ysum = 0, zsum = 0;
foreach (i = 0 ... count/3) {
float x = array[3*i+zeros];
float y = array[3*i+1+zeros];
float z = array[3*i+2+zeros];
xsum += x;
ysum += y;
zsum += z;
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void xyzSumSOA(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float xsum = 0, ysum = 0, zsum = 0;
uniform float * uniform ap = array;
assert(programCount <= 8);
for (uniform int i = 0; i < count/3; i += 8, ap += 24) {
for (uniform int j = 0; j < 8; j += programCount) {
float x = ap[j + programIndex];
float y = ap[8 + j + programIndex];
float z = ap[16 + j + programIndex];
xsum += x;
ysum += y;
zsum += z;
}
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void gathers(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float sum = 0;
int zero = zeros[programIndex];
foreach (i = 0 ... count)
sum += array[i + zero];
result[0] = reduce_add(sum);
}
export void loads(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float sum = 0;
foreach (i = 0 ... count)
sum += array[i];
result[0] = reduce_add(sum);
}
export void scatters(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
int zero = zeros[programIndex];
foreach (i = 0 ... count)
array[i + zero] = zero;
}
export void stores(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
int zero = zeros[programIndex];
foreach (i = 0 ... count)
array[i] = zero;
}
export void normalizeAOSNoCoalesce(uniform float array[], uniform int count,
uniform float zeroArray[]) {
int zeros = zeroArray[programIndex];
foreach (i = 0 ... count/3) {
float x = array[3*i+zeros];
float y = array[3*i+1+zeros];
float z = array[3*i+2+zeros];
float l2 = x*x + y*y + z*z;
array[3*i] /= l2;
array[3*i+1] /= l2;
array[3*i+2] /= l2;
}
}
export void normalizeSOA(uniform float array[], uniform int count,
uniform float zeros[]) {
foreach (i = 0 ... count/3) {
float x = array[3*i];
float y = array[3*i+1];
float z = array[3*i+2];
float l2 = x*x + y*y + z*z;
array[3*i] /= l2;
array[3*i+1] /= l2;
array[3*i+2] /= l2;
}
}

View File

@@ -0,0 +1,175 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{d923bb7e-a7c8-4850-8fcf-0eb9ce35b4e8}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>perfbench</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="perfbench.cpp" />
<ClCompile Include="perfbench_serial.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="perfbench.ispc">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

View File

@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010-2011, Intel Corporation
+Copyright (c) 2012, Intel Corporation
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
@@ -31,36 +31,31 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-#ifndef ISPC_CPUID_H
-#define ISPC_CPUID_H 1
-#ifdef _MSC_VER
-// Provides a __cpuid() function with same signature as below
-#include <intrin.h>
-#else
-static void __cpuid(int info[4], int infoType) {
-__asm__ __volatile__ ("cpuid"
-: "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
-: "0" (infoType));
-}
-#endif
-inline bool CPUSupportsSSE2() {
-int info[4];
-__cpuid(info, 1);
-return (info[3] & (1 << 26)) != 0;
+#include <math.h>
+void
+xyzSumAOS(float *a, int count, float *zeros, float *result) {
+float xsum = 0, ysum = 0, zsum = 0;
+for (int i = 0; i < count; i += 3) {
+xsum += a[i];
+ysum += a[i+1];
+zsum += a[i+2];
+}
+result[0] = xsum;
+result[1] = ysum;
+result[2] = zsum;
 }
-inline bool CPUSupportsSSE4() {
-int info[4];
-__cpuid(info, 1);
-return (info[2] & (1 << 19)) != 0;
+void
+xyzSumSOA(float *a, int count, float *zeros, float *result) {
+float xsum = 0, ysum = 0, zsum = 0;
+for (int i = 0; i < count/3; ++i) {
+float *p = a + (i >> 3) * 24 + (i & 7);
+xsum += p[0];
+ysum += p[8];
+zsum += p[16];
+}
+result[0] = xsum;
+result[1] = ysum;
+result[2] = zsum;
 }
-inline bool CPUSupportsAVX() {
-int info[4];
-__cpuid(info, 1);
-return (info[2] & (1 << 28)) != 0;
-}
-#endif // ISPC_CPUID_H

View File

@@ -1,35 +1,7 @@
-ARCH = $(shell uname)
-TASK_CXX=../tasksys.cpp
-TASK_LIB=-lpthread
-TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
-CXX=g++
-CXXFLAGS=-Iobjs/ -O3 -Wall -m64
-ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
-default: rt
-.PHONY: dirs clean
-dirs:
-/bin/mkdir -p objs/
-clean:
-/bin/rm -rf objs *~ rt
-rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o $(TASK_OBJ)
-$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
-objs/%.o: %.cpp
-$(CXX) $< $(CXXFLAGS) -c -o $@
-objs/%.o: ../%.cpp
-$(CXX) $< $(CXXFLAGS) -c -o $@
-objs/rt.o: objs/rt_ispc.h
-objs/%_ispc.h objs/%_ispc.o: %.ispc
-$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+EXAMPLE=rt
+CPP_SRC=rt.cpp rt_serial.cpp
+ISPC_SRC=rt.ispc
+ISPC_TARGETS=sse2,sse4-x2,avx
+include ../common.mk

View File

@@ -45,7 +45,6 @@
 #include <string.h>
 #include <sys/types.h>
 #include "../timing.h"
-#include "../cpuid.h"
 #include "rt_ispc.h"
 using namespace ispc;
@@ -96,38 +95,6 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
 }
-// Make sure that the vector ISA used during compilation is supported by
-// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
-// header file that we include above.
-static void
-ensureTargetISAIsSupported() {
-#if defined(ISPC_TARGET_SSE2)
-bool isaSupported = CPUSupportsSSE2();
-const char *target = "SSE2";
-#elif defined(ISPC_TARGET_SSE4)
-bool isaSupported = CPUSupportsSSE4();
-const char *target = "SSE4";
-#elif defined(ISPC_TARGET_AVX)
-bool isaSupported = CPUSupportsAVX();
-const char *target = "AVX";
-#else
-#error "Unknown ISPC_TARGET_* value"
-#endif
-if (!isaSupported) {
-fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
-"set, which isn't\n*** supported by this computer's CPU!\n", target);
-fprintf(stderr, "***\n*** Please modify the "
-#ifdef _MSC_VER
-"MSVC project file "
-#else
-"Makefile "
-#endif
-"to select another target (e.g. sse2)\n***\n");
-exit(1);
-}
-}
 static void usage() {
 fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
 exit(1);
@@ -151,8 +118,6 @@ int main(int argc, char *argv[]) {
 if (filename == NULL)
 usage();
-ensureTargetISAIsSupported();
 #define READ(var, n) \
 if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \
 fprintf(stderr, "Unexpected EOF reading scene file\n"); \
@@ -203,12 +168,12 @@ int main(int argc, char *argv[]) {
 // of node, the total number of int it if a leaf node, etc.
 float b[6];
 READ(b[0], 6);
-nodes[i].bounds[0].v[0] = b[0];
-nodes[i].bounds[0].v[1] = b[1];
-nodes[i].bounds[0].v[2] = b[2];
-nodes[i].bounds[1].v[0] = b[3];
-nodes[i].bounds[1].v[1] = b[4];
-nodes[i].bounds[1].v[2] = b[5];
+nodes[i].bounds[0][0] = b[0];
+nodes[i].bounds[0][1] = b[1];
+nodes[i].bounds[0][2] = b[2];
+nodes[i].bounds[1][0] = b[3];
+nodes[i].bounds[1][1] = b[4];
+nodes[i].bounds[1][2] = b[5];
 READ(nodes[i].offset, 1);
 READ(nodes[i].nPrimitives, 1);
 READ(nodes[i].splitAxis, 1);
@@ -225,19 +190,17 @@ int main(int argc, char *argv[]) {
 READ(v[0], 9);
 float *vp = v;
 for (int j = 0; j < 3; ++j) {
-triangles[i].p[j].v[0] = *vp++;
-triangles[i].p[j].v[1] = *vp++;
-triangles[i].p[j].v[2] = *vp++;
+triangles[i].p[j][0] = *vp++;
+triangles[i].p[j][1] = *vp++;
+triangles[i].p[j][2] = *vp++;
 }
 // And create an object id
 triangles[i].id = i+1;
 }
 fclose(f);
-// round image resolution up to multiple of 16 to make things easy for
-// the code that assigns pixels to ispc program instances
-int height = (int(baseHeight * scale) + 0xf) & ~0xf;
-int width = (int(baseWidth * scale) + 0xf) & ~0xf;
+int height = int(baseHeight * scale);
+int width = int(baseWidth * scale);
 // allocate images; one to hold hit object ids, one to hold depth to
 // the first interseciton

View File

@@ -43,16 +43,17 @@ struct Ray {
 };
 struct Triangle {
-uniform float3 p[3];
-uniform int id;
+float p[3][4];
+int id;
+int pad[3];
 };
 struct LinearBVHNode {
-uniform float3 bounds[2];
-uniform unsigned int offset; // num primitives for leaf, second child for interior
-uniform unsigned int8 nPrimitives;
-uniform unsigned int8 splitAxis;
-uniform unsigned int16 pad;
+float bounds[2][3];
+unsigned int offset; // num primitives for leaf, second child for interior
+unsigned int8 nPrimitives;
+unsigned int8 splitAxis;
+unsigned int16 pad;
 };
 static inline float3 Cross(const float3 v1, const float3 v2) {
@@ -72,7 +73,7 @@ static inline float Dot(const float3 a, const float3 b) {
 static void generateRay(uniform const float raster2camera[4][4],
 uniform const float camera2world[4][4],
-float x, float y, reference Ray ray) {
+float x, float y, Ray &ray) {
 ray.mint = 0.f;
 ray.maxt = 1e30f;
@@ -87,9 +88,12 @@ static void generateRay(uniform const float raster2camera[4][4],
 camy /= camw;
 camz /= camw;
-ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
-ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
-ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
+ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
+camera2world[0][2] * camz;
+ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
+camera2world[1][2] * camz;
+ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
+camera2world[2][2] * camz;
 ray.origin.x = camera2world[0][3] / camera2world[3][3];
 ray.origin.y = camera2world[1][3] / camera2world[3][3];
@@ -103,14 +107,16 @@ static void generateRay(uniform const float raster2camera[4][4],
 }
-static inline bool BBoxIntersect(const reference uniform float3 bounds[2],
-const reference Ray ray) {
+static bool BBoxIntersect(const uniform float bounds[2][3],
+const Ray &ray) {
+uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
+uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
 float t0 = ray.mint, t1 = ray.maxt;
 // Check all three axis-aligned slabs. Don't try to early out; it's
 // not worth the trouble
-float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
-float3 tFar = (bounds[1] - ray.origin) * ray.invDir;
+float3 tNear = (bounds0 - ray.origin) * ray.invDir;
+float3 tFar = (bounds1 - ray.origin) * ray.invDir;
 if (tNear.x > tFar.x) {
 float tmp = tNear.x;
 tNear.x = tFar.x;
@@ -140,9 +146,12 @@ static inline bool BBoxIntersect(const reference uniform float3 bounds[2],
-static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) {
-uniform float3 e1 = tri.p[1] - tri.p[0];
-uniform float3 e2 = tri.p[2] - tri.p[0];
+static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
+uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
+uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
+uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
+uniform float3 e1 = p1 - p0;
+uniform float3 e2 = p2 - p0;
 float3 s1 = Cross(ray.dir, e2);
 float divisor = Dot(s1, e1);
@@ -153,7 +162,7 @@ static inline bool TriIntersect(const reference Triangle tri, reference Ray ray)
 float invDivisor = 1.f / divisor;
 // Compute first barycentric coordinate
-float3 d = ray.origin - tri.p[0];
+float3 d = ray.origin - p0;
 float b1 = Dot(d, s1) * invDivisor;
 if (b1 < 0. || b1 > 1.)
 hit = false;
@@ -177,8 +186,8 @@ static inline bool TriIntersect(const reference Triangle tri, reference Ray ray)
 }
-bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
-reference Ray r) {
+bool BVHIntersect(const uniform LinearBVHNode nodes[],
+const uniform Triangle tris[], Ray &r) {
 Ray ray = r;
 bool hit = false;
 // Follow ray through BVH nodes to find primitive intersections
@@ -187,7 +196,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
 while (true) {
 // Check ray against BVH node
-LinearBVHNode node = nodes[nodeNum];
+uniform LinearBVHNode node = nodes[nodeNum];
 if (any(BBoxIntersect(node.bounds, ray))) {
 uniform unsigned int nPrimitives = node.nPrimitives;
 if (nPrimitives > 0) {
@@ -233,39 +242,20 @@ static void raytrace_tile(uniform int x0, uniform int x1,
 const uniform float raster2camera[4][4],
 const uniform float camera2world[4][4],
 uniform float image[], uniform int id[],
-const LinearBVHNode nodes[],
-const Triangle triangles[]) {
+const uniform LinearBVHNode nodes[],
+const uniform Triangle triangles[]) {
 uniform float widthScale = (float)(baseWidth) / (float)(width);
 uniform float heightScale = (float)(baseHeight) / (float)(height);
-static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
-0, 1, 0, 1, 2, 3, 2, 3 };
-static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
-2, 2, 3, 3, 2, 2, 3, 3 };
-// The outer loops are always over blocks of 4x4 pixels
-for (uniform int y = y0; y < y1; y += 4) {
-for (uniform int x = x0; x < x1; x += 4) {
-// Now we have a block of 4x4=16 pixels to process; it will
-// take 16/programCount iterations of this loop to process
-// them.
-for (uniform int o = 0; o < 16 / programCount; ++o) {
-// Map program instances to samples in the udx/udy arrays
-// to figure out which pixel each program instance is
-// responsible for
-const float dx = udx[o * programCount + programIndex];
-const float dy = udy[o * programCount + programIndex];
-Ray ray;
-generateRay(raster2camera, camera2world, (x+dx)*widthScale,
-(y+dy)*heightScale, ray);
-BVHIntersect(nodes, triangles, ray);
-int offset = (y + (int)dy) * width + (x + (int)dx);
-image[offset] = ray.maxt;
-id[offset] = ray.hitId;
-}
-}
+foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
+Ray ray;
+generateRay(raster2camera, camera2world, x*widthScale,
+y*heightScale, ray);
+BVHIntersect(nodes, triangles, ray);
+int offset = y * width + x;
+image[offset] = ray.maxt;
+id[offset] = ray.hitId;
 }
 }
@@ -275,27 +265,27 @@ export void raytrace_ispc(uniform int width, uniform int height,
 const uniform float raster2camera[4][4],
 const uniform float camera2world[4][4],
 uniform float image[], uniform int id[],
-const LinearBVHNode nodes[],
-const Triangle triangles[]) {
+const uniform LinearBVHNode nodes[],
+const uniform Triangle triangles[]) {
 raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
 raster2camera, camera2world, image,
 id, nodes, triangles);
 }
-task void raytrace_tile_task(uniform int y0, uniform int y1,
-uniform int width, uniform int height,
+task void raytrace_tile_task(uniform int width, uniform int height,
 uniform int baseWidth, uniform int baseHeight,
 const uniform float raster2camera[4][4],
 const uniform float camera2world[4][4],
 uniform float image[], uniform int id[],
-const LinearBVHNode nodes[],
-const Triangle triangles[]) {
-uniform int dx = 16; // must match dx below
-uniform int xTasks = (width + (dx-1)) / dx;
-uniform int x0 = (taskIndex % xTasks) * dx;
-uniform int x1 = x0 + dx;
-x1 = min(x1, width);
+const uniform LinearBVHNode nodes[],
+const uniform Triangle triangles[]) {
+uniform int dx = 16, dy = 16; // must match dx, dy below
+uniform int xBuckets = (width + (dx-1)) / dx;
+uniform int x0 = (taskIndex % xBuckets) * dx;
+uniform int x1 = min(x0 + dx, width);
+uniform int y0 = (taskIndex / xBuckets) * dy;
+uniform int y1 = min(y0 + dy, height);
 raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
 raster2camera, camera2world, image,
@@ -308,14 +298,14 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
 const uniform float raster2camera[4][4],
 const uniform float camera2world[4][4],
 uniform float image[], uniform int id[],
-const LinearBVHNode nodes[],
-const Triangle triangles[]) {
+const uniform LinearBVHNode nodes[],
+const uniform Triangle triangles[]) {
 uniform int dx = 16, dy = 16;
-uniform int nTasks = (width + (dx-1)) / dx;
-for (uniform int y = 0; y < height; y += dy) {
-uniform int y1 = min(y + dy, height);
-launch[nTasks] < raytrace_tile_task(y, y1, width, height, baseWidth,
-baseHeight, raster2camera, camera2world,
-image, id, nodes, triangles) >;
-}
+uniform int xBuckets = (width + (dx-1)) / dx;
+uniform int yBuckets = (height + (dy-1)) / dy;
+uniform int nTasks = xBuckets * yBuckets;
+launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
+raster2camera, camera2world,
+image, id, nodes, triangles);
 }

View File

@@ -64,15 +64,19 @@
 <PropertyGroup Label="UserMacros" />
 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 <LinkIncremental>true</LinkIncremental>
+<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
 </PropertyGroup>
 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
 <LinkIncremental>true</LinkIncremental>
+<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
 </PropertyGroup>
 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
 <LinkIncremental>false</LinkIncremental>
+<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
 </PropertyGroup>
 <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
 <LinkIncremental>false</LinkIncremental>
+<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
 </PropertyGroup>
 <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 <ClCompile>
@@ -81,6 +85,7 @@
 <WarningLevel>Level3</WarningLevel>
 <Optimization>Disabled</Optimization>
 <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
 <IntrinsicFunctions>true</IntrinsicFunctions>
 <FloatingPointModel>Fast</FloatingPointModel>
 </ClCompile>
@@ -96,6 +101,7 @@
 <WarningLevel>Level3</WarningLevel>
 <Optimization>Disabled</Optimization>
 <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
 <IntrinsicFunctions>true</IntrinsicFunctions>
 <FloatingPointModel>Fast</FloatingPointModel>
 </ClCompile>
@@ -113,6 +119,7 @@
 <FunctionLevelLinking>true</FunctionLevelLinking>
 <IntrinsicFunctions>true</IntrinsicFunctions>
 <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
 <FloatingPointModel>Fast</FloatingPointModel>
 </ClCompile>
 <Link>
@@ -131,6 +138,7 @@
 <FunctionLevelLinking>true</FunctionLevelLinking>
 <IntrinsicFunctions>true</IntrinsicFunctions>
 <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
 <FloatingPointModel>Fast</FloatingPointModel>
 </ClCompile>
 <Link>
@@ -144,21 +152,21 @@
 <CustomBuild Include="rt.ispc">
 <FileType>Document</FileType>
 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
 </Command>
 <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
 </Command>
-<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
-<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
 <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
 </Command>
 <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
 </Command>
-<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
-<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
 </CustomBuild>
 </ItemGroup>
 <ItemGroup>

View File

@@ -75,12 +75,13 @@ struct Ray {
 // Declare these in a namespace so the mangling matches
 namespace ispc {
 struct Triangle {
-float3 p[3];
+float p[3][4]; // extra float pad after each vertex
 int32_t id;
+int32_t pad[3]; // make 16 x 32-bits
 };
 struct LinearBVHNode {
-float3 bounds[2];
+float bounds[2][3];
 int32_t offset; // primitives for leaf, second child for interior
 uint8_t nPrimitives;
 uint8_t splitAxis;
@@ -122,9 +123,12 @@ static void generateRay(const float raster2camera[4][4],
 camy /= camw;
 camz /= camw;
-ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
-ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
-ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
+ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
+camera2world[0][2] * camz;
+ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
+camera2world[1][2] * camz;
+ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
+camera2world[2][2] * camz;
 ray.origin.x = camera2world[0][3] / camera2world[3][3];
 ray.origin.y = camera2world[1][3] / camera2world[3][3];
@@ -140,12 +144,14 @@ static void generateRay(const float raster2camera[4][4],
 }
-static inline bool BBoxIntersect(const float3 bounds[2],
+static inline bool BBoxIntersect(const float bounds[2][3],
 const Ray &ray) {
+float3 bounds0(bounds[0][0], bounds[0][1], bounds[0][2]);
+float3 bounds1(bounds[1][0], bounds[1][1], bounds[1][2]);
 float t0 = ray.mint, t1 = ray.maxt;
-float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
-float3 tFar = (bounds[1] - ray.origin) * ray.invDir;
+float3 tNear = (bounds0 - ray.origin) * ray.invDir;
+float3 tFar = (bounds1 - ray.origin) * ray.invDir;
 if (tNear.x > tFar.x) {
 float tmp = tNear.x;
 tNear.x = tFar.x;
@@ -176,8 +182,11 @@ static inline bool BBoxIntersect(const float3 bounds[2],
 inline bool TriIntersect(const Triangle &tri, Ray &ray) {
-float3 e1 = tri.p[1] - tri.p[0];
-float3 e2 = tri.p[2] - tri.p[0];
+float3 p0(tri.p[0][0], tri.p[0][1], tri.p[0][2]);
+float3 p1(tri.p[1][0], tri.p[1][1], tri.p[1][2]);
+float3 p2(tri.p[2][0], tri.p[2][1], tri.p[2][2]);
+float3 e1 = p1 - p0;
+float3 e2 = p2 - p0;
 float3 s1 = Cross(ray.dir, e2);
 float divisor = Dot(s1, e1);
@@ -187,7 +196,7 @@ inline bool TriIntersect(const Triangle &tri, Ray &ray) {
 float invDivisor = 1.f / divisor;
 // Compute first barycentric coordinate
-float3 d = ray.origin - tri.p[0];
+float3 d = ray.origin - p0;
 float b1 = Dot(d, s1) * invDivisor;
 if (b1 < 0. || b1 > 1.)
 return false;

View File

@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --arch=x86-64
+ISPCFLAGS=-O2 --arch=x86-64 --target=sse2
 default: simple

Some files were not shown because too many files have changed in this diff.