Release notes, bump doxygen version for 1.2.0 release

Add __foreach_active statement to loop over active prog. instances.
For now this has the __ prefix, as an experimental feature currently only used in the standard library implementation. It's probably worth making something along these lines an official feature, but I'm not sure if this in its current form is quite the right thing.
2012-03-20 11:56:08 -07:00 · 2012-03-20 08:46:00 -07:00 · 2012-03-20 05:55:09 -07:00 · 2012-03-20 05:54:29 -07:00 · 2012-03-20 09:44:49 +08:00 · 2012-03-20 09:27:57 +08:00
468 changed files with 32481 additions and 7997 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,11 @@ ispc
 ispc_test
 objs
 docs/doxygen
-docs/ispc.html
+docs/*.html
+tests*/*cpp
+tests*/*run
+examples/*/*.png
+examples/*/*.ppm
+examples/*/objs/*
+
+
--- a/131
+++ b/131
@@ -3,22 +3,50 @@
 #

 ARCH_OS = $(shell uname)
+ifeq ($(ARCH_OS), Darwin)
+	ARCH_OS2 = "OSX"
+else
+	ARCH_OS2 = $(shell uname -o)
+endif
 ARCH_TYPE = $(shell arch)

+ifeq ($(shell llvm-config --version), 3.1svn)
+  LLVM_LIBS=-lLLVMAsmParser -lLLVMInstrumentation -lLLVMLinker			\
+	-lLLVMArchive -lLLVMBitReader -lLLVMDebugInfo -lLLVMJIT -lLLVMipo	\
+	-lLLVMBitWriter -lLLVMTableGen -lLLVMCBackendInfo			\
+	-lLLVMX86Disassembler -lLLVMX86CodeGen -lLLVMSelectionDAG		\
+	-lLLVMAsmPrinter -lLLVMX86AsmParser -lLLVMX86Desc -lLLVMX86Info		\
+	-lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCDisassembler	-lLLVMMCParser	\
+	-lLLVMCodeGen -lLLVMScalarOpts	-lLLVMInstCombine -lLLVMTransformUtils	\
+	-lLLVMipa -lLLVMAnalysis -lLLVMMCJIT -lLLVMRuntimeDyld			\
+	-lLLVMExecutionEngine -lLLVMTarget -lLLVMMC -lLLVMObject -lLLVMCore 	\
+	-lLLVMSupport
+else
+  LLVM_LIBS=$(shell llvm-config --libs)
+endif
+
 CLANG=clang
 CLANG_LIBS = -lclangFrontend -lclangDriver \
             -lclangSerialization -lclangParse -lclangSema \
             -lclangAnalysis -lclangAST -lclangLex -lclangBasic
+ifeq ($(shell llvm-config --version), 3.1svn)
+  CLANG_LIBS += -lclangEdit
+endif

-ISPC_LIBS=$(CLANG_LIBS) \
-	$(shell llvm-config --ldflags --libs) \
-	-lpthread -ldl
-ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
-	-lpthread -ldl
+ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
+	-lpthread
+
+ifeq ($(ARCH_OS),Linux)
+	ISPC_LIBS += -ldl
+endif
+
+ifeq ($(ARCH_OS2),Msys)
+	ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
+endif

 LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
-LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
-LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)
+LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
+LLVM_VERSION_DEF=-D$(LLVM_VERSION)

 BUILD_DATE=$(shell date +%Y%m%d)
 BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
@@ -33,11 +61,7 @@ LDFLAGS=
 ifeq ($(ARCH_OS),Linux)
  # try to link everything statically under Linux (including libstdc++) so
  # that the binaries we generate will be portable across distributions...
-  ifeq ($(ARCH_TYPE),x86_64)
-    LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
-  else
-    LDFLAGS=-L/usr/lib/gcc/i686-redhat-linux/4.6.0
-  endif
+    LDFLAGS=-static
 endif

 LEX=flex
@@ -45,21 +69,25 @@ YACC=bison -d -v -t

 ###########################################################################

-CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
-	llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
-	util.cpp
+CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
+	ispc.cpp llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp \
+	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \
-	builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
+TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
+	generic-16 generic-1
+BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
+	builtins/dispatch.ll
+# Object files (not sources), so they are compiled with CXXFLAGS by the objs/%.o: objs/%.cpp rule.
+BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) builtins-c-32.o builtins-c-64.o
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll

-OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
-	builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
-	$(FLEX_SRC:.ll=.o))
+OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
+	stdlib_generic_ispc.o stdlib_x86_ispc.o \
+	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))

-default: ispc ispc_test
+default: ispc

 .PHONY: dirs clean depend doxygen print_llvm_src
 .PRECIOUS: objs/builtins-%.cpp
@@ -78,7 +106,7 @@ print_llvm_src:
 	@echo Using LLVM `llvm-config --version` from `llvm-config --libdir`

 clean:
-	/bin/rm -rf objs ispc ispc_test
+	/bin/rm -rf objs ispc

 doxygen:
 	/bin/rm -rf docs/doxygen
@@ -88,14 +116,18 @@ ispc: print_llvm_src dirs $(OBJS)
 	@echo Creating ispc executable
 	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)

-ispc_test: dirs ispc_test.cpp
-	@echo Creating ispc_test executable
-	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)
-
 objs/%.o: %.cpp
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

+objs/cbackend.o: cbackend.cpp
+	@echo Compiling $<
+	@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
+
+objs/%.o: objs/%.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
 objs/parse.cc: parse.yy
 	@echo Running bison on $<
 	@$(YACC) -o $@ $<
@@ -112,41 +144,24 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

-objs/builtins-%.cpp: builtins-%.ll
-	@echo Creating C++ source from builtin definitions file $<
-	@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
-
-objs/builtins-%.o: objs/builtins-%.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-
-objs/builtins-c-32.cpp: builtins-c.c
+objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
 	@echo Creating C++ source from builtins definition file $<
-	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@
+	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | python bitcode2cpp.py $< > $@

-objs/builtins-c-32.o: objs/builtins-c-32.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-
-objs/builtins-c-64.cpp: builtins-c.c
+objs/builtins-c-32.cpp: builtins/builtins.c
 	@echo Creating C++ source from builtins definition file $<
-	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
+	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-32 > $@

-objs/builtins-c-64.o: objs/builtins-c-64.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+objs/builtins-c-64.cpp: builtins/builtins.c
+	@echo Creating C++ source from builtins definition file $<
+	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-64 > $@

-objs/stdlib_ispc.cpp: stdlib.ispc
-	@echo Creating C++ source from $<
-	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@
+objs/stdlib_generic_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $< for generic
+	@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+		python stdlib2cpp.py generic > $@

-objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-
-objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll
-objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll
-objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll
-objs/builtins-sse4-x2.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4-x2.ll
-objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll
-objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll
+objs/stdlib_x86_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $< for x86
+	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
+		python stdlib2cpp.py x86 > $@
--- a/README.rst
+++ b/README.rst
@@ -0,0 +1,90 @@
+==============================
+Intel(r) SPMD Program Compiler
+==============================
+
+``ispc`` is a compiler for a variant of the C programming language, with
+extensions for `single program, multiple data
+<http://en.wikipedia.org/wiki/SPMD>`_ programming.  Under the SPMD model,
+the programmer writes a program that generally appears to be a regular
+serial program, though the execution model is actually that a number of
+*program instances* execute in parallel on the hardware.
+
+Overview
+--------
+
+``ispc`` compiles a C-based SPMD programming language to run on the SIMD
+units of CPUs; it frequently provides a 3x or more speedup on CPUs with
+4-wide vector SSE units and 5x-6x on CPUs with 8-wide AVX vector units,
+without any of the difficulty of writing intrinsics code.  Parallelization
+across multiple cores is also supported by ``ispc``, making it
+possible to write programs that achieve performance improvement that scales
+by both number of cores and vector unit size.
+
+There are a few key principles in the design of ``ispc``:
+
+  * To build a small set of extensions to the C language that
+    would deliver excellent performance to performance-oriented
+    programmers who want to run SPMD programs on the CPU.
+
+  * To provide a thin abstraction layer between the programmer
+    and the hardware--in particular, to have an execution and
+    data model where the programmer can cleanly reason about the
+    mapping of their source program to compiled assembly language
+    and the underlying hardware.
+
+  * To make it possible to harness the computational power of SIMD
+    vector units without the extremely low-programmer-productivity
+    activity of directly writing intrinsics.
+
+  * To explore opportunities from close coupling between C/C++
+    application code and SPMD ``ispc`` code running on the
+    same processor--to have lightweight function calls between
+    the two languages and to share data directly via pointers without
+    copying or reformatting.
+
+``ispc`` is an open source compiler with the BSD license.  It uses the
+remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
+code generation and optimization and is `hosted on
+github <http://github.com/ispc/ispc/>`_. It supports Windows, Mac, and
+Linux, with both x86 and x86-64 targets.  It currently supports the SSE2,
+SSE4, AVX1, and AVX2 instruction sets.
+
+Features
+--------
+
+``ispc`` provides a number of key features to developers:
+
+  * Familiarity as an extension of the C programming
+    language: ``ispc`` supports familiar C syntax and
+    programming idioms, while adding the ability to write SPMD
+    programs.
+
+  * High-quality SIMD code generation: the performance
+    of code generated by ``ispc`` is often close to that of
+    hand-written intrinsics code.
+
+  * Ease of adoption with existing software
+    systems: functions written in ``ispc`` directly
+    interoperate with application functions written in C/C++ and
+    with application data structures.
+            
+  * Portability across over a decade of CPU
+    generations: ``ispc`` has targets for SSE2, SSE4, AVX1,
+    and AVX2.
+
+  * Portability across operating systems: Microsoft
+    Windows, Mac OS X, and Linux are all supported
+    by ``ispc``.
+
+  * Debugging with standard tools: ``ispc``
+    programs can be debugged with standard debuggers (OS X and
+    Linux only).
+
+Additional Resources
+--------------------
+
+Prebuilt ``ispc`` binaries for Windows, OS X and Linux can be downloaded
+from the `ispc downloads page <http://ispc.github.com/downloads.html>`_.
+See also additional
+`documentation <http://ispc.github.com/documentation.html>`_ and additional
+`performance information <http://ispc.github.com/perf.html>`_.
--- a/README.txt
+++ b/README.txt
@@ -1,22 +0,0 @@
-==============================
-Intel(r) SPMD Program Compiler
-==============================
-
-Welcome to the Intel(r) SPMD Program Compiler (ispc)!  
-
-ispc is a new compiler for "single program, multiple data" (SPMD)
-programs. Under the SPMD model, the programmer writes a program that mostly
-appears to be a regular serial program, though the execution model is
-actually that a number of program instances execute in parallel on the
-hardware. ispc compiles a C-based SPMD programming language to run on the
-SIMD units of CPUs; it frequently provides a a 3x or more speedup on CPUs
-with 4-wide SSE units, without any of the difficulty of writing intrinsics
-code.
-
-ispc is an open source compiler under the BSD license; see the file
-LICENSE.txt.  ispc supports Windows, Mac, and Linux, with both x86 and
-x86-64 targets.  It currently supports the SSE2, SSE4, and AVX instruction
-sets.
-
-For more information and examples, as well as a wiki and the bug database,
-see the ispc distribution site, http://ispc.github.com.
--- a/ast.cpp
+++ b/ast.cpp
@@ -36,8 +36,11 @@
 */

 #include "ast.h"
+#include "expr.h"
 #include "func.h"
+#include "stmt.h"
 #include "sym.h"
+#include "util.h"

 ///////////////////////////////////////////////////////////////////////////
 // ASTNode
@@ -63,3 +66,365 @@ AST::GenerateIR() {
        functions[i]->GenerateIR();
 }

+///////////////////////////////////////////////////////////////////////////
+
+ASTNode *
+WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
+        void *data) {
+    if (node == NULL)
+        return node;
+
+    // Call the pre-order callback, if one was provided
+    if (preFunc != NULL) {
+        if (preFunc(node, data) == false)
+            // The function asked us to not continue recursively, so stop.
+            return node;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Handle Statements
+    if (dynamic_cast<Stmt *>(node) != NULL) {
+        ExprStmt *es;
+        DeclStmt *ds;
+        IfStmt *is;
+        DoStmt *dos;
+        ForStmt *fs;
+        ForeachStmt *fes;
+        CaseStmt *cs;
+        DefaultStmt *defs;
+        SwitchStmt *ss;
+        ReturnStmt *rs;
+        LabeledStmt *ls;
+        StmtList *sl;
+        PrintStmt *ps;
+        AssertStmt *as;
+        DeleteStmt *dels;
+
+        if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
+            es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
+        else if ((ds = dynamic_cast<DeclStmt *>(node)) != NULL) {
+            for (unsigned int i = 0; i < ds->vars.size(); ++i)
+                ds->vars[i].init = (Expr *)WalkAST(ds->vars[i].init, preFunc, 
+                                                   postFunc, data);
+        }
+        else if ((is = dynamic_cast<IfStmt *>(node)) != NULL) {
+            is->test = (Expr *)WalkAST(is->test, preFunc, postFunc, data);
+            is->trueStmts = (Stmt *)WalkAST(is->trueStmts, preFunc, 
+                                            postFunc, data);
+            is->falseStmts = (Stmt *)WalkAST(is->falseStmts, preFunc, 
+                                             postFunc, data);
+        }
+        else if ((dos = dynamic_cast<DoStmt *>(node)) != NULL) {
+            dos->testExpr = (Expr *)WalkAST(dos->testExpr, preFunc, 
+                                            postFunc, data);
+            dos->bodyStmts = (Stmt *)WalkAST(dos->bodyStmts, preFunc, 
+                                             postFunc, data);
+        }
+        else if ((fs = dynamic_cast<ForStmt *>(node)) != NULL) {
+            fs->init = (Stmt *)WalkAST(fs->init, preFunc, postFunc, data);
+            fs->test = (Expr *)WalkAST(fs->test, preFunc, postFunc, data);
+            fs->step = (Stmt *)WalkAST(fs->step, preFunc, postFunc, data);
+            fs->stmts = (Stmt *)WalkAST(fs->stmts, preFunc, postFunc, data);
+        }
+        else if ((fes = dynamic_cast<ForeachStmt *>(node)) != NULL) {
+            for (unsigned int i = 0; i < fes->startExprs.size(); ++i)
+                fes->startExprs[i] = (Expr *)WalkAST(fes->startExprs[i], preFunc, 
+                                                     postFunc, data);
+            for (unsigned int i = 0; i < fes->endExprs.size(); ++i)
+                fes->endExprs[i] = (Expr *)WalkAST(fes->endExprs[i], preFunc, 
+                                                   postFunc, data);
+            fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
+        }
+        else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
+            cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
+        else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
+            defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
+        else if ((ss = dynamic_cast<SwitchStmt *>(node)) != NULL) {
+            ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
+            ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
+        }
+        else if (dynamic_cast<BreakStmt *>(node) != NULL ||
+                 dynamic_cast<ContinueStmt *>(node) != NULL ||
+                 dynamic_cast<GotoStmt *>(node) != NULL) {
+            // these statement types have no child nodes to walk here
+        }
+        else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
+            ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
+        else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
+            rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data);
+        else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
+            std::vector<Stmt *> &sls = sl->stmts;
+            for (unsigned int i = 0; i < sls.size(); ++i)
+                sls[i] = (Stmt *)WalkAST(sls[i], preFunc, postFunc, data);
+        }
+        else if ((ps = dynamic_cast<PrintStmt *>(node)) != NULL)
+            ps->values = (Expr *)WalkAST(ps->values, preFunc, postFunc, data);
+        else if ((as = dynamic_cast<AssertStmt *>(node)) != NULL)
+            as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
+        else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
+            dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
+        else
+            FATAL("Unhandled statement type in WalkAST()");
+    }
+    else {
+        ///////////////////////////////////////////////////////////////////////////
+        // Handle expressions
+        Assert(dynamic_cast<Expr *>(node) != NULL);
+        UnaryExpr *ue;
+        BinaryExpr *be;
+        AssignExpr *ae;
+        SelectExpr *se;
+        ExprList *el;
+        FunctionCallExpr *fce;
+        IndexExpr *ie;
+        MemberExpr *me;
+        TypeCastExpr *tce;
+        ReferenceExpr *re;
+        DereferenceExpr *dre;
+        SizeOfExpr *soe;
+        AddressOfExpr *aoe;
+        NewExpr *newe;
+
+        if ((ue = dynamic_cast<UnaryExpr *>(node)) != NULL)
+            ue->expr = (Expr *)WalkAST(ue->expr, preFunc, postFunc, data);
+        else if ((be = dynamic_cast<BinaryExpr *>(node)) != NULL) {
+            be->arg0 = (Expr *)WalkAST(be->arg0, preFunc, postFunc, data);
+            be->arg1 = (Expr *)WalkAST(be->arg1, preFunc, postFunc, data);
+        }
+        else if ((ae = dynamic_cast<AssignExpr *>(node)) != NULL) {
+            ae->lvalue = (Expr *)WalkAST(ae->lvalue, preFunc, postFunc, data);
+            ae->rvalue = (Expr *)WalkAST(ae->rvalue, preFunc, postFunc, data);
+        }
+        else if ((se = dynamic_cast<SelectExpr *>(node)) != NULL) {
+            se->test = (Expr *)WalkAST(se->test, preFunc, postFunc, data);
+            se->expr1 = (Expr *)WalkAST(se->expr1, preFunc, postFunc, data);
+            se->expr2 = (Expr *)WalkAST(se->expr2, preFunc, postFunc, data);
+        }
+        else if ((el = dynamic_cast<ExprList *>(node)) != NULL) {
+            for (unsigned int i = 0; i < el->exprs.size(); ++i)
+                el->exprs[i] = (Expr *)WalkAST(el->exprs[i], preFunc, 
+                                               postFunc, data);
+        }
+        else if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
+            fce->func = (Expr *)WalkAST(fce->func, preFunc, postFunc, data);
+            fce->args = (ExprList *)WalkAST(fce->args, preFunc, postFunc, data);
+            fce->launchCountExpr = (Expr *)WalkAST(fce->launchCountExpr, preFunc,
+                                                   postFunc, data);
+        }
+        else if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL) {
+            ie->baseExpr = (Expr *)WalkAST(ie->baseExpr, preFunc, postFunc, data);
+            ie->index = (Expr *)WalkAST(ie->index, preFunc, postFunc, data);
+        }
+        else if ((me = dynamic_cast<MemberExpr *>(node)) != NULL)
+            me->expr = (Expr *)WalkAST(me->expr, preFunc, postFunc, data);
+        else if ((tce = dynamic_cast<TypeCastExpr *>(node)) != NULL)
+            tce->expr = (Expr *)WalkAST(tce->expr, preFunc, postFunc, data);
+        else if ((re = dynamic_cast<ReferenceExpr *>(node)) != NULL)
+            re->expr = (Expr *)WalkAST(re->expr, preFunc, postFunc, data);
+        else if ((dre = dynamic_cast<DereferenceExpr *>(node)) != NULL)
+            dre->expr = (Expr *)WalkAST(dre->expr, preFunc, postFunc, data);
+        else if ((soe = dynamic_cast<SizeOfExpr *>(node)) != NULL)
+            soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
+        else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
+            aoe->expr = (Expr *)WalkAST(aoe->expr, preFunc, postFunc, data);
+        else if ((newe = dynamic_cast<NewExpr *>(node)) != NULL) {
+            newe->countExpr = (Expr *)WalkAST(newe->countExpr, preFunc, 
+                                              postFunc, data);
+            newe->initExpr = (Expr *)WalkAST(newe->initExpr, preFunc, 
+                                             postFunc, data);
+        }
+        else if (dynamic_cast<SymbolExpr *>(node) != NULL ||
+                 dynamic_cast<ConstExpr *>(node) != NULL ||
+                 dynamic_cast<FunctionSymbolExpr *>(node) != NULL ||
+                 dynamic_cast<SyncExpr *>(node) != NULL ||
+                 dynamic_cast<NullPointerExpr *>(node) != NULL) {
+            // these expression types have no child nodes to walk here
+        }
+        else 
+            FATAL("Unhandled expression type in WalkAST().");
+    }
+
+    // Call the post-order callback; its result is returned in place of node
+    if (postFunc != NULL)
+        return postFunc(node, data);
+    else
+        return node;
+}
+
+
+static ASTNode *
+lOptimizeNode(ASTNode *node, void *) {
+    return node->Optimize();
+}
+
+
+ASTNode *
+Optimize(ASTNode *root) {
+    return WalkAST(root, NULL, lOptimizeNode, NULL);
+}
+
+
+Expr *
+Optimize(Expr *expr) {
+    return (Expr *)Optimize((ASTNode *)expr);
+}
+
+
+Stmt *
+Optimize(Stmt *stmt) {
+    return (Stmt *)Optimize((ASTNode *)stmt);
+}
+
+
+static ASTNode *
+lTypeCheckNode(ASTNode *node, void *) {
+    return node->TypeCheck();
+}
+
+
+ASTNode *
+TypeCheck(ASTNode *root) {
+    return WalkAST(root, NULL, lTypeCheckNode, NULL);
+}
+
+
+Expr *
+TypeCheck(Expr *expr) {
+    return (Expr *)TypeCheck((ASTNode *)expr);
+}
+
+
+Stmt *
+TypeCheck(Stmt *stmt) {
+    return (Stmt *)TypeCheck((ASTNode *)stmt);
+}
+
+
+static bool
+lCostCallback(ASTNode *node, void *c) {
+    // c points at the caller's running total; add this node's own cost.
+    *static_cast<int *>(c) += node->EstimateCost();
+    return true;  // always continue on to the node's children
+}
+
+
+int
+EstimateCost(ASTNode *root) {
+    int total = 0;  // accumulated by lCostCallback at each visited node
+    WalkAST(root, lCostCallback, NULL, &total);
+    return total;
+}
+
+
+/** Pre-order WalkAST() callback: if running the code for the given node with
+    the execution mask all off is unsafe, sets *data (a bool *) to false and
+    returns false so that the walk doesn't descend further. */
+static bool
+lCheckAllOffSafety(ASTNode *node, void *data) {
+    bool *okPtr = (bool *)data;
+
+    if (dynamic_cast<FunctionCallExpr *>(node) != NULL) {
+        // FIXME: If we could somehow determine that the function being
+        // called was safe (and all of the args Exprs were safe), then it'd
+        // be nice to be able to return true here.  (Consider a call to
+        // e.g. floatbits() in the stdlib.)  Unfortunately for now we just
+        // have to be conservative.
+        *okPtr = false;
+        return false;
+    }
+
+    if (dynamic_cast<AssertStmt *>(node) != NULL) {
+        // While it's fine to run the assert for varying tests, it's not
+        // desirable to check an assert on a uniform variable if all of the
+        // lanes are off.
+        *okPtr = false;
+        return false;
+    }
+
+    if (dynamic_cast<NewExpr *>(node) != NULL ||
+        dynamic_cast<DeleteStmt *>(node) != NULL) {
+        // We definitely don't want to run the uniform variants of these if
+        // the mask is all off.  It's also worth skipping the overhead of
+        // executing the varying versions of them in the all-off mask case.
+        *okPtr = false;
+        return false;
+    }
+
+    if (g->target.allOffMaskIsSafe == true)
+        // Don't worry about memory accesses if we have a target that can
+        // safely run them with the mask all off
+        return true;
+
+    IndexExpr *ie;
+    if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
+        const Type *type = ie->baseExpr->GetType();
+        if (type == NULL)
+            return true;
+        if (dynamic_cast<const ReferenceType *>(type) != NULL)
+            type = type->GetReferenceTarget();
+
+        ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
+        if (ce == NULL) {
+            // indexing with a non-constant expression -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        const PointerType *pointerType = 
+            dynamic_cast<const PointerType *>(type);
+        if (pointerType != NULL) {
+            // pointer[index] -> can't be sure -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        const SequentialType *seqType = 
+            dynamic_cast<const SequentialType *>(type);
+        Assert(seqType != NULL);
+        int nElements = seqType->GetElementCount();
+        if (nElements == 0) {
+            // Unsized array, so we can't be sure -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        int32_t indices[ISPC_MAX_NVEC];
+        int count = ce->AsInt32(indices);
+        for (int i = 0; i < count; ++i) {
+            if (indices[i] < 0 || indices[i] >= nElements) {
+                // Index is out of bounds -> not safe
+                *okPtr = false;
+                return false;
+            }
+        }
+
+        // All indices are in-bounds
+        return true;
+    }
+
+    MemberExpr *me;
+    if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
+        me->dereferenceExpr) {
+        *okPtr = false;
+        return false;
+    }
+
+    DereferenceExpr *de;
+    if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
+        const Type *exprType = de->expr->GetType();
+        if (dynamic_cast<const PointerType *>(exprType) != NULL) {
+            *okPtr = false;
+            return false;
+        }
+    }
+
+    return true;
+}
+
+
+bool
+SafeToRunWithMaskAllOff(ASTNode *root) {
+    bool isSafe = true;  // cleared by lCheckAllOffSafety on any unsafe node
+    WalkAST(root, lCheckAllOffSafety, NULL, &isSafe);
+    return isSafe;
+}
--- a/ast.h
+++ b/ast.h
@@ -53,10 +53,11 @@ public:
    virtual ~ASTNode();

    /** The Optimize() method should perform any appropriate early-stage
-        optimizations on the node (e.g. constant folding).  The caller
-        should use the returned ASTNode * in place of the original node.
-        This method may return NULL if an error is encountered during
-        optimization. */
+        optimizations on the node (e.g. constant folding).  This method
+        will be called after the node's children have already been
+        optimized, and the caller will store the returned ASTNode * in
+        place of the original node.  This method should return NULL if an
+        error is encountered during optimization. */
    virtual ASTNode *Optimize() = 0;

    /** Type checking should be performed by the node when this method is
@@ -65,6 +66,9 @@ public:
        pointer in place of the original ASTNode *. */
    virtual ASTNode *TypeCheck() = 0;

+    /** Estimate the execution cost of the node (not including the cost of
+        the children).  The value returned should be based on the COST_*
+        enumerant values defined in ispc.h. */
    virtual int EstimateCost() const = 0;

    /** All AST nodes must track the file position where they are
@@ -91,4 +95,57 @@ private:
    std::vector<Function *> functions;
 };

+
+/** Callback function type for the preorder traversal visiting function
+    of the AST walk.
+ */
+typedef bool (* ASTPreCallBackFunc)(ASTNode *node, void *data);
+
+/** Callback function type for the postorder traversal visiting function
+    of the AST walk.
+ */
+typedef ASTNode * (* ASTPostCallBackFunc)(ASTNode *node, void *data);
+
+/** Walk (some portion of) an AST, starting from the given root node.  At
+    each node, if preFunc is non-NULL, call it, passing the given void
+    *data pointer; if preFunc returns false, the node is returned as-is
+    without visiting its children or calling postFunc.  Otherwise WalkAST()
+    is called recursively on the node's children, and then postFunc (if
+    non-NULL) is called at the node; the value it returns replaces the
+    node in its parent and is the value returned by WalkAST(). */
+extern ASTNode *WalkAST(ASTNode *root, ASTPreCallBackFunc preFunc,
+                        ASTPostCallBackFunc postFunc, void *data);
+
+/** Perform simple optimizations on the AST or portion thereof passed to
+    this function, returning the resulting AST. */
+extern ASTNode *Optimize(ASTNode *root);
+
+/** Convenience version of Optimize() for Expr *s that returns an Expr *
+    (rather than an ASTNode *, which would require the caller to cast back
+    to an Expr *). */ 
+extern Expr *Optimize(Expr *);
+
+/** Convenience version of Optimize() for Stmt *s that returns a Stmt *
+    (rather than an ASTNode *, which would require the caller to cast back
+    to a Stmt *). */ 
+extern Stmt *Optimize(Stmt *);
+
+/** Perform type-checking on the given AST (or portion of one), returning a
+    pointer to the root of the resulting AST. */
+extern ASTNode *TypeCheck(ASTNode *root);
+
+/** Convenience version of TypeCheck() for Expr *s that returns an Expr *. */
+extern Expr *TypeCheck(Expr *);
+
+/** Convenience version of TypeCheck() for Stmt *s that returns an Stmt *. */
+extern Stmt *TypeCheck(Stmt *);
+
+/** Returns an estimate of the execution cost of the tree starting at
+    the given root. */
+extern int EstimateCost(ASTNode *root);
+
+/** Returns true if it would be safe to run the given code with an "all
+    off" mask. */ 
+extern bool SafeToRunWithMaskAllOff(ASTNode *root);
+
 #endif // ISPC_AST_H
--- a/bitcode2cpp.py
+++ b/bitcode2cpp.py
@@ -11,7 +11,10 @@ length=0

 src=str(sys.argv[1])

-target = re.sub(".*builtins-", "", src)
+target = re.sub("builtins/target-", "", src)
+target = re.sub(r"builtins\\target-", "", target)
+target = re.sub("builtins/", "", target)
+target = re.sub(r"builtins\\", "", target)
 target = re.sub("\.ll$", "", target)
 target = re.sub("\.c$", "", target)
 target = re.sub("-", "_", target)
@@ -23,17 +26,21 @@ if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT")
 try:
    as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
 except IOError:
-    print >> sys.stderr, "Couldn't open " + src
+    sys.stderr.write("Couldn't open " + src)
    sys.exit(1)

-print "unsigned char builtins_bitcode_" + target + "[] = {"
-for line in as_out.stdout.readlines():
-    length = length + len(line)
-    for c in line:
-        print ord(c)
-        print ", "
-print " 0 };\n\n"
-print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"
+width = 16;
+sys.stdout.write("unsigned char builtins_bitcode_" + target + "[] = {\n")
+
+data = as_out.stdout.read()
+for i in range(0, len(data), 1):
+        sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))
+
+        if i%width == (width-1):
+            sys.stdout.write("\n")
+
+sys.stdout.write("0x00 };\n\n")
+sys.stdout.write("int builtins_bitcode_" + target + "_length = " + str(i+1) + ";\n")

 as_out.wait()

--- a/buildall.bat
+++ b/buildall.bat
@@ -8,7 +8,6 @@ REM Both the LLVM binaries and python need to be in the path
 set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin

 msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
-msbuild ispc_test.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release

 msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
 msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild
--- a/buildispc.bat
+++ b/buildispc.bat
@@ -0,0 +1,11 @@
+@echo off
+
+REM If LLVM_INSTALL_DIR isn't set globally in your environment,
+REM it can be set here.
+set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
+set LLVM_VERSION=3.1svn
+
+REM Both the LLVM binaries and python need to be in the path
+set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
+
+msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -99,6 +99,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
        return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;

    // varying
+    if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType &&
+        t == LLVMTypes::MaskType)
+        return AtomicType::VaryingBool;
    else if (t == LLVMTypes::Int8VectorType)
        return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
    else if (t == LLVMTypes::Int16VectorType)
@@ -194,7 +197,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
    // symbol creation code below assumes that any LLVM vector of i32s is a
    // varying int32.  Here, we need that to be interpreted as a varying
    // bool, so just have a one-off override for that one...
-    if (name == "__sext_varying_bool") {
+    if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") {
        const Type *returnType = AtomicType::VaryingInt32;
        std::vector<const Type *> argTypes;
        argTypes.push_back(AtomicType::VaryingBool);
@@ -257,7 +260,7 @@ static void
 lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
 #if 0
    // FIXME: handle globals?
-    assert(module->global_empty());
+    Assert(module->global_empty());
 #endif

    llvm::Module::iterator iter;
@@ -287,11 +290,11 @@ lCheckModuleIntrinsics(llvm::Module *module) {
        // check the llvm.x86.* intrinsics for now...
        if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
            llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
-            assert(id != 0);
+            Assert(id != 0);
            LLVM_TYPE_CONST llvm::Type *intrinsicType = 
                llvm::Intrinsic::getType(*g->ctx, id);
            intrinsicType = llvm::PointerType::get(intrinsicType, 0);
-            assert(func->getType() == intrinsicType);
+            Assert(func->getType() == intrinsicType);
        }
    }
 }
@@ -311,8 +314,12 @@ lCheckModuleIntrinsics(llvm::Module *module) {
 static void
 lSetInternalFunctions(llvm::Module *module) {
    const char *names[] = {
+        "__add_float",
+        "__add_int32",
+        "__add_uniform_double",
        "__add_uniform_int32",
        "__add_uniform_int64",
+        "__add_varying_double",
        "__add_varying_int32",
        "__add_varying_int64",
        "__aos_to_soa3_float",
@@ -371,18 +378,21 @@ lSetInternalFunctions(llvm::Module *module) {
        "__atomic_xor_uniform_int64_global",
        "__broadcast_double",
        "__broadcast_float",
-        "__broadcast_int16",
-        "__broadcast_int32",
-        "__broadcast_int64",
-        "__broadcast_int8",
+        "__broadcast_i16",
+        "__broadcast_i32",
+        "__broadcast_i64",
+        "__broadcast_i8",
        "__ceil_uniform_double",
        "__ceil_uniform_float",
        "__ceil_varying_double",
        "__ceil_varying_float",
+        "__clock",
        "__count_trailing_zeros_i32",
        "__count_trailing_zeros_i64",
        "__count_leading_zeros_i32",
        "__count_leading_zeros_i64",
+        "__delete_uniform",
+        "__delete_varying",
        "__do_assert_uniform",
        "__do_assert_varying",
        "__do_print", 
@@ -428,6 +438,12 @@ lSetInternalFunctions(llvm::Module *module) {
        "__max_varying_uint32",
        "__max_varying_uint64",
        "__memory_barrier",
+        "__memcpy32",
+        "__memcpy64",
+        "__memmove32",
+        "__memmove64",
+        "__memset32",
+        "__memset64",
        "__min_uniform_double",
        "__min_uniform_float",
        "__min_uniform_int32",
@@ -441,6 +457,9 @@ lSetInternalFunctions(llvm::Module *module) {
        "__min_varying_uint32",
        "__min_varying_uint64",
        "__movmsk",
+        "__new_uniform",
+        "__new_varying32",
+        "__new_varying64",
        "__num_cores",
        "__packed_load_active",
        "__packed_store_active",
@@ -476,10 +495,10 @@ lSetInternalFunctions(llvm::Module *module) {
        "__reduce_min_uint64",
        "__rotate_double",
        "__rotate_float",
-        "__rotate_int16",
-        "__rotate_int32",
-        "__rotate_int64",
-        "__rotate_int8",
+        "__rotate_i16",
+        "__rotate_i32",
+        "__rotate_i64",
+        "__rotate_i8",
        "__round_uniform_double",
        "__round_uniform_float",
        "__round_varying_double",
@@ -490,16 +509,16 @@ lSetInternalFunctions(llvm::Module *module) {
        "__sext_varying_bool",
        "__shuffle2_double",
        "__shuffle2_float",
-        "__shuffle2_int16",
-        "__shuffle2_int32",
-        "__shuffle2_int64",
-        "__shuffle2_int8",
+        "__shuffle2_i16",
+        "__shuffle2_i32",
+        "__shuffle2_i64",
+        "__shuffle2_i8",
        "__shuffle_double",
        "__shuffle_float",
-        "__shuffle_int16",
-        "__shuffle_int32",
-        "__shuffle_int64",
-        "__shuffle_int8",
+        "__shuffle_i16",
+        "__shuffle_i32",
+        "__shuffle_i64",
+        "__shuffle_i8",
        "__soa_to_aos3_float",
        "__soa_to_aos3_float16",
        "__soa_to_aos3_float4",
@@ -514,6 +533,8 @@ lSetInternalFunctions(llvm::Module *module) {
        "__sqrt_uniform_float",
        "__sqrt_varying_double",
        "__sqrt_varying_float",
+        "__stdlib_acosf",
+        "__stdlib_asinf",
        "__stdlib_atan",
        "__stdlib_atan2",
        "__stdlib_atan2f",
@@ -543,12 +564,16 @@ lSetInternalFunctions(llvm::Module *module) {
        "__svml_pow",
        "__undef_uniform",
        "__undef_varying",
+        "__vec4_add_float",
+        "__vec4_add_int32",
+        "__vselect_float",
+        "__vselect_i32",
    };

    int count = sizeof(names) / sizeof(names[0]);
    for (int i = 0; i < count; ++i) {
        llvm::Function *f = module->getFunction(names[i]);
-        if (f != NULL)
+        if (f != NULL && f->empty() == false)
            f->setLinkage(llvm::GlobalValue::InternalLinkage);
    }
 }
@@ -583,9 +608,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
        // linking together modules with incompatible target triples..
        llvm::Triple mTriple(m->module->getTargetTriple());
        llvm::Triple bcTriple(bcModule->getTargetTriple());
-        assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
+        Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
               mTriple.getArch() == bcTriple.getArch());
-        assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
+        Assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
               mTriple.getVendor() == bcTriple.getVendor());
        bcModule->setTargetTriple(mTriple.str());

@@ -610,8 +635,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
 static void
 lDefineConstantInt(const char *name, int val, llvm::Module *module,
                   SymbolTable *symbolTable) {
-    Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32,
-                            SC_STATIC);
+    Symbol *pw = 
+        new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType(),
+                   SC_STATIC);
    pw->constValue = new ConstExpr(pw->type, val, SourcePos());
    LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
    llvm::Constant *linit = LLVMInt32(val);
@@ -631,7 +657,7 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
    Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);

    llvm::Function *func = module->getFunction(name);
-    assert(func != NULL); // it should be declared already...
+    Assert(func != NULL); // it should be declared already...
    func->addFnAttr(llvm::Attribute::AlwaysInline);
    llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
    llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
@@ -644,8 +670,9 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,

 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
-    Symbol *pidx = new Symbol("programIndex", SourcePos(), 
-                              AtomicType::VaryingConstInt32, SC_STATIC);
+    Symbol *pidx = 
+        new Symbol("programIndex", SourcePos(), 
+                   AtomicType::VaryingInt32->GetAsConstType(), SC_STATIC);

    int pi[ISPC_MAX_NVEC];
    for (int i = 0; i < g->target.vectorWidth; ++i)
@@ -706,11 +733,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        extern int builtins_bitcode_sse4_x2_length;
        switch (g->target.vectorWidth) {
        case 4: 
-            AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length, 
+            AddBitcodeToModule(builtins_bitcode_sse4,
+                               builtins_bitcode_sse4_length, 
                               module, symbolTable);
            break;
        case 8:
-            AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length, 
+            AddBitcodeToModule(builtins_bitcode_sse4_x2, 
+                               builtins_bitcode_sse4_x2_length, 
                               module, symbolTable);
            break;
        default:
@@ -720,21 +749,77 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    case Target::AVX:
        switch (g->target.vectorWidth) {
        case 8:
-            extern unsigned char builtins_bitcode_avx[];
-            extern int builtins_bitcode_avx_length;
-            AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length, 
+            extern unsigned char builtins_bitcode_avx1[];
+            extern int builtins_bitcode_avx1_length;
+            AddBitcodeToModule(builtins_bitcode_avx1, 
+                               builtins_bitcode_avx1_length, 
                               module, symbolTable);
            break;
        case 16:
-            extern unsigned char builtins_bitcode_avx_x2[];
-            extern int builtins_bitcode_avx_x2_length;
-            AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
+            extern unsigned char builtins_bitcode_avx1_x2[];
+            extern int builtins_bitcode_avx1_x2_length;
+            AddBitcodeToModule(builtins_bitcode_avx1_x2, 
+                               builtins_bitcode_avx1_x2_length,
                               module,  symbolTable);
            break;
        default:
            FATAL("logic error in DefineStdlib");
        }
        break;
+    case Target::AVX2:
+        switch (g->target.vectorWidth) {
+        case 8:
+            extern unsigned char builtins_bitcode_avx2[];
+            extern int builtins_bitcode_avx2_length;
+            AddBitcodeToModule(builtins_bitcode_avx2, 
+                               builtins_bitcode_avx2_length, 
+                               module, symbolTable);
+            break;
+        case 16:
+            extern unsigned char builtins_bitcode_avx2_x2[];
+            extern int builtins_bitcode_avx2_x2_length;
+            AddBitcodeToModule(builtins_bitcode_avx2_x2, 
+                               builtins_bitcode_avx2_x2_length,
+                               module,  symbolTable);
+            break;
+        default:
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    case Target::GENERIC:
+        switch (g->target.vectorWidth) {
+        case 4:
+            extern unsigned char builtins_bitcode_generic_4[];
+            extern int builtins_bitcode_generic_4_length;
+            AddBitcodeToModule(builtins_bitcode_generic_4, 
+                               builtins_bitcode_generic_4_length, 
+                               module, symbolTable);
+            break;
+        case 8:
+            extern unsigned char builtins_bitcode_generic_8[];
+            extern int builtins_bitcode_generic_8_length;
+            AddBitcodeToModule(builtins_bitcode_generic_8, 
+                               builtins_bitcode_generic_8_length, 
+                               module, symbolTable);
+            break;
+        case 16:
+            extern unsigned char builtins_bitcode_generic_16[];
+            extern int builtins_bitcode_generic_16_length;
+            AddBitcodeToModule(builtins_bitcode_generic_16, 
+                               builtins_bitcode_generic_16_length, 
+                               module, symbolTable);
+            break;
+        case 1:
+            extern unsigned char builtins_bitcode_generic_1[];
+            extern int builtins_bitcode_generic_1_length;
+            AddBitcodeToModule(builtins_bitcode_generic_1, 
+                               builtins_bitcode_generic_1_length, 
+                               module, symbolTable);
+            break;
+        default:
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
    default:
        FATAL("logic error");
    }
@@ -759,14 +844,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
                           symbolTable);

+    lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
+                       module, symbolTable);
+
    if (includeStdlibISPC) {
        // If the user wants the standard library to be included, parse the
        // serialized version of the stdlib.ispc file to get its
-        // definitions added.  Disable emission of performance warnings for
-        // now, since the user doesn't care about any of that in the stdlib
-        // implementation...
-        extern char stdlib_code[];
-        yy_scan_string(stdlib_code);
-        yyparse();
+        // definitions added.
+        if (g->target.isa == Target::GENERIC && g->target.vectorWidth != 1) { // 1 wide uses x86 stdlib
+            extern char stdlib_generic_code[];
+            yy_scan_string(stdlib_generic_code);
+            yyparse();
+        }
+        else {
+            extern char stdlib_x86_code[];
+            yy_scan_string(stdlib_x86_code);
+            yyparse();
+        }
    }
 }
--- a/builtins/builtins.c
+++ b/builtins/builtins.c
@@ -149,7 +149,7 @@ void __do_print(const char *format, const char *types, int width, int mask,


 int __num_cores() {
-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(__MINGW32__)
    // This is quite a hack.  Including all of windows.h to get this definition
    // pulls in a bunch of stuff that leads to undefined symbols at link time.
    // So we don't #include <windows.h> but instead have the equivalent declarations
--- a/builtins/dispatch.ll
+++ b/builtins/dispatch.ll
@@ -48,23 +48,42 @@ declare void @abort() noreturn
 ;; corresponding to one of the Target::ISA enumerant values that gives the
 ;; most capable ISA that the curremt system can run.
 ;;
-;; #ifdef _MSC_VER
-;; extern void __stdcall __cpuid(int info[4], int infoType);
-;; #else
+;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum
+;; backwards compatibility for anyone building ispc with LLVM 2.9.
+;;
+;; #include <stdint.h>
+;; #include <stdlib.h>
+;; 
 ;; static void __cpuid(int info[4], int infoType) {
 ;;     __asm__ __volatile__ ("cpuid"
 ;;                           : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
 ;;                           : "0" (infoType));
 ;; }
-;; #endif
+;; 
+;; /* Save %ebx in case it's the PIC register */
+;; static void __cpuid_count(int info[4], int level, int count) {
+;;   __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
+;;                         "cpuid\n\t"
+;;                         "xchg{l}\t{%%}ebx, %1\n\t"
+;;                         : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
+;;                         : "0" (level), "2" (count));
+;; }
 ;; 
 ;; int32_t __get_system_isa() {
 ;;     int info[4];
 ;;     __cpuid(info, 1);
+;; 
 ;;     /* NOTE: the values returned below must be the same as the
 ;;        corresponding enumerant values in Target::ISA. */
-;;     if ((info[2] & (1 << 28)) != 0)
-;;         return 2; // AVX
+;;     if ((info[2] & (1 << 28)) != 0) {
+;;         // AVX1 for sure. Do we have AVX2?
+;;         // Call cpuid with eax=7, ecx=0
+;;         __cpuid_count(info, 7, 0);
+;;         if ((info[1] & (1 << 5)) != 0)
+;;             return 3; // AVX2
+;;         else
+;;             return 2; // AVX1
+;;     }
 ;;     else if ((info[2] & (1 << 19)) != 0)
 ;;         return 1; // SSE4
 ;;     else if ((info[3] & (1 << 26)) != 0)
@@ -76,33 +95,42 @@ declare void @abort() noreturn
 %0 = type { i32, i32, i32, i32 }

 define i32 @__get_system_isa() nounwind ssp {
-  %1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
-  %2 = extractvalue %0 %1, 2
-  %3 = extractvalue %0 %1, 3
-  %4 = and i32 %2, 268435456
-  %5 = icmp eq i32 %4, 0
-  br i1 %5, label %6, label %13
+entry:
+  %0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+  %asmresult9.i = extractvalue %0 %0, 2
+  %asmresult10.i = extractvalue %0 %0, 3
+  %and = and i32 %asmresult9.i, 268435456
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %if.else7, label %if.then

-; <label>:6                                       ; preds = %0
-  %7 = and i32 %2, 524288
-  %8 = icmp eq i32 %7, 0
-  br i1 %8, label %9, label %13
+if.then:                                          ; preds = %entry
+  %1 = tail call %0 asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
+  %asmresult9.i24 = extractvalue %0 %1, 1
+  %and4 = lshr i32 %asmresult9.i24, 5
+  %2 = and i32 %and4, 1
+  %3 = or i32 %2, 2
+  br label %return

-; <label>:9                                       ; preds = %6
-  %10 = and i32 %3, 67108864
-  %11 = icmp eq i32 %10, 0
-  br i1 %11, label %12, label %13
+if.else7:                                         ; preds = %entry
+  %and10 = and i32 %asmresult9.i, 524288
+  %cmp11 = icmp eq i32 %and10, 0
+  br i1 %cmp11, label %if.else13, label %return

-; <label>:12                                      ; preds = %9
+if.else13:                                        ; preds = %if.else7
+  %and16 = and i32 %asmresult10.i, 67108864
+  %cmp17 = icmp eq i32 %and16, 0
+  br i1 %cmp17, label %if.else19, label %return
+
+if.else19:                                        ; preds = %if.else13
  tail call void @abort() noreturn nounwind
  unreachable

-; <label>:13                                      ; preds = %9, %6, %0
-  %.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
-  ret i32 %.0
+return:                                           ; preds = %if.else13, %if.else7, %if.then
+  %retval.0 = phi i32 [ %3, %if.then ], [ 1, %if.else7 ], [ 0, %if.else13 ]
+  ret i32 %retval.0
 }

-
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; This function is called by each of the dispatch functions we generate;
 ;; it sets @__system_best_isa if it is unset.

--- a/builtins/target-avx-common.ll
+++ b/builtins/target-avx-common.ll
@@ -32,6 +32,11 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; AVX target implementation.

+ctlztz()
+define_prefetches()
+define_shuffles()
+aossoa()
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -32,12 +32,16 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Basic 16-wide definitions

-stdlib_core(16)
-packed_load_and_store(16)
-scans(16)
-int64minmax(16)
+define(`WIDTH',`16')
+define(`MASK',`i32')
+include(`util.m4')

-include(`builtins-avx-common.ll')
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+include(`target-avx-common.ll')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -166,33 +170,6 @@ define <16 x float> @__min_varying_float(<16 x float>,
 }


-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
-
-define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
-  ret <16 x i32> %ret
-}
-
-define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-  ret <16 x i32> %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unsigned int min/max
-
-define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
-  ret <16 x i32> %ret
-}
-
-define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-  ret <16 x i32> %ret
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops

@@ -381,13 +358,13 @@ load_and_broadcast(16, i32, 32)
 load_and_broadcast(16, i64, 64)

 ; no masked load instruction for i8 and i16 types??
-load_masked(16, i8,  8,  1)
-load_masked(16, i16, 16, 2)
+masked_load(16, i8,  8,  1)
+masked_load(16, i16, 16, 2)

 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
 
-define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
  %floatmask = bitcast <16 x i32> %mask to <16 x float>
  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
     <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -405,7 +382,7 @@ define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
 }


-define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
  ; double up masks, bitcast to doubles
  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -618,12 +595,7 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
-
-gen_gather(16, i8)
-gen_gather(16, i16)
-gen_gather(16, i32)
-gen_gather(16, i64)
+;; scatter

 gen_scatter(16, i8)
 gen_scatter(16, i16)
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -32,12 +32,16 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Basic 8-wide definitions

-stdlib_core(8)
-packed_load_and_store(8)
-scans(8)
-int64minmax(8)
+define(`WIDTH',`8')
+define(`MASK',`i32')
+include(`util.m4')

-include(`builtins-avx-common.ll')
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+include(`target-avx-common.ll')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -166,33 +170,6 @@ define <8 x float> @__min_varying_float(<8 x float>,
 }


-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
-
-define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
-  ret <8 x i32> %ret
-}
-
-define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-  ret <8 x i32> %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unsigned int min/max
-
-define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
-  ret <8 x i32> %ret
-}
-
-define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-  ret <8 x i32> %ret
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops

@@ -234,7 +211,7 @@ reduce_equal(8)
 ;; horizontal int32 ops

 define <8 x i32> @__add_varying_int32(<8 x i32>,
-                                               <8 x i32>) nounwind readnone alwaysinline {
+                                      <8 x i32>) nounwind readnone alwaysinline {
  %s = add <8 x i32> %0, %1
  ret <8 x i32> %s
 }
@@ -310,7 +287,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline
 ;; horizontal int64 ops

 define <8 x i64> @__add_varying_int64(<8 x i64>,
-                                               <8 x i64>) nounwind readnone alwaysinline {
+                                      <8 x i64>) nounwind readnone alwaysinline {
  %s = add <8 x i64> %0, %1
  ret <8 x i64> %s
 }
@@ -362,13 +339,13 @@ load_and_broadcast(8, i32, 32)
 load_and_broadcast(8, i64, 64)

 ; no masked load instruction for i8 and i16 types??
-load_masked(8, i8,  8,  1)
-load_masked(8, i16, 16, 2)
+masked_load(8, i8,  8,  1)
+masked_load(8, i16, 16, 2)

 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
 
-define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
  %floatmask = bitcast <8 x i32> %mask to <8 x float>
  %floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
  %retval = bitcast <8 x float> %floatval to <8 x i32>
@@ -376,7 +353,7 @@ define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
 }


-define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
  ; double up masks, bitcast to doubles
  %mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -399,9 +376,6 @@ define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

-; FIXME: there is no AVX instruction for these, but we could be clever
-; by packing the bits down and setting the last 3/4 or half, respectively,
-; of the mask to zero...  Not sure if this would be a win in the end
 gen_masked_store(8, i8, 8)
 gen_masked_store(8, i16, 16)

@@ -516,12 +490,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,


 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
-
-gen_gather(8, i8)
-gen_gather(8, i16)
-gen_gather(8, i32)
-gen_gather(8, i64)
+;; scatter

 gen_scatter(8, i8)
 gen_scatter(8, i16)
--- a/builtins/target-avx1-x2.ll
+++ b/builtins/target-avx1-x2.ll
@@ -0,0 +1,77 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+include(`target-avx-x2.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+gen_gather(16, i8)
+gen_gather(16, i16)
+gen_gather(16, i32)
+gen_gather(16, i64)
+
+
--- a/builtins/target-avx1.ll
+++ b/builtins/target-avx1.ll
@@ -0,0 +1,75 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+include(`target-avx.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+;; Per-lane signed 32-bit minimum of two 8-wide vectors, using the 4-wide SSE4.1 pminsd intrinsic.
+define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <8 x i32> %ret  ; %ret is presumably defined by the macro expansion above (macro not shown here)
+}
+;; Per-lane signed 32-bit maximum of two 8-wide vectors, using the 4-wide SSE4.1 pmaxsd intrinsic.
+define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <8 x i32> %ret  ; %ret is presumably defined by the macro expansion above (macro not shown here)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+;; Per-lane unsigned 32-bit minimum of two 8-wide vectors, using the 4-wide SSE4.1 pminud intrinsic.
+define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <8 x i32> %ret  ; %ret is presumably defined by the macro expansion above (macro not shown here)
+}
+;; Per-lane unsigned 32-bit maximum of two 8-wide vectors, using the 4-wide SSE4.1 pmaxud intrinsic.
+define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <8 x i32> %ret  ; %ret is presumably defined by the macro expansion above (macro not shown here)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+;; Declarations only: this file gives no bodies for the half/float conversions (presumably defined elsewhere).
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+;; Instantiate the gather helper macro for 8-wide vectors, once per element type (macro defined outside this file).
+gen_gather(8, i8)
+gen_gather(8, i16)
+gen_gather(8, i32)
+gen_gather(8, i64)
--- a/builtins/target-avx2-x2.ll
+++ b/builtins/target-avx2-x2.ll
@@ -0,0 +1,129 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+include(`target-avx-x2.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+;; 8-wide AVX2 signed 32-bit min/max intrinsics, used by the two functions that follow.
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
+;; Per-lane signed 32-bit minimum of two 16-wide vectors, using the 8-wide AVX2 intrinsic.
+define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
+  ret <16 x i32> %m  ; %m is presumably defined by the macro expansion above (macro not shown here)
+}
+;; Per-lane signed 32-bit maximum of two 16-wide vectors, using the 8-wide AVX2 intrinsic.
+define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
+  ret <16 x i32> %m  ; %m is presumably defined by the macro expansion above (macro not shown here)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+;; 8-wide AVX2 unsigned 32-bit min/max intrinsics, used by the two functions that follow.
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
+;; Per-lane unsigned 32-bit minimum of two 16-wide vectors, using the 8-wide AVX2 intrinsic.
+define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
+  ret <16 x i32> %m  ; %m is presumably defined by the macro expansion above (macro not shown here)
+}
+;; Per-lane unsigned 32-bit maximum of two 16-wide vectors, using the 8-wide AVX2 intrinsic.
+define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
+  ret <16 x i32> %m  ; %m is presumably defined by the macro expansion above (macro not shown here)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+;; 8-wide half/float conversion intrinsics used by the four conversion functions that follow.
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; the i32 operand selects the rounding mode: 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+;; Convert 16 half-precision values (as i16 bits) to float: split into two 8-wide halves, convert each, rejoin.
+define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
+  %r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
+             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; lanes 0-7
+  %vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
+  %r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
+             <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>  ; lanes 8-15
+  %vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
+  %r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1, 
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>  ; concatenate both halves
+  ret <16 x float> %r
+}
+;; Convert 16 floats to half precision (returned as i16 bits): split in two, convert each with mode 0, rejoin.
+define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
+  %r_0 = shufflevector <16 x float> %v, <16 x float> undef,
+             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; lanes 0-7
+  %vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
+  %r_1 = shufflevector <16 x float> %v, <16 x float> undef,
+             <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>  ; lanes 8-15
+  %vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
+  %r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1, 
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>  ; concatenate both halves
+  ret <16 x i16> %r
+}
+;; Convert one half-precision value to float by placing it in lane 0 of an 8-wide vector and converting that.
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>  ; only lane 0 is defined
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+;; Convert one float to half precision by placing it in lane 0 of an 8-wide vector and converting that.
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>  ; only lane 0 is defined
+  ; rounding-mode operand 0: round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+;; Instantiate the gather helper macro for 16-wide vectors, once per element type (macro defined outside this file).
+gen_gather(16, i8)
+gen_gather(16, i16)
+gen_gather(16, i32)
+gen_gather(16, i64)
+
+
--- a/builtins/target-avx2.ll
+++ b/builtins/target-avx2.ll
@@ -0,0 +1,110 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+include(`target-avx.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+;; 8-wide AVX2 signed 32-bit min/max intrinsics, used by the two functions that follow.
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
+;; Per-lane signed 32-bit minimum of two 8-wide vectors: a direct call to the AVX2 intrinsic.
+define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %m = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)
+  ret <8 x i32> %m
+}
+;; Per-lane signed 32-bit maximum of two 8-wide vectors: a direct call to the AVX2 intrinsic.
+define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %m = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)
+  ret <8 x i32> %m
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+;; 8-wide AVX2 unsigned 32-bit min/max intrinsics, used by the two functions that follow.
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
+;; Per-lane unsigned 32-bit minimum of two 8-wide vectors: a direct call to the AVX2 intrinsic.
+define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %m = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)
+  ret <8 x i32> %m
+}
+;; Per-lane unsigned 32-bit maximum of two 8-wide vectors: a direct call to the AVX2 intrinsic.
+define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %m = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)
+  ret <8 x i32> %m
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+;; 8-wide half/float conversion intrinsics used by the four conversion functions that follow.
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; the i32 operand selects the rounding mode: 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+;; Convert 8 half-precision values (as i16 bits) to float with a single intrinsic call.
+define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
+  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
+  ret <8 x float> %r
+}
+;; Convert 8 floats to half precision (returned as i16 bits) with rounding-mode operand 0.
+define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
+  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
+  ret <8 x i16> %r
+}
+;; Convert one half-precision value to float by placing it in lane 0 of an 8-wide vector and converting that.
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>  ; only lane 0 is defined
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+;; Convert one float to half precision by placing it in lane 0 of an 8-wide vector and converting that.
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>  ; only lane 0 is defined
+  ; rounding-mode operand 0: round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+;; Instantiate the gather helper macro for 8-wide vectors, once per element type (macro defined outside this file).
+gen_gather(8, i8)
+gen_gather(8, i16)
+gen_gather(8, i32)
+gen_gather(8, i64)
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -0,0 +1,935 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Define the standard library builtins for the NOVEC target
+define(`MASK',`i32')  ; element type used for per-lane masks in this target
+define(`WIDTH',`1')  ; number of lanes per vector in this target
+include(`util.m4')
+; Define some basics for a 1-wide target by invoking macros not defined in this file (presumably from the file included above)
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+aossoa()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+;; Instantiate the masked-store helper for 1-wide vectors: arguments are lane count, element type, bit width.
+gen_masked_store(1, i8, 8)
+gen_masked_store(1, i16, 16)
+gen_masked_store(1, i32, 32)
+gen_masked_store(1, i64, 64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+;; Instantiate the load-and-broadcast helper for 1-wide vectors: lane count, element type, bit width.
+load_and_broadcast(1, i8, 8)
+load_and_broadcast(1, i16, 16)
+load_and_broadcast(1, i32, 32)
+load_and_broadcast(1, i64, 64)
+;; Masked loads: the last argument equals the element size in bytes (exact meaning is set by the macro - TODO confirm).
+masked_load(1, i8,  8,  1)
+masked_load(1, i16, 16, 2)
+masked_load(1, i32, 32, 4)
+masked_load(1, i64, 64, 8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; define these with macros that come from outside this file (only util.m4 is included here; no stdlib.m4 is)
+;; gather: one instantiation per element type for 1-wide vectors
+gen_gather(1, i8)
+gen_gather(1, i16)
+gen_gather(1, i32)
+gen_gather(1, i64)
+;; scatter: one instantiation per element type for 1-wide vectors
+gen_scatter(1, i8)
+gen_scatter(1, i16)
+gen_scatter(1, i32)
+gen_scatter(1, i64)
+
+;; 1-wide select on i8: yields the lane of %1 when the mask lane is nonzero, otherwise the lane of %0.
+define  <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
+                                         <1 x i32> %mask) nounwind readnone alwaysinline {
+;  %mv = trunc <1 x i32> %mask to <1 x i8>
+;  %notmask = xor <1 x i8> %mv, <i8 -1>
+;  %cleared_old = and <1 x i8> %0, %notmask
+;  %masked_new = and <1 x i8> %1, %mv
+;  %new = or <1 x i8> %cleared_old, %masked_new
+;  ret <1 x i8> %new
+
+   ; not doing this the easy way because of problems with LLVM's scalarizer
+;   %cmp = icmp eq <1 x i32> %mask, <i32 0>
+;   %sel = select <1 x i1> %cmp, <1 x i8> %0, <1 x i8> %1
+    %m = extractelement <1 x i32> %mask, i32 0
+    %cmp = icmp eq i32 %m, 0  ; true when the mask lane is zero
+    %d0 = extractelement <1 x i8> %0, i32 0
+    %d1 = extractelement <1 x i8> %1, i32 0
+    %sel = select i1 %cmp, i8 %d0, i8 %d1  ; mask zero -> %0, mask nonzero -> %1
+    %r = insertelement <1 x i8> undef, i8 %sel, i32 0
+   ret <1 x i8> %r
+}
+;; 1-wide select on i16: yields the lane of %1 when the mask lane is nonzero, otherwise the lane of %0.
+define  <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
+                                         <1 x i32> %mask) nounwind readnone alwaysinline {
+;  %mv = trunc <1 x i32> %mask to <1 x i16>
+;  %notmask = xor <1 x i16> %mv, <i16 -1>
+;  %cleared_old = and <1 x i16> %0, %notmask
+;  %masked_new = and <1 x i16> %1, %mv
+;  %new = or <1 x i16> %cleared_old, %masked_new
+;  ret <1 x i16> %new
+;   %cmp = icmp eq <1 x i32> %mask, <i32 0>
+;   %sel = select <1 x i1> %cmp, <1 x i16> %0, <1 x i16> %1
+    %m = extractelement <1 x i32> %mask, i32 0
+    %cmp = icmp eq i32 %m, 0  ; true when the mask lane is zero
+    %d0 = extractelement <1 x i16> %0, i32 0
+    %d1 = extractelement <1 x i16> %1, i32 0
+    %sel = select i1 %cmp, i16 %d0, i16 %d1  ; mask zero -> %0, mask nonzero -> %1
+    %r = insertelement <1 x i16> undef, i16 %sel, i32 0
+   ret <1 x i16> %r
+
+;   ret <1 x i16> %sel
+}
+
+;; 1-wide select on i32: yields the lane of %1 when the mask lane is nonzero, otherwise the lane of %0.
+define  <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
+                                         <1 x i32> %mask) nounwind readnone alwaysinline {
+;  %notmask = xor <1 x i32> %mask, <i32 -1>
+;  %cleared_old = and <1 x i32> %0, %notmask
+;  %masked_new = and <1 x i32> %1, %mask
+;  %new = or <1 x i32> %cleared_old, %masked_new
+;  ret <1 x i32> %new
+;   %cmp = icmp eq <1 x i32> %mask, <i32 0>
+;   %sel = select <1 x i1> %cmp, <1 x i32> %0, <1 x i32> %1
+;   ret <1 x i32> %sel
+    %m = extractelement <1 x i32> %mask, i32 0
+    %cmp = icmp eq i32 %m, 0  ; true when the mask lane is zero
+    %d0 = extractelement <1 x i32> %0, i32 0
+    %d1 = extractelement <1 x i32> %1, i32 0
+    %sel = select i1 %cmp, i32 %d0, i32 %d1  ; mask zero -> %0, mask nonzero -> %1
+    %r = insertelement <1 x i32> undef, i32 %sel, i32 0
+   ret <1 x i32> %r
+
+}
+define  <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,  ; 1-wide select on i64, driven by a 32-bit mask lane
+                                         <1 x i32> %mask) nounwind readnone alwaysinline {
+;  %newmask = zext <1 x i32> %mask to <1 x i64>
+;  %notmask = xor <1 x i64> %newmask, <i64 -1>
+;  %cleared_old = and <1 x i64> %0, %notmask
+;  %masked_new = and <1 x i64> %1, %newmask
+;  %new = or <1 x i64> %cleared_old, %masked_new
+;  ret <1 x i64> %new
+;   %cmp = icmp eq <1 x i32> %mask, <i32 0>
+;   %sel = select <1 x i1> %cmp, <1 x i64> %0, <1 x i64> %1
+;   ret <1 x i64> %sel
+    %m = extractelement <1 x i32> %mask, i32 0
+    %cmp = icmp eq i32 %m, 0  ; true when the mask lane is zero
+    %d0 = extractelement <1 x i64> %0, i32 0
+    %d1 = extractelement <1 x i64> %1, i32 0
+    %sel = select i1 %cmp, i64 %d0, i64 %d1  ; mask zero -> %0, mask nonzero -> %1
+    %r = insertelement <1 x i64> undef, i64 %sel, i32 0
+   ret <1 x i64> %r
+
+}
+;; 1-wide select on float: yields the lane of %1 when the mask lane is nonzero, otherwise the lane of %0.
+define  <1 x float> @__vselect_float(<1 x float>, <1 x float>,
+                                             <1 x i32> %mask) nounwind readnone alwaysinline {
+;  %v0 = bitcast <1 x float> %0 to <1 x i32>
+;  %v1 = bitcast <1 x float> %1 to <1 x i32>
+;  %r = call <1 x i32> @__vselect_i32(<1 x i32> %v0, <1 x i32> %v1, <1 x i32> %mask)
+;  %rf = bitcast <1 x i32> %r to <1 x float>
+;  ret <1 x float> %rf
+;   %cmp = icmp eq <1 x i32> %mask, <i32 0>
+;   %sel = select <1 x i1> %cmp, <1 x float> %0, <1 x float> %1
+;   ret <1 x float> %sel
+    %m = extractelement <1 x i32> %mask, i32 0
+    %cmp = icmp eq i32 %m, 0  ; true when the mask lane is zero
+    %d0 = extractelement <1 x float> %0, i32 0
+    %d1 = extractelement <1 x float> %1, i32 0
+    %sel = select i1 %cmp, float %d0, float %d1  ; mask zero -> %0, mask nonzero -> %1
+    %r = insertelement <1 x float> undef, float %sel, i32 0
+   ret <1 x float> %r
+
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+;; Blend store, 1-wide i8: read the old value at %0, pick old or new by the mask, always write the result back.
+define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>, 
+                                     <1 x i32> %mask) nounwind alwaysinline {
+  %val = load <1 x i8> * %0, align 1  ; align 1: a one-byte element need not sit on a 4-byte boundary
+  %newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask) 
+  store <1 x i8> %newval, <1 x i8> * %0, align 1  ; stores the old value back when the mask lane is zero
+  ret void
+}
+define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>, 
+                                     <1 x i32> %mask) nounwind alwaysinline {  ; blend store, 1-wide i16
+  %val = load <1 x i16> * %0, align 2  ; align 2: a two-byte element need not sit on a 4-byte boundary
+  %newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask) 
+  store <1 x i16> %newval, <1 x i16> * %0, align 2  ; stores the old value back when the mask lane is zero
+  ret void
+}
+
+;; Blend store, 1-wide i32: read the old value at %0, pick old or new by the mask, always write the result back.
+define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>, 
+                                     <1 x i32> %mask) nounwind alwaysinline {
+  %val = load <1 x i32> * %0, align 4
+  %newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask) 
+  store <1 x i32> %newval, <1 x i32> * %0, align 4  ; stores the old value back when the mask lane is zero
+  ret void
+}
+;; Blend store, 1-wide i64: read the old value at %0, pick old or new by the mask, always write the result back.
+define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
+                                     <1 x i32> %mask) nounwind alwaysinline {
+  %val = load <1 x i64> * %0, align 4  ; only 4-byte alignment is promised for the 8-byte element
+  %newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask) 
+  store <1 x i64> %newval, <1 x i64> * %0, align 4  ; stores the old value back when the mask lane is zero
+  ret void
+}
+;; Returns the top bit (bit 31) of the single mask lane as 0 or 1.
+define  i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31  ; logical shift leaves only the former bit 31
+  ret i32 %v
+}
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding
+;;
+;; There are not any rounding instructions in SSE2, so we have to emulate
+;; the functionality with multiple instructions...
+
+; The code for __round_* is the result of compiling the following source
+; code.
+;
+; export float Round(float x) {
+;    unsigned int sign = signbits(x);
+;    unsigned int ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    x += 0x1.0p23f;
+;    x -= 0x1.0p23f;
+;    ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    return x;
+;}
+;; Round: clear the sign bit, add then subtract 2^23 (8388608.0), and put the original sign bit back.
+define  <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  %float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
+  %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>  ; isolate the sign bit (0x80000000)
+  %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i  ; clear the sign bit
+  %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
+  %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
+  %binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
+  %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
+  %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i  ; restore the saved sign bit
+  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
+  ret <1 x float> %int_to_float_bitcast.i.i.i
+}
+
+;; Similarly, for implementations of the __floor* functions below, we have the
+;; bitcode from compiling the following source code...
+
+;export float Floor(float x) {
+;    float y = Round(x);
+;    unsigned int cmp = y > x ? 0xffffffff : 0;
+;    float delta = -1.f;
+;    unsigned int idelta = intbits(delta);
+;    idelta &= cmp;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+;; Floor: round first, then add -1.0 when the rounded value is greater than the input.
+define  <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
+  %bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
+  %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>  ; all ones when rounded > input, else zero
+  %bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 -1082130432>  ; 0xBF800000, the bit pattern of -1.0f
+  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
+  %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
+  ret <1 x float> %binop.i
+}
+
+;; And here is the code we compiled to get the __ceil* functions below
+;
+;export uniform float Ceil(uniform float x) {
+;    uniform float y = Round(x);
+;    uniform int yltx = y < x ? 0xffffffff : 0;
+;    uniform float delta = 1.f;
+;    uniform int idelta = intbits(delta);
+;    idelta &= yltx;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+;; Ceil: round first, then add 1.0 when the rounded value is less than the input.
+define  <1 x float> @__ceil_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
+  %bincmp.i = fcmp olt <1 x float> %calltmp.i, %0
+  %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>  ; all ones when rounded < input, else zero
+  %bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 1065353216>  ; 0x3F800000, the bit pattern of 1.0f
+  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
+  %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
+  ret <1 x float> %binop.i
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+; expecting the math library to provide these three (used by the double rounding functions in this file)
+declare double @ceil (double) nounwind readnone
+declare double @floor (double) nounwind readnone
+declare double @round (double) nounwind readnone
+; LLVM math intrinsics; any uses are outside the part of the file shown here
+declare double    @llvm.sqrt.f64(double %Val)
+declare float     @llvm.sin.f32(float %Val)
+declare float     @llvm.cos.f32(float %Val)
+declare float     @llvm.sqrt.f32(float %Val)
+declare float     @llvm.exp.f32(float %Val)
+declare float     @llvm.log.f32(float %Val)
+declare float     @llvm.pow.f32(float %f, float %e)
+
+
+
+
+;; stuff that could be in builtins ...
+;; Helper macro taking a scalar type and a scalar function; it emits a complete 1-wide body, ret included.
+define(`unary1to1', `
+  %v_0 = extractelement <1 x $1> %0, i32 0
+  %r_0 = call $1 $2($1 %v_0)
+  %ret_0 = insertelement <1 x $1> undef, $1 %r_0, i32 0
+  ret <1 x $1> %ret_0
+')
+
+
+
+;; dummy 1 wide vector ops: with one lane there is nothing to transpose, so each input goes to its own output
+define  void
+@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
+        <1 x float> %v3, <1 x float> * noalias %out0, 
+        <1 x float> * noalias %out1, <1 x float> * noalias %out2, 
+        <1 x float> * noalias %out3) nounwind alwaysinline { 
+
+  store <1 x float> %v0, <1 x float > * %out0
+  store <1 x float> %v1, <1 x float > * %out1
+  store <1 x float> %v2, <1 x float > * %out2
+  store <1 x float> %v3, <1 x float > * %out3
+
+  ret void
+}
+;; Reverse direction of the four-stream shim; it simply forwards to the function defined just before it.
+define  void
+@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
+        <1 x float> %v3, <1 x float> * noalias %out0, 
+        <1 x float> * noalias %out1, <1 x float> * noalias %out2, 
+        <1 x float> * noalias %out3) nounwind alwaysinline { 
+  call void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, 
+    <1 x float> %v2, <1 x float> %v3, <1 x float> * %out0, 
+    <1 x float> * %out1, <1 x float> * %out2, <1 x float> * %out3)
+  ret void
+}
+;; Three-stream 1-wide shim: each input vector is stored unchanged through its matching output pointer.
+define  void
+@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
+         <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
+         <1 x float> * %out2) nounwind alwaysinline {
+  store <1 x float> %v0, <1 x float > * %out0
+  store <1 x float> %v1, <1 x float > * %out1
+  store <1 x float> %v2, <1 x float > * %out2
+
+  ret void
+}
+;; Reverse direction of the three-stream shim; it simply forwards to the function defined just before it.
+define  void
+@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1,
+         <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
+         <1 x float> * %out2) nounwind alwaysinline {
+  call void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
+         <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
+         <1 x float> * %out2)
+  ret void
+}
+
+
+;; end builtins
+
+;; Double rounding for the 1-wide target: the body comes from the unary helper macro applied to @round.
+define  <1 x double> @__round_varying_double(<1 x double>) nounwind readonly alwaysinline {
+  unary1to1(double, @round)
+}
+;; Double floor for the 1-wide target: the body comes from the unary helper macro applied to @floor.
+define  <1 x double> @__floor_varying_double(<1 x double>) nounwind readonly alwaysinline {
+  unary1to1(double, @floor)
+}
+
+;; Double ceil for the 1-wide target: the body comes from the unary helper macro applied to @ceil.
+define  <1 x double> @__ceil_varying_double(<1 x double>) nounwind readonly alwaysinline {
+  unary1to1(double, @ceil)
+}
+
+; To do vector integer min and max, we do the vector compare and then sign
+; extend the i1 vector result to an i32 mask.  The __vselect does the
+; rest...
+;; Signed minimum, 1-wide: the select yields %0 where %0 < %1 and %1 otherwise.
+define  <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  %c = icmp slt <1 x i32> %0, %1
+  %mask = sext <1 x i1> %c to <1 x i32>  ; all ones when the compare is true, zero otherwise
+  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
+  ret <1 x i32> %v
+}
+;; Signed minimum of two scalar i32 values.
+define  i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  %c = icmp slt i32 %0, %1
+  %r = select i1 %c, i32 %0, i32 %1
+  ret i32 %r
+}
+;; Signed maximum, 1-wide: the select yields %0 where %0 > %1 and %1 otherwise.
+define  <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  %c = icmp sgt <1 x i32> %0, %1
+  %mask = sext <1 x i1> %c to <1 x i32>  ; all ones when the compare is true, zero otherwise
+  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
+  ret <1 x i32> %v
+}
+;; Signed maximum of two scalar i32 values.
+define  i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  %c = icmp sgt i32 %0, %1
+  %r = select i1 %c, i32 %0, i32 %1
+  ret i32 %r
+}
+
+; The functions for unsigned ints are similar, just with unsigned
+; comparison functions...
+;; Unsigned minimum, 1-wide: the select yields %0 where %0 < %1 (unsigned compare) and %1 otherwise.
+define  <1 x i32> @__min_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  %c = icmp ult <1 x i32> %0, %1
+  %mask = sext <1 x i1> %c to <1 x i32>  ; all ones when the compare is true, zero otherwise
+  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
+  ret <1 x i32> %v
+}
+;; Unsigned minimum of two scalar i32 values.
+define  i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  %c = icmp ult i32 %0, %1
+  %r = select i1 %c, i32 %0, i32 %1
+  ret i32 %r
+}
+;; Unsigned maximum, 1-wide: the select yields %0 where %0 > %1 (unsigned compare) and %1 otherwise.
+define  <1 x i32> @__max_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  %c = icmp ugt <1 x i32> %0, %1
+  %mask = sext <1 x i1> %c to <1 x i32>  ; all ones when the compare is true, zero otherwise
+  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
+  ret <1 x i32> %v
+}
+;; Unsigned maximum of two scalar i32 values.
+define  i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  %c = icmp ugt i32 %0, %1
+  %r = select i1 %c, i32 %0, i32 %1
+  ret i32 %r
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+;; Population count of a 32-bit value: a thin wrapper over the LLVM ctpop intrinsic.
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+
+define  i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+  %call = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %call
+}
+;; Population count of a 64-bit value: a thin wrapper over the LLVM ctpop intrinsic.
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define  i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  %call = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %call
+}
+
+;; Sum across lanes, float: with a single lane the result is simply lane 0.
+define  float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
+  %r = extractelement <1 x float> %v, i32 0
+  ret float %r
+}
+;; Minimum across lanes, float: with a single lane the result is simply lane 0.
+define  float @__reduce_min_float(<1 x float>) nounwind readnone {
+  %r = extractelement <1 x float> %0, i32 0
+  ret float %r
+}
+;; Maximum across lanes, float: with a single lane the result is simply lane 0.
+define  float @__reduce_max_float(<1 x float>) nounwind readnone {
+  %r = extractelement <1 x float> %0, i32 0
+  ret float %r
+}
+;; Sum across lanes, i32: with a single lane the result is simply lane 0.
+define  i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone {
+  %r = extractelement <1 x i32> %v, i32 0
+  ret i32 %r
+}
+;; Minimum across lanes, signed i32: with a single lane the result is simply lane 0.
+define  i32 @__reduce_min_int32(<1 x i32>) nounwind readnone {
+  %r = extractelement <1 x i32> %0, i32 0
+  ret i32 %r
+}
+;; Maximum across lanes, signed i32: with a single lane the result is simply lane 0.
+define  i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
+  %r = extractelement <1 x i32> %0, i32 0
+  ret i32 %r
+}
+;; Sum across lanes, unsigned i32: delegates to the signed version, which returns lane 0.
+define  i32 @__reduce_add_uint32(<1 x i32> %v) nounwind readnone {
+  %r = call i32 @__reduce_add_int32(<1 x i32> %v)
+  ret i32 %r
+}
+;; Minimum across lanes, unsigned i32: with a single lane the result is simply lane 0.
+define  i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
+  %r = extractelement <1 x i32> %0, i32 0
+  ret i32 %r
+}
+;; Maximum across lanes, unsigned i32: with a single lane the result is simply lane 0.
+define  i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone {
+  %r = extractelement <1 x i32> %0, i32 0
+  ret i32 %r
+ }
+
+;; Sum across lanes, double: with a single lane the result is simply lane 0.
+define  double @__reduce_add_double(<1 x double>) nounwind readnone {
+  %m = extractelement <1 x double> %0, i32 0
+  ret double %m
+}
+;; Minimum across lanes, double: with a single lane the result is simply lane 0.
+define  double @__reduce_min_double(<1 x double>) nounwind readnone {
+  %m = extractelement <1 x double> %0, i32 0
+  ret double %m
+}
+;; Maximum across lanes, double: with a single lane the result is simply lane 0.
+define  double @__reduce_max_double(<1 x double>) nounwind readnone {
+  %m = extractelement <1 x double> %0, i32 0
+  ret double %m
+}
+;; Sum across lanes, i64: with a single lane the result is simply lane 0.
+define  i64 @__reduce_add_int64(<1 x i64>) nounwind readnone {
+  %m = extractelement <1 x i64> %0, i32 0
+  ret i64 %m
+}
+;; Minimum across lanes, signed i64: with a single lane the result is simply lane 0.
+define  i64 @__reduce_min_int64(<1 x i64>) nounwind readnone {
+  %m = extractelement <1 x i64> %0, i32 0
+  ret i64 %m
+}
+;; Maximum across lanes, signed i64: with a single lane the result is simply lane 0.
+define  i64 @__reduce_max_int64(<1 x i64>) nounwind readnone {
+  %m = extractelement <1 x i64> %0, i32 0
+  ret i64 %m
+}
+;; Minimum across lanes, unsigned i64: with a single lane the result is simply lane 0.
+define  i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone {
+  %m = extractelement <1 x i64> %0, i32 0
+  ret i64 %m
+}
+;; Maximum across lanes, unsigned i64: with a single lane the result is simply lane 0.
+define  i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone {
+  %m = extractelement <1 x i64> %0, i32 0
+  ret i64 %m
+}
+;; Lane-equality test, i32: stores lane 0 to %samevalue and returns true. NOTE(review): %mask is never read.
+define  i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  %v=extractelement <1 x i32> %vv, i32 0
+  store i32 %v, i32 * %samevalue
+  ret i1 true
+
+}
+
+define  i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  %v=extractelement <1 x float> %vv, i32 0
+  store float %v, float * %samevalue
+  ret i1 true
+
+}
+
+define  i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  %v=extractelement <1 x i64> %vv, i32 0
+  store i64 %v, i64 * %samevalue
+  ret i1 true
+
+}
+
+define  i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  %v=extractelement <1 x double> %vv, i32 0
+  store double %v, double * %samevalue
+  ret i1 true
+
+}
+
+; extracting/reinserting elements because I want to be able to remove vectors later on
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+;; Reciprocal of a 1-wide float vector.  Uses an exact scalar divide
+;; (1.0 / x) on lane 0; no hardware approximation or Newton-Raphson
+;; refinement step is involved.
+define <1 x float> @__rcp_varying_float(<1 x float> %vec) nounwind readonly alwaysinline {
+  %x = extractelement <1 x float> %vec, i32 0
+  %recip = fdiv float 1., %x
+  %out = insertelement <1 x float> undef, float %recip, i32 0
+  ret <1 x float> %out
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; sqrt
+
+;; Square root of a 1-wide float vector: unpack lane 0, call the scalar
+;; @llvm.sqrt.f32 intrinsic, and repack the result.
+define <1 x float> @__sqrt_varying_float(<1 x float> %vec) nounwind readonly alwaysinline {
+  %x = extractelement <1 x float> %vec, i32 0
+  %root = call float @llvm.sqrt.f32(float %x)
+  %out = insertelement <1 x float> undef, float %root, i32 0
+  ret <1 x float> %out
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; rsqrt
+
+;; Reciprocal square root of a 1-wide float vector, composed from the two
+;; routines above it: rcp(sqrt(v)).  No approximation instruction or
+;; refinement iteration is used.
+define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {
+  %root = call <1 x float> @__sqrt_varying_float(<1 x float> %v)
+  %inv = call <1 x float> @__rcp_varying_float(<1 x float> %root)
+  ret <1 x float> %inv
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; svml stuff
+
+;; Sine of a 1-wide float vector.  The live body is generated by the
+;; helper macro on the last line, instantiated for float with the
+;; @llvm.sin.f32 intrinsic; everything else in the body is commented out.
+;; NOTE(review): the macro is defined outside this excerpt and presumably
+;; refers to the argument as the unnamed value %0 -- confirm before naming
+;; the parameter.
+define  <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm.sin.f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float,@llvm.sin.f32)
+   
+}
+
+;; Cosine of a 1-wide float vector.  The live body is generated by the
+;; helper macro on the last line, instantiated for float with the
+;; @llvm.cos.f32 intrinsic; everything else in the body is commented out.
+;; NOTE(review): the macro is defined outside this excerpt and presumably
+;; refers to the argument as the unnamed value %0 -- confirm before naming
+;; the parameter.
+define  <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm.cos.f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float, @llvm.cos.f32)
+
+}
+
+;; Computes sine and cosine of the 1-wide input via @__svml_sin and
+;; @__svml_cos, then stores the sine through the second argument and the
+;; cosine through the third.  Returns nothing.
+;;
+;; This function writes memory through its pointer arguments, so it must
+;; NOT be marked readnone: that attribute promises the optimizer that no
+;; memory is accessed, which permits it to drop the stores or delete calls
+;; to this void function altogether.
+define  void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind alwaysinline {
+   %sin = call <1 x float> @__svml_sin (<1 x float> %0)
+   %cos = call <1 x float> @__svml_cos (<1 x float> %0)
+   ; %1 receives the sine, %2 receives the cosine
+   store <1 x float> %sin, <1 x float> * %1
+   store <1 x float> %cos, <1 x float> * %2
+   ret void
+}
+
+;; Placeholder for tangent.  No implementation is wired up: the input
+;; vector is returned unchanged, so callers do NOT get tan(x).
+;; NOTE(review): the helper name in the commented-out line below looks
+;; deliberately misspelled, presumably so that m4 does not expand it
+;; inside the comment -- confirm before "correcting" it.
+define  <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm_tan_f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  ;unasry1to1(float, @llvm.tan.f32)
+  ; UNSUPPORTED!
+  ret <1 x float > %0
+}
+
+;; Placeholder for arctangent.  No implementation is wired up: the input
+;; vector is returned unchanged, so callers do NOT get atan(x).
+;; NOTE(review): the helper name in the commented-out line below looks
+;; deliberately misspelled, presumably so that m4 does not expand it
+;; inside the comment -- confirm before "correcting" it.
+define  <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
+;  %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
+;  ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm_atan_f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  ;unsary1to1(float,@llvm.atan.f32)
+  ;UNSUPPORTED!
+  ret <1 x float > %0
+
+}
+
+;; Placeholder for two-argument arctangent.  No implementation is wired
+;; up: the FIRST argument is returned unchanged and the second argument is
+;; ignored, so callers do NOT get atan2(y, x).
+define  <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
+  ;ret <1 x float> %ret
+  ;%y = extractelement <1 x float> %0, i32 0
+  ;%x = extractelement <1 x float> %1, i32 0
+  ;%q = fdiv float %y, %x
+  ;%a = call float @llvm.atan.f32 (float %q)
+  ;%rv = insertelement <1 x float> undef, float %a, i32 0
+  ;ret <1 x float> %rv
+  ; UNSUPPORTED!
+  ret <1 x float > %0
+}
+
+;; Exponential of a 1-wide float vector.  The live body is generated by
+;; the helper macro on the last line, instantiated for float with the
+;; @llvm.exp.f32 intrinsic.
+;; NOTE(review): the macro is defined outside this excerpt and presumably
+;; refers to the argument as the unnamed value %0 -- confirm before naming
+;; the parameter.
+define  <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  unary1to1(float, @llvm.exp.f32)
+}
+
+;; Logarithm of a 1-wide float vector, generated the same way as the
+;; exponential above it but with the @llvm.log.f32 intrinsic.
+define  <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  unary1to1(float, @llvm.log.f32)
+}
+
+;; Power function on 1-wide float vectors: unpacks lane 0 of the base and
+;; of the exponent, calls the scalar @llvm.pow.f32 intrinsic, and repacks
+;; the result.
+define <1 x float> @__svml_pow(<1 x float> %basevec, <1 x float> %expvec) nounwind readnone alwaysinline {
+  %b = extractelement <1 x float> %basevec, i32 0
+  %e = extractelement <1 x float> %expvec, i32 0
+  %p = call float @llvm.pow.f32(float %b, float %e)
+  %out = insertelement <1 x float> undef, float %p, i32 0
+  ret <1 x float> %out
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+;; Max of two 1-wide float vectors via an ordered greater-than compare
+;; and a select on lane 0.  When the compare is false (including when
+;; either operand is NaN) the second operand is returned.
+define <1 x float> @__max_varying_float(<1 x float> %lhs, <1 x float> %rhs) nounwind readonly alwaysinline {
+  %a = extractelement <1 x float> %lhs, i32 0
+  %b = extractelement <1 x float> %rhs, i32 0
+  %a_gt_b = fcmp ogt float %a, %b
+  %pick = select i1 %a_gt_b, float %a, float %b
+  %out = insertelement <1 x float> undef, float %pick, i32 0
+  ret <1 x float> %out
+}
+
+;; Min of two 1-wide float vectors via an ordered less-than compare and a
+;; select on lane 0.  When the compare is false (including when either
+;; operand is NaN) the second operand is returned.
+define <1 x float> @__min_varying_float(<1 x float> %lhs, <1 x float> %rhs) nounwind readonly alwaysinline {
+  %a = extractelement <1 x float> %lhs, i32 0
+  %b = extractelement <1 x float> %rhs, i32 0
+  %a_lt_b = fcmp olt float %a, %b
+  %pick = select i1 %a_lt_b, float %a, float %b
+  %out = insertelement <1 x float> undef, float %pick, i32 0
+  ret <1 x float> %out
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+;declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+;; Square root of a 1-wide double vector.  The live body is generated by
+;; the helper macro on the last line, instantiated for double with the
+;; @llvm.sqrt.f64 intrinsic.
+;; NOTE(review): the macro is defined outside this excerpt and presumably
+;; refers to the argument as the unnamed value %0 -- confirm before naming
+;; the parameter.
+define  <1 x double> @__sqrt_varying_double(<1 x double>) nounwind alwaysinline {
+  ;unarya2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ;ret <1 x double> %ret
+  unary1to1(double, @llvm.sqrt.f64)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+;declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+;declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+;; Min of two 1-wide double vectors: ordered less-than compare plus select
+;; on lane 0.  A false compare (including any NaN operand) yields the
+;; second operand.
+define <1 x double> @__min_varying_double(<1 x double> %lhs, <1 x double> %rhs) nounwind readnone {
+  %a = extractelement <1 x double> %lhs, i32 0
+  %b = extractelement <1 x double> %rhs, i32 0
+  %a_lt_b = fcmp olt double %a, %b
+  %pick = select i1 %a_lt_b, double %a, double %b
+  %out = insertelement <1 x double> undef, double %pick, i32 0
+  ret <1 x double> %out
+}
+
+;; Max of two 1-wide double vectors: ordered greater-than compare plus
+;; select on lane 0.  A false compare (including any NaN operand) yields
+;; the second operand.
+define <1 x double> @__max_varying_double(<1 x double> %lhs, <1 x double> %rhs) nounwind readnone {
+  %a = extractelement <1 x double> %lhs, i32 0
+  %b = extractelement <1 x double> %rhs, i32 0
+  %a_gt_b = fcmp ogt double %a, %b
+  %pick = select i1 %a_gt_b, double %a, double %b
+  %out = insertelement <1 x double> undef, double %pick, i32 0
+  ret <1 x double> %out
+}
+
+
+;; Scalar reciprocal: a plain divide, 1.0 / x, with no approximation or
+;; refinement step.
+define float @__rcp_uniform_float(float %x) nounwind readonly alwaysinline {
+  %recip = fdiv float 1., %x
+  ret float %recip
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+;; Scalar float round / floor / ceil.  Each helper wraps the scalar in a
+;; 1-wide vector, defers to the matching varying routine, and returns
+;; lane 0 of that routine's result.
+define float @__round_uniform_float(float %x) nounwind readonly alwaysinline {
+  %boxed = insertelement <1 x float> undef, float %x, i32 0
+  %rounded = call <1 x float> @__round_varying_float(<1 x float> %boxed)
+  %lane0 = extractelement <1 x float> %rounded, i32 0
+  ret float %lane0
+}
+
+define float @__floor_uniform_float(float %x) nounwind readonly alwaysinline {
+  %boxed = insertelement <1 x float> undef, float %x, i32 0
+  %floored = call <1 x float> @__floor_varying_float(<1 x float> %boxed)
+  %lane0 = extractelement <1 x float> %floored, i32 0
+  ret float %lane0
+}
+
+define float @__ceil_uniform_float(float %x) nounwind readonly alwaysinline {
+  %boxed = insertelement <1 x float> undef, float %x, i32 0
+  %ceiled = call <1 x float> @__ceil_varying_float(<1 x float> %boxed)
+  %lane0 = extractelement <1 x float> %ceiled, i32 0
+  ret float %lane0
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+
+;; Scalar double round / floor / ceil: thin forwarders to the external
+;; @round, @floor and @ceil functions (declared outside this excerpt).
+define double @__round_uniform_double(double %x) nounwind readonly alwaysinline {
+  %rounded = call double @round(double %x)
+  ret double %rounded
+}
+
+define double @__floor_uniform_double(double %x) nounwind readonly alwaysinline {
+  %floored = call double @floor(double %x)
+  ret double %floored
+}
+
+define double @__ceil_uniform_double(double %x) nounwind readonly alwaysinline {
+  %ceiled = call double @ceil(double %x)
+  ret double %ceiled
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+
+;; Scalar square roots: direct calls to the LLVM sqrt intrinsics for
+;; float and double.
+define float @__sqrt_uniform_float(float %x) nounwind readonly alwaysinline {
+  %root = call float @llvm.sqrt.f32(float %x)
+  ret float %root
+}
+
+define double @__sqrt_uniform_double(double %x) nounwind readonly alwaysinline {
+  %root = call double @llvm.sqrt.f64(double %x)
+  ret double %root
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+
+;; Scalar reciprocal square root, composed as rcp(sqrt(x)) from the two
+;; scalar helpers.
+define float @__rsqrt_uniform_float(float %x) nounwind readonly alwaysinline {
+  %root = call float @__sqrt_uniform_float(float %x)
+  %inv = call float @__rcp_uniform_float(float %root)
+  ret float %inv
+}
+
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fastmath
+
+
+;; Deliberately empty: calling this has no effect here.
+define void @__fastmath() nounwind alwaysinline {
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+
+;; Scalar min/max for float and double, each an ordered compare followed
+;; by a select.  When the compare is false (including when either operand
+;; is NaN) the second operand is returned.
+define float @__max_uniform_float(float %a, float %b) nounwind readonly alwaysinline {
+  %a_gt_b = fcmp ogt float %a, %b
+  %pick = select i1 %a_gt_b, float %a, float %b
+  ret float %pick
+}
+
+define float @__min_uniform_float(float %a, float %b) nounwind readonly alwaysinline {
+  %a_lt_b = fcmp olt float %a, %b
+  %pick = select i1 %a_lt_b, float %a, float %b
+  ret float %pick
+}
+
+define double @__max_uniform_double(double %a, double %b) nounwind readonly alwaysinline {
+  %a_gt_b = fcmp ogt double %a, %b
+  %pick = select i1 %a_gt_b, double %a, double %b
+  ret double %pick
+}
+
+define double @__min_uniform_double(double %a, double %b) nounwind readonly alwaysinline {
+  %a_lt_b = fcmp olt double %a, %b
+  %pick = select i1 %a_lt_b, double %a, double %b
+  ret double %pick
+}
+
+;; The three lines below are m4 macro invocations that expand to further
+;; builtin code.  Their definitions are outside this excerpt.
+;; NOTE(review): judging by the names alone they presumably emit the
+;; lane-permutation helpers, the leading/trailing zero counters and the
+;; prefetch helpers -- confirm against the macro definitions.
+define_shuffles()
+
+ctlztz()
+
+define_prefetches()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+;; Declarations only: conversions between 16-bit half values (carried as
+;; i16) and float, in scalar and vector forms.  No bodies are given here.
+;; NOTE(review): the vector width comes from a macro whose definition for
+;; this file is not visible in this excerpt -- confirm it is set.
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
--- a/builtins/target-generic-16.ll
+++ b/builtins/target-generic-16.ll
@@ -0,0 +1,34 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; 16-wide instantiation of the generic target: set the vector width to
+;; sixteen, then pull in the shared, width-independent body.
+define(`WIDTH',`16')
+include(`target-generic-common.ll')
+
--- a/builtins/target-generic-4.ll
+++ b/builtins/target-generic-4.ll
@@ -0,0 +1,34 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; 4-wide instantiation of the generic target: set the vector width to
+;; four, then pull in the shared, width-independent body.
+define(`WIDTH',`4')
+include(`target-generic-common.ll')
+
--- a/builtins/target-generic-8.ll
+++ b/builtins/target-generic-8.ll
@@ -0,0 +1,34 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; 8-wide instantiation of the generic target: set the vector width to
+;; eight, then pull in the shared, width-independent body.
+define(`WIDTH',`8')
+include(`target-generic-common.ll')
+
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -0,0 +1,336 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; Shared body for the generic targets.  The including file is expected to
+;; have chosen the vector width already.
+;; The mask macro is set to a one-bit integer type; the declarations in
+;; this file that take a mask use vectors of i1.
+define(`MASK',`i1')
+;; Bring in the project macro library used by the invocations below.
+include(`util.m4')
+
+;; Expand common builtin code from that library.  The last invocation is
+;; parameterised on the vector width.  (The macro bodies are not visible
+;; in this excerpt.)
+stdlib_core()
+scans()
+reduce_equal(WIDTH)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; broadcast/rotate/shuffle
+
+;; Declarations only -- none of these functions has a body in this file.
+;; NOTE(review): presumably the definitions are supplied outside the LLVM
+;; builtins for the generic targets; TODO confirm where they live.
+;;
+;; smear:     scalar in, vector out.
+;; broadcast: vector plus an i32 in, vector out.
+;; rotate:    vector plus an i32 in, vector out.
+;; shuffle:   vector plus a vector of i32 in, vector out.
+;; shuffle2:  two vectors plus a vector of i32 in, vector out.
+declare <WIDTH x float> @__smear_float(float) nounwind readnone
+declare <WIDTH x double> @__smear_double(double) nounwind readnone
+declare <WIDTH x i8> @__smear_i8(i8) nounwind readnone
+declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
+declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
+declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
+
+declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
+declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
+declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
+declare <WIDTH x i16> @__broadcast_i16(<WIDTH x i16>, i32) nounwind readnone
+declare <WIDTH x i32> @__broadcast_i32(<WIDTH x i32>, i32) nounwind readnone
+declare <WIDTH x i64> @__broadcast_i64(<WIDTH x i64>, i32) nounwind readnone
+
+declare <WIDTH x i8> @__rotate_i8(<WIDTH x i8>, i32) nounwind readnone
+declare <WIDTH x i16> @__rotate_i16(<WIDTH x i16>, i32) nounwind readnone
+declare <WIDTH x float> @__rotate_float(<WIDTH x float>, i32) nounwind readnone
+declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
+declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
+declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone
+
+declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
+                                    <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i16> @__shuffle_i16(<WIDTH x i16>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i16> @__shuffle2_i16(<WIDTH x i16>, <WIDTH x i16>,
+                                      <WIDTH x i32>) nounwind readnone
+declare <WIDTH x float> @__shuffle_float(<WIDTH x float>,
+                                         <WIDTH x i32>) nounwind readnone
+declare <WIDTH x float> @__shuffle2_float(<WIDTH x float>, <WIDTH x float>,
+                                          <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i32> @__shuffle_i32(<WIDTH x i32>,
+                                     <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i32> @__shuffle2_i32(<WIDTH x i32>, <WIDTH x i32>,
+                                      <WIDTH x i32>) nounwind readnone
+declare <WIDTH x double> @__shuffle_double(<WIDTH x double>,
+                                           <WIDTH x i32>) nounwind readnone
+declare <WIDTH x double> @__shuffle2_double(<WIDTH x double>,
+                                            <WIDTH x double>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i64> @__shuffle_i64(<WIDTH x i64>,
+                                     <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i64> @__shuffle2_i64(<WIDTH x i64>, <WIDTH x i64>,
+                                      <WIDTH x i32>) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aos/soa
+
+;; Declarations only: float conversions between three or four separate
+;; vectors and a single float pointer %p.
+;; soa_to_aos takes the vectors by value plus the pointer; aos_to_soa
+;; takes the pointer plus one output vector pointer per component.
+;; Note that the 3-component aos_to_soa outputs are not marked noalias
+;; while the 4-component ones are.
+declare void @__soa_to_aos3_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
+                                  <WIDTH x float> %v2, float * noalias %p) nounwind
+declare void @__aos_to_soa3_float(float * noalias %p, <WIDTH x float> * %out0,
+                                  <WIDTH x float> * %out1, <WIDTH x float> * %out2) nounwind
+declare void @__soa_to_aos4_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
+                                  <WIDTH x float> %v2, <WIDTH x float> %v3,
+                                  float * noalias %p) nounwind
+declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias %out0,
+                                  <WIDTH x float> * noalias %out1,
+                                  <WIDTH x float> * noalias %out2,
+                                  <WIDTH x float> * noalias %out3) nounwind
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+;; Declarations only: conversions between 16-bit half values (carried as
+;; i16) and float, in scalar and vector forms.  No bodies are given here.
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; math
+
+;; Everything in this section is a declaration without a body.  Scalar
+;; (uniform) and vector (varying) forms are declared side by side, and
+;; all except @__fastmath are marked readnone.
+declare void @__fastmath() nounwind 
+
+;; round/floor/ceil
+
+declare float @__round_uniform_float(float) nounwind readnone 
+declare float @__floor_uniform_float(float) nounwind readnone 
+declare float @__ceil_uniform_float(float) nounwind readnone 
+
+declare double @__round_uniform_double(double) nounwind readnone 
+declare double @__floor_uniform_double(double) nounwind readnone 
+declare double @__ceil_uniform_double(double) nounwind readnone 
+
+declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone 
+declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone 
+declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone 
+declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone 
+declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone 
+declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone 
+
+;; min/max
+;; Signed and unsigned integer variants share the same LLVM types (i32 /
+;; i64) and are distinguished only by name.
+
+declare float @__max_uniform_float(float, float) nounwind readnone 
+declare float @__min_uniform_float(float, float) nounwind readnone 
+declare i32 @__min_uniform_int32(i32, i32) nounwind readnone 
+declare i32 @__max_uniform_int32(i32, i32) nounwind readnone 
+declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone 
+declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone 
+declare i64 @__min_uniform_int64(i64, i64) nounwind readnone 
+declare i64 @__max_uniform_int64(i64, i64) nounwind readnone 
+declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone 
+declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone 
+declare double @__min_uniform_double(double, double) nounwind readnone 
+declare double @__max_uniform_double(double, double) nounwind readnone 
+
+declare <WIDTH x float> @__max_varying_float(<WIDTH x float>,
+                                             <WIDTH x float>) nounwind readnone 
+declare <WIDTH x float> @__min_varying_float(<WIDTH x float>,
+                                             <WIDTH x float>) nounwind readnone 
+declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone 
+declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone 
+declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone 
+declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone 
+declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone 
+declare <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone 
+declare <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone 
+declare <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone 
+declare <WIDTH x double> @__min_varying_double(<WIDTH x double>,
+                                               <WIDTH x double>) nounwind readnone
+declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
+                                               <WIDTH x double>) nounwind readnone 
+
+;; sqrt/rsqrt/rcp
+;; rsqrt and rcp are declared for float only; sqrt also has double forms.
+
+declare float @__rsqrt_uniform_float(float) nounwind readnone 
+declare float @__rcp_uniform_float(float) nounwind readnone 
+declare float @__sqrt_uniform_float(float) nounwind readnone 
+declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone 
+declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone 
+declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone 
+
+declare double @__sqrt_uniform_double(double) nounwind readnone
+declare <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone
+
+;; bit ops
+;; Scalar-only: population count and leading/trailing zero counts for
+;; i32 and i64.
+
+declare i32 @__popcnt_int32(i32) nounwind readnone
+declare i64 @__popcnt_int64(i64) nounwind readnone 
+
+declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone
+declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
+declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
+declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
+
+;; svml
+
+; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+; or, use the macro to call the 4-wide ones twice with our 8-wide
+; vectors...
+; NOTE(review): this file is shared by several vector widths, so the
+; "8-wide" wording above looks inherited from a fixed-width target --
+; confirm what is actually intended here.
+; These are declarations only and carry no function attributes.
+
+declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
+declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
+declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
+declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
+declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
+declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
+declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
+declare <WIDTH x float> @__svml_log(<WIDTH x float>)
+declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reductions
+
+;; Declarations only.  @__movmsk takes a vector of i1 and returns an i32.
+;; Each reduction below takes one vector and returns one scalar of the
+;; element type; add, min and max are declared for float, double, and
+;; signed/unsigned 32- and 64-bit integers.
+declare i32 @__movmsk(<WIDTH x i1>) nounwind readnone 
+
+declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
+declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone 
+declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone 
+
+declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone 
+declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone 
+declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone 
+
+declare i32 @__reduce_add_uint32(<WIDTH x i32>) nounwind readnone 
+declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone 
+declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone 
+
+declare double @__reduce_add_double(<WIDTH x double>) nounwind readnone 
+declare double @__reduce_min_double(<WIDTH x double>) nounwind readnone 
+declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone 
+
+declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone 
+declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone 
+declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone 
+
+declare i64 @__reduce_add_uint64(<WIDTH x i64>) nounwind readnone 
+declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone 
+declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone 
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+;; Macro invocations, one per element size (8/16/32/64 bits); the macro
+;; body is defined outside this excerpt.
+load_and_broadcast(WIDTH, i8, 8)
+load_and_broadcast(WIDTH, i16, 16)
+load_and_broadcast(WIDTH, i32, 32)
+load_and_broadcast(WIDTH, i64, 64)
+
+;; Masked loads, declarations only.  Every element size takes an untyped
+;; i8 pointer plus a vector-of-i1 mask and returns a vector.
+declare <WIDTH x i8> @__masked_load_8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i16> @__masked_load_16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
+
+;; Masked stores, declarations only.  Unlike the loads, these take a
+;; typed vector pointer, followed by the value vector and the mask.
+declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
+                               <WIDTH x i1>) nounwind 
+declare void @__masked_store_16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
+                                <WIDTH x i1>) nounwind 
+declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
+                                <WIDTH x i1>) nounwind 
+declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
+                                <WIDTH x i1> %mask) nounwind 
+
+ifelse(LLVM_VERSION, `LLVM_3_1svn',`
+define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
+                                     <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x i8> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
+  store <WIDTH x i8> %v1, <WIDTH x i8> * %0
+  ret void
+}
+
+define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
+                                     <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x i16> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
+  store <WIDTH x i16> %v1, <WIDTH x i16> * %0
+  ret void
+}
+
+define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
+                                     <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x i32> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
+  store <WIDTH x i32> %v1, <WIDTH x i32> * %0
+  ret void
+}
+
+define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
+                            <WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
+  %v = load <WIDTH x i64> * %0
+  %v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
+  store <WIDTH x i64> %v1, <WIDTH x i64> * %0
+  ret void
+}
+',`
+declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
+                                     <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
+                                      <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
+                                      <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
+                                      <WIDTH x i1> %mask) nounwind 
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+define(`gather_scatter', `
+declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
+                        i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly 
+declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
+                        i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly 
+declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>, 
+                                    <WIDTH x i1>) nounwind readonly 
+declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>, 
+                                    <WIDTH x i1>) nounwind readonly 
+
+declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
+                  i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind 
+declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
+                  i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind 
+declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
+                             <WIDTH x i1>) nounwind 
+declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
+                              <WIDTH x i1>) nounwind 
+')
+
+gather_scatter(i8)
+gather_scatter(i16)
+gather_scatter(i32)
+gather_scatter(i64)
+
+declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
+                                  <WIDTH x i1>) nounwind
+declare i32 @__packed_store_active(i32 * nocapture, <WIDTH x i32> %vals,
+                                   <WIDTH x i1>) nounwind
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; prefetch
+
+declare void @__prefetch_read_uniform_1(i8 * nocapture) nounwind 
+declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind 
+declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind 
+declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind 
+
--- a/builtins/target-sse2-common.ll
+++ b/builtins/target-sse2-common.ll
@@ -29,6 +29,11 @@
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  

+ctlztz()
+define_prefetches()
+define_shuffles()
+aossoa()
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -36,12 +36,24 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; standard 8-wide definitions from m4 macros

-stdlib_core(8)
-packed_load_and_store(8)
-scans(8)
-int64minmax(8)
+define(`WIDTH',`8')
+define(`MASK',`i32')
+include(`util.m4')

-include(`builtins-sse2-common.ll')
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+include(`target-sse2-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -301,7 +313,7 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
 }

 define <4 x float> @__vec4_add_float(<4 x float> %v0,
-                                            <4 x float> %v1) nounwind readnone alwaysinline {
+                                     <4 x float> %v1) nounwind readnone alwaysinline {
  %v = fadd <4 x float> %v0, %v1
  ret <4 x float> %v
 }
@@ -325,7 +337,7 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {

 ; helper function for reduce_add_int32
 define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
-                                            <4 x i32> %v1) nounwind readnone alwaysinline {
+                                   <4 x i32> %v1) nounwind readnone alwaysinline {
  %v = add <4 x i32> %v0, %v1
  ret <4 x i32> %v
 }
@@ -425,10 +437,10 @@ load_and_broadcast(8, i16, 16)
 load_and_broadcast(8, i32, 32)
 load_and_broadcast(8, i64, 64)

-load_masked(8, i8,  8,  1)
-load_masked(8, i16, 16, 2)
-load_masked(8, i32, 32, 4)
-load_masked(8, i64, 64, 8)
+masked_load(8, i8,  8,  1)
+masked_load(8, i16, 16, 2)
+masked_load(8, i32, 32, 4)
+masked_load(8, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -33,12 +33,24 @@
 ;; Define the standard library builtins for the SSE2 target

 ; Define some basics for a 4-wide target
-stdlib_core(4)
-packed_load_and_store(4)
-scans(4)
-int64minmax(4)
+define(`WIDTH',`4')
+define(`MASK',`i32')
+include(`util.m4')

-include(`builtins-sse2-common.ll')
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+include(`target-sse2-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
@@ -144,7 +156,7 @@ define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alway
 ; from %1, and otherwise return the value from %0.

 define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
-                                         <4 x i32> %mask) nounwind readnone alwaysinline {
+                                <4 x i32> %mask) nounwind readnone alwaysinline {
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %cleared_old = and <4 x i32> %0, %notmask
  %masked_new = and <4 x i32> %1, %mask
@@ -153,7 +165,7 @@ define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
 }

 define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
-                                             <4 x i32> %mask) nounwind readnone alwaysinline {
+                                    <4 x i32> %mask) nounwind readnone alwaysinline {
  %v0 = bitcast <4 x float> %0 to <4 x i32>
  %v1 = bitcast <4 x float> %1 to <4 x i32>
  %r = call <4 x i32> @__vselect_i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %mask)
@@ -552,10 +564,10 @@ load_and_broadcast(4, i16, 16)
 load_and_broadcast(4, i32, 32)
 load_and_broadcast(4, i64, 64)

-load_masked(4, i8,  8,  1)
-load_masked(4, i16, 16, 2)
-load_masked(4, i32, 32, 4)
-load_masked(4, i64, 64, 8)
+masked_load(4, i8,  8,  1)
+masked_load(4, i16, 16, 2)
+masked_load(4, i32, 32, 4)
+masked_load(4, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
--- a/builtins/target-sse4-common.ll
+++ b/builtins/target-sse4-common.ll
@@ -29,6 +29,11 @@
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  

+ctlztz()
+define_prefetches()
+define_shuffles()
+aossoa()
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats

--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -36,12 +36,24 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; standard 8-wide definitions from m4 macros

-stdlib_core(8)
-packed_load_and_store(8)
-scans(8)
-int64minmax(8)
+define(`WIDTH',`8')
+define(`MASK',`i32')
+include(`util.m4')

-include(`builtins-sse4-common.ll')
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+include(`target-sse4-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -252,7 +264,7 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {

 ; helper function for reduce_add_int32
 define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
-                                            <4 x i32> %v1) nounwind readnone alwaysinline {
+                                   <4 x i32> %v1) nounwind readnone alwaysinline {
  %v = add <4 x i32> %v0, %v1
  ret <4 x i32> %v
 }
@@ -352,10 +364,10 @@ load_and_broadcast(8, i16, 16)
 load_and_broadcast(8, i32, 32)
 load_and_broadcast(8, i64, 64)

-load_masked(8, i8,  8,  1)
-load_masked(8, i16, 16, 2)
-load_masked(8, i32, 32, 4)
-load_masked(8, i64, 64, 8)
+masked_load(8, i8,  8,  1)
+masked_load(8, i16, 16, 2)
+masked_load(8, i32, 32, 4)
+masked_load(8, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -33,12 +33,24 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ; Define common 4-wide stuff
-stdlib_core(4)
-packed_load_and_store(4)
-scans(4)
-int64minmax(4)
+define(`WIDTH',`4')
+define(`MASK',`i32')
+include(`util.m4')

-include(`builtins-sse4-common.ll')
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+include(`target-sse4-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -451,10 +463,10 @@ load_and_broadcast(4, i16, 16)
 load_and_broadcast(4, i32, 32)
 load_and_broadcast(4, i64, 64)

-load_masked(4, i8,  8,  1)
-load_masked(4, i16, 16, 2)
-load_masked(4, i32, 32, 4)
-load_masked(4, i64, 64, 8)
+masked_load(4, i8,  8,  1)
+masked_load(4, i16, 16, 2)
+masked_load(4, i32, 32, 4)
+masked_load(4, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
--- a/builtins/util.m4
+++ b/builtins/util.m4
--- a/cbackend.cpp
+++ b/cbackend.cpp
--- a/ctx.cpp
+++ b/ctx.cpp
--- a/ctx.h
+++ b/ctx.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,7 @@
 #define ISPC_CTX_H 1

 #include "ispc.h"
+#include <map>
 #include <llvm/InstrTypes.h>
 #include <llvm/Instructions.h>
 #include <llvm/Analysis/DIBuilder.h>
@@ -98,9 +99,9 @@ public:
        the function entry mask and the internal mask. */ 
    llvm::Value *GetFullMask();

-    /** Provides the alloca'd pointer to memory to store the full function
-        mask.  This is only used to wire up the __mask builtin variable. */
-    void SetMaskPointer(llvm::Value *p);
+    /** Returns a pointer to storage in memory that stores the current full
+        mask. */
+    llvm::Value *GetFullMaskPointer();

    /** Provides the value of the mask at function entry */
    void SetFunctionMask(llvm::Value *val);
@@ -160,10 +161,8 @@ public:
    void EndLoop();

    /** Indicates that code generation for a 'foreach' or 'foreach_tiled'
-        loop is about to start.  The provided basic block pointer indicates
-        where control flow should go if a 'continue' statement is executed
-        in the loop. */
-    void StartForeach(llvm::BasicBlock *continueTarget);
+        loop is about to start. */
+    void StartForeach();
    void EndForeach();

    /** Emit code for a 'break' statement in a loop.  If doCoherenceCheck
@@ -186,12 +185,69 @@ public:
        previous iteration. */
    void RestoreContinuedLanes();

+    /** Indicates that code generation for a "switch" statement is about to
+        start.  isUniform indicates whether the "switch" value is uniform,
+        and bbAfterSwitch gives the basic block immediately following the
+        "switch" statement.  (For example, if the switch condition is
+        uniform, we jump here upon executing a "break" statement.) */
+    void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
+    /** Indicates the end of code generation for a "switch" statement. */
+    void EndSwitch();
+
+    /** Emits code for a "switch" statement in the program.
+        @param expr         Gives the value of the expression after the "switch"
+        @param defaultBlock Basic block to execute for the "default" case.  This
+                            should be NULL if there is no "default" label inside
+                            the switch.
+        @param caseBlocks   vector that stores the mapping from label values
+                            after "case" statements to basic blocks corresponding
+                            to the "case" labels.
+        @param nextBlocks   For each basic block for a "case" or "default" 
+                            label, this gives the basic block for the 
+                            immediately-following "case" or "default" label (or
+                            the basic block after the "switch" statement for the
+                            last label.)
+    */
+    void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
+                    const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
+                    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);
+
+    /** Generates code for a "default" label after a "switch" statement.
+        The checkMask parameter indicates whether additional code should be
+        generated to check to see if the execution mask is all off after
+        the default label (in which case a jump to the following label will
+        be issued.) */
+    void EmitDefaultLabel(bool checkMask, SourcePos pos);
+
+    /** Generates code for a "case" label after a "switch" statement.  See
+        the documentation for EmitDefaultLabel() for discussion of the
+        checkMask parameter. */
+    void EmitCaseLabel(int value, bool checkMask, SourcePos pos);
+
    /** Returns the current number of nested levels of 'varying' control
        flow */
    int VaryingCFDepth() const;

    bool InForeachLoop() const;

+    /** Temporarily disables emission of performance warnings from gathers
+        and scatters from subsequent code. */
+    void DisableGatherScatterWarnings();
+
+    /** Reenables emission of gather/scatter performance warnings. */
+    void EnableGatherScatterWarnings();
+
+    void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
+
+    /** Step through the code and find label statements; create a basic
+        block for each one, so that subsequent calls to
+        GetLabeledBasicBlock() return the corresponding basic block. */
+    void InitializeLabelMap(Stmt *code);
+
+    /** If there is a label in the function with the given name, return the
+        new basic block that it starts. */
+    llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
+
    /** Called to generate code for 'return' statement; value is the
        expression in the return statement (if non-NULL), and
        doCoherenceCheck indicates whether instructions should be generated
@@ -211,6 +267,10 @@ public:
        i1 value that indicates if all of the mask lanes are on. */
    llvm::Value *All(llvm::Value *mask);

+    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
+        i1 value that indicates if all of the mask lanes are off. */
+    llvm::Value *None(llvm::Value *mask);
+
    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
        i32 value wherein the i'th bit is on if and only if the i'th lane
        of the mask is on. */
@@ -339,6 +399,16 @@ public:
    llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                const char *name = NULL);

+    /** Given two integer-typed values (but possibly one vector and the
+        other not, and/or of possibly-different bit-widths), update their
+        values as needed so that the two have the same (more general)
+        type. */ 
+    void MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1);
+
+    /** Create a new slice pointer out of the given pointer to an soa type
+        and an integer offset to a slice within that type. */
+    llvm::Value *MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset);
+
    /** These GEP methods are generalizations of the standard ones in LLVM;
        they support both uniform and varying basePtr values as well as
        uniform and varying index values (arrays of indices).  Varying base
@@ -359,7 +429,8 @@ public:
        the type of the pointer, though it may be NULL if the base pointer
        is uniform. */
    llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
-                                  const Type *ptrType, const char *name = NULL);
+                                  const Type *ptrType, const char *name = NULL,
+                                  const PointerType **resultPtrType = NULL);

    /** Load from the memory location(s) given by lvalue, using the given
        mask.  The lvalue may be varying, in which case this corresponds to
@@ -390,7 +461,14 @@ public:
        varying, the given storeMask is used to mask the stores so that
        they only execute for the active program instances. */
    void StoreInst(llvm::Value *value, llvm::Value *ptr,
-                   llvm::Value *storeMask, const Type *ptrType);
+                   llvm::Value *storeMask, const Type *valueType,
+                   const Type *ptrType);
+
+    /** Copy count bytes of memory from the location pointed to by src to
+        the location pointed to by dest.  (src and dest must not be
+        overlapping.) */ 
+    void MemcpyInst(llvm::Value *dest, llvm::Value *src, llvm::Value *count,
+                    llvm::Value *align = NULL);

    void BranchInst(llvm::BasicBlock *block);
    void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
@@ -446,6 +524,9 @@ private:
    /** Pointer to the Function for which we're currently generating code. */
    Function *function;

+    /** LLVM function representation for the current function. */
+    llvm::Function *llvmFunction;
+
    /** The basic block into which we add any alloca instructions that need
        to go at the very start of the function. */
    llvm::BasicBlock *allocaBlock;
@@ -479,10 +560,10 @@ private:
        the loop. */
    llvm::Value *loopMask;

-    /** If currently in a loop body, this is a pointer to memory to store a
-        mask value that represents which of the lanes have executed a
-        'break' statement.  If we're not in a loop body, this should be
-        NULL. */
+    /** If currently in a loop body or switch statement, this is a pointer
+        to memory to store a mask value that represents which of the lanes
+        have executed a 'break' statement.  If we're not in a loop body or
+        switch, this should be NULL. */
    llvm::Value *breakLanesPtr;

    /** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
@@ -490,16 +571,49 @@ private:
        'continue' statement. */
    llvm::Value *continueLanesPtr;

-    /** If we're inside a loop, this gives the basic block immediately
-        after the current loop, which we will jump to if all of the lanes
-        have executed a break statement or are otherwise done with the
-        loop. */
+    /** If we're inside a loop or switch statement, this gives the basic
+        block immediately after the current loop or switch, which we will
+        jump to if all of the lanes have executed a break statement or are
+        otherwise done with it. */
    llvm::BasicBlock *breakTarget;

    /** If we're inside a loop, this gives the block to jump to if all of
        the running lanes have executed a 'continue' statement. */
    llvm::BasicBlock *continueTarget;

+    /** @name Switch statement state
+
+        These variables store various state that's active when we're
+        generating code for a switch statement.  They should all be NULL
+        outside of a switch.
+        @{
+    */
+
+    /** The value of the expression used to determine which case in the
+        statements after the switch to execute. */
+    llvm::Value *switchExpr;
+
+    /** Map from case label numbers to the basic block that will hold code
+        for that case. */
+    const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;
+
+    /** The basic block of code to run for the "default" label in the
+        switch statement. */
+    llvm::BasicBlock *defaultBlock;
+
+    /** For each basic block for the code for cases (and the default label,
+        if present), this map gives the basic block for the immediately
+        following case/default label. */
+    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;
+
+    /** Records whether the switch condition was uniform; this is a
+        distinct notion from whether the switch represents uniform or
+        varying control flow; we may have varying control flow from a
+        uniform switch condition if there is a 'break' inside the switch
+        that's under varying control flow. */
+    bool switchConditionWasUniform;
+    /** @} */
+
    /** A pointer to memory that records which of the program instances
        have executed a 'return' statement (and are thus really truly done
        running any more instructions in this functions. */
@@ -537,9 +651,17 @@ private:
        tasks launched from the current function. */
    llvm::Value *launchGroupHandlePtr;

+    /** Nesting count of the number of times calling code has disabled (and
+        not yet reenabled) gather/scatter performance warnings. */
+    int disableGSWarningCount;
+
+    std::map<std::string, llvm::BasicBlock *> labelMap;
+
+    static bool initLabelBBlocks(ASTNode *node, void *data);
+
    llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
    static void addGSMetadata(llvm::Value *inst, SourcePos pos);
-    bool ifsInLoopAllUniform() const;
+    bool ifsInCFAllUniform(int cfType) const;
    void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
    llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);

@@ -547,13 +669,25 @@ private:
                                 const Type *ptrType);

    void restoreMaskGivenReturns(llvm::Value *oldMask);
+    void addSwitchMaskCheck(llvm::Value *mask);
+    bool inSwitchStatement() const;
+    llvm::Value *getMaskAtSwitchEntry();

-    void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType, 
-                 llvm::Value *mask);
+    CFInfo *popCFState();
+
+    void scatter(llvm::Value *value, llvm::Value *ptr, const Type *valueType,
+                 const Type *ptrType, llvm::Value *mask);
    void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
                     llvm::Value *mask);
-    llvm::Value *gather(llvm::Value *ptr, const Type *ptrType, llvm::Value *mask,
-                        const char *name);
+    void storeUniformToSOA(llvm::Value *value, llvm::Value *ptr, 
+                           llvm::Value *mask, const Type *valueType,
+                           const PointerType *ptrType);
+    llvm::Value *loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask,
+                                    const PointerType *ptrType, const char *name);
+
+    llvm::Value *gather(llvm::Value *ptr, const PointerType *ptrType,
+                        llvm::Value *mask, const char *name);
+
    llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
 };

--- a/decl.cpp
+++ b/decl.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -46,6 +46,18 @@
 #include <stdio.h>
 #include <set>

+/** Prints the name of every TYPEQUAL_* bit set in typeQualifiers, each followed by a space. */
+static void
+lPrintTypeQualifiers(int typeQualifiers) {
+    static const struct { int flag; const char *label; } quals[] = {
+        { TYPEQUAL_INLINE, "inline " },   { TYPEQUAL_CONST, "const " },
+        { TYPEQUAL_UNIFORM, "uniform " }, { TYPEQUAL_VARYING, "varying " },
+        { TYPEQUAL_TASK, "task " },       { TYPEQUAL_SIGNED, "signed " },
+        { TYPEQUAL_UNSIGNED, "unsigned " } };
+    for (size_t i = 0; i < sizeof(quals) / sizeof(quals[0]); ++i)
+        if (typeQualifiers & quals[i].flag) printf("%s", quals[i].label);
+}
+
 /** Given a Type and a set of type qualifiers, apply the type qualifiers to
    the type, returning the type that is the result. 
 */
@@ -54,6 +66,25 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
    if (type == NULL)
        return NULL;

+    if ((typeQualifiers & TYPEQUAL_CONST) != 0)
+        type = type->GetAsConstType();
+
+    if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) {
+        if (Type::Equal(type, AtomicType::Void))
+            Error(pos, "\"uniform\" qualifier is illegal with \"void\" type.");
+        else
+            type = type->GetAsUniformType();
+    }
+    else if ((typeQualifiers & TYPEQUAL_VARYING) != 0) {
+        if (Type::Equal(type, AtomicType::Void))
+            Error(pos, "\"varying\" qualifier is illegal with \"void\" type.");
+        else
+            type = type->GetAsVaryingType();
+    }
+    else
+        if (Type::Equal(type, AtomicType::Void) == false)
+            type = type->GetAsUnboundVariabilityType();
+
    if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
        if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
            Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
@@ -62,30 +93,19 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
        const Type *unsignedType = type->GetAsUnsignedType();
        if (unsignedType != NULL)
            type = unsignedType;
-        else
+        else {
+            const Type *resolvedType = 
+                type->ResolveUnboundVariability(Variability::Varying);
            Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
-              type->GetString().c_str());
-
+                  resolvedType->GetString().c_str());
+        }
    }

-    if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
+    if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false) {
+        const Type *resolvedType = 
+            type->ResolveUnboundVariability(Variability::Varying);
        Error(pos, "\"signed\" qualifier is illegal with non-integer type "
-              "\"%s\".", type->GetString().c_str());
-
-    if ((typeQualifiers & TYPEQUAL_CONST) != 0)
-        type = type->GetAsConstType();
-
-    if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
-        type = type->GetAsUniformType();
-    else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
-        type = type->GetAsVaryingType();
-    else {
-        // otherwise, structs are uniform by default and everything
-        // else is varying by default
-        if (dynamic_cast<const StructType *>(type->GetBaseType()) != NULL)
-            type = type->GetAsUniformType();
-        else
-            type = type->GetAsVaryingType();
+              "\"%s\".", resolvedType->GetString().c_str());
    }

    return type;
@@ -106,18 +126,59 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {

 const Type *
 DeclSpecs::GetBaseType(SourcePos pos) const {
-    const Type *bt = baseType;
+    const Type *retType = baseType;
+
+    if (retType == NULL) {
+        Warning(pos, "No type specified in declaration.  Assuming int32.");
+        retType = AtomicType::UniformInt32->GetAsUnboundVariabilityType();
+    }
+
    if (vectorSize > 0) {
-        const AtomicType *atomicType = dynamic_cast<const AtomicType *>(bt);
+        const AtomicType *atomicType = dynamic_cast<const AtomicType *>(retType);
        if (atomicType == NULL) {
            Error(pos, "Only atomic types (int, float, ...) are legal for vector "
                  "types.");
            return NULL;
        }
-        bt = new VectorType(atomicType, vectorSize);
+        retType = new VectorType(atomicType, vectorSize);
    }

-    return lApplyTypeQualifiers(typeQualifiers, bt, pos);
+    retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
+    
+    if (soaWidth > 0) {
+        const StructType *st = dynamic_cast<const StructType *>(retType);
+
+        if (st == NULL) {
+            Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
+                  "type \"%s\".", soaWidth, retType->GetString().c_str());
+            return NULL;
+        }
+        else if (soaWidth <= 0 || (soaWidth & (soaWidth - 1)) != 0) {
+            Error(pos, "soa<%d> width illegal.  Value must be positive power "
+                  "of two.", soaWidth);
+            return NULL;
+        }
+
+        if (st->IsUniformType()) {
+            Error(pos, "\"uniform\" qualifier and \"soa<%d>\" qualifier can't "
+                  "both be used in a type declaration.", soaWidth);
+            return NULL;
+        }
+        else if (st->IsVaryingType()) {
+            Error(pos, "\"varying\" qualifier and \"soa<%d>\" qualifier can't "
+                  "both be used in a type declaration.", soaWidth);
+            return NULL;
+        }
+        else
+            retType = st->GetAsSOAType(soaWidth);
+
+        if (soaWidth < g->target.vectorWidth)
+            PerformanceWarning(pos, "soa<%d> width smaller than gang size %d "
+                               "currently leads to inefficient code to access "
+                               "soa types.", soaWidth, g->target.vectorWidth);
+    }
+    
+    return retType;
 }


@@ -138,21 +199,14 @@ lGetStorageClassName(StorageClass storageClass) {

 void
 DeclSpecs::Print() const {
-    printf("%s ", lGetStorageClassName(storageClass));
+    printf("Declspecs: [%s ", lGetStorageClassName(storageClass));

    if (soaWidth > 0) printf("soa<%d> ", soaWidth);
-
-    if (typeQualifiers & TYPEQUAL_INLINE)    printf("inline ");
-    if (typeQualifiers & TYPEQUAL_CONST)     printf("const ");
-    if (typeQualifiers & TYPEQUAL_UNIFORM)   printf("uniform ");
-    if (typeQualifiers & TYPEQUAL_VARYING)   printf("varying ");
-    if (typeQualifiers & TYPEQUAL_TASK)      printf("task ");
-    if (typeQualifiers & TYPEQUAL_SIGNED)    printf("signed ");
-    if (typeQualifiers & TYPEQUAL_UNSIGNED)  printf("unsigned ");
-
-    printf("%s", baseType->GetString().c_str());
+    lPrintTypeQualifiers(typeQualifiers);
+    printf("base type: %s", baseType->GetString().c_str());

    if (vectorSize > 0) printf("<%d>", vectorSize);
+    printf("]");
 }


@@ -172,6 +226,11 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p)
 void
 Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
    const Type *t = GetType(ds);
+    if (t == NULL) {
+        Assert(m->errorCount > 0);
+        return;
+    }
+
    Symbol *sym = GetSymbol();
    if (sym != NULL) {
        sym->type = t;
@@ -192,19 +251,46 @@ Declarator::GetSymbol() const {


 void
-Declarator::Print() const {
+Declarator::Print(int indent) const {
+    printf("%*cdeclarator: [", indent, ' ');
+    pos.Print();
+
+    lPrintTypeQualifiers(typeQualifiers);
    Symbol *sym = GetSymbol();
    if (sym != NULL)
        printf("%s", sym->name.c_str());
    else
        printf("(null symbol)");

+    printf(", array size = %d", arraySize);
+
+    printf(", kind = ");
+    switch (kind) {
+    case DK_BASE:      printf("base");      break;
+    case DK_POINTER:   printf("pointer");   break;
+    case DK_REFERENCE: printf("reference"); break;
+    case DK_ARRAY:     printf("array");     break;
+    case DK_FUNCTION:  printf("function");  break;
+    default:           FATAL("Unhandled declarator kind");
+    }
+
    if (initExpr != NULL) {
        printf(" = (");
        initExpr->Print();
        printf(")");
    }
-    pos.Print();
+
+    if (functionParams.size() > 0) {
+        for (unsigned int i = 0; i < functionParams.size(); ++i) {
+            printf("\n%*cfunc param %d:\n", indent, ' ', i);
+            functionParams[i]->Print(indent+4);
+        }
+    }
+
+    if (child != NULL)
+        child->Print(indent + 4);
+
+    printf("]\n");
 }


@@ -216,14 +302,16 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
        return NULL;

    Symbol *declSym = GetSymbol();
-    assert(declSym != NULL);
+    Assert(declSym != NULL);

    // Get the symbol for the function from the symbol table.  (It should
    // already have been added to the symbol table by AddGlobal() by the
    // time we get here.)
    Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
-    if (funSym != NULL)
+    if (funSym == NULL)
        // May be NULL due to error earlier in compilation
+        Assert(m->errorCount > 0);
+    else
        funSym->pos = pos;

    // Walk down to the declarator for the function.  (We have to get past
@@ -232,14 +320,23 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
    Declarator *d = this;
    while (d != NULL && d->kind != DK_FUNCTION)
        d = d->child;
-    assert(d != NULL);
+    Assert(d != NULL);

    for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
-        Declaration *pdecl = d->functionParams[i];
-        assert(pdecl->declarators.size() == 1);
-        funArgs->push_back(pdecl->declarators[0]->GetSymbol());
+        Symbol *sym = d->GetSymbolForFunctionParameter(i);
+        if (sym->type == NULL) {
+            Assert(m->errorCount > 0);
+            continue;
+        }
+        else
+            sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
+
+        funArgs->push_back(sym);
    }

+    if (funSym != NULL)
+        funSym->type = funSym->type->ResolveUnboundVariability(Variability::Varying);
+
    return funSym;
 }

@@ -258,17 +355,26 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
    if (kind != DK_FUNCTION && isTask)
        Error(pos, "\"task\" qualifier illegal in variable declaration.");

+    Variability variability(Variability::Unbound);
+    if (hasUniformQual)
+        variability = Variability::Uniform;
+    else if (hasVaryingQual)
+        variability = Variability::Varying;
+
    const Type *type = base;
    switch (kind) {
    case DK_BASE:
        // All of the type qualifiers should be in the DeclSpecs for the
        // base declarator
-        assert(typeQualifiers == 0);
-        assert(child == NULL);
+        Assert(typeQualifiers == 0);
+        Assert(child == NULL);
        return type;

    case DK_POINTER:
-        type = new PointerType(type, hasUniformQual, isConst);
+        /* For now, any pointer to an SOA type gets the slice property; if
+           we add the capability to declare pointers as slices or not,
+           we'll want to set this based on a type qualifier here. */
+        type = new PointerType(type, variability, isConst, type->IsSOAType());
        if (child != NULL)
            return child->GetType(type, ds);
        else
@@ -297,6 +403,16 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
        break;

    case DK_ARRAY:
+        if (Type::Equal(type, AtomicType::Void)) {
+            Error(pos, "Arrays of \"void\" type are illegal.");
+            return NULL;
+        }
+        if (dynamic_cast<const ReferenceType *>(type)) {
+            Error(pos, "Arrays of references (type \"%s\") are illegal.",
+                  type->GetString().c_str());
+            return NULL;
+        }
+
        type = new ArrayType(type, arraySize);
        if (child)
            return child->GetType(type, ds);
@@ -316,31 +432,18 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
        for (unsigned int i = 0; i < functionParams.size(); ++i) {
            Declaration *d = functionParams[i];

-            char buf[32];
-            Symbol *sym;
-            if (d->declarators.size() == 0) {
-                // function declaration like foo(float), w/o a name for
-                // the parameter
-                sprintf(buf, "__anon_parameter_%d", i);
-                sym = new Symbol(buf, pos);
-                sym->type = d->declSpecs->GetBaseType(pos);
-            }
-            else {
-                sym = d->declarators[0]->GetSymbol();
-                if (sym == NULL) {
-                    // Handle more complex anonymous declarations like
-                    // float (float **).
-                    sprintf(buf, "__anon_parameter_%d", i);
-                    sym = new Symbol(buf, d->declarators[0]->pos);
-                    sym->type = d->declarators[0]->GetType(d->declSpecs);
-                }
-            }
+            Symbol *sym = GetSymbolForFunctionParameter(i);

            if (d->declSpecs->storageClass != SC_NONE)
                Error(sym->pos, "Storage class \"%s\" is illegal in "
                      "function parameter declaration for parameter \"%s\".", 
                      lGetStorageClassName(d->declSpecs->storageClass),
                      sym->name.c_str());
+            if (Type::Equal(sym->type, AtomicType::Void)) {
+                Error(sym->pos, "Parameter with type \"void\" illegal in function "
+                      "parameter list.");
+                sym->type = NULL;
+            }

            const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
            if (at != NULL) {
@@ -352,7 +455,15 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
                // report this differently than it was originally declared
                // in the function, but it's not clear that this is a
                // significant problem.)
-                sym->type = PointerType::GetUniform(at->GetElementType());
+                if (at->GetElementType() == NULL) {
+                    Assert(m->errorCount > 0);
+                    return NULL;
+                }
+
+                const Type *targetType = at->GetElementType();
+                targetType = 
+                    targetType->ResolveUnboundVariability(Variability::Varying);
+                sym->type = PointerType::GetUniform(targetType);

                // Make sure there are no unsized arrays (other than the
                // first dimension) in function parameter lists.
@@ -376,13 +487,13 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
                // it lives down to the base declarator.
                Declarator *decl = d->declarators[0];
                while (decl->child != NULL) {
-                    assert(decl->initExpr == NULL);
+                    Assert(decl->initExpr == NULL);
                    decl = decl->child;
                }

                if (decl->initExpr != NULL &&
-                    (decl->initExpr = decl->initExpr->TypeCheck()) != NULL &&
-                    (decl->initExpr = decl->initExpr->Optimize()) != NULL &&
+                    (decl->initExpr = TypeCheck(decl->initExpr)) != NULL &&
+                    (decl->initExpr = Optimize(decl->initExpr)) != NULL &&
                    (init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
                    Error(decl->initExpr->pos, "Default value for parameter "
                          "\"%s\" must be a compile-time constant.", 
@@ -397,7 +508,11 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
            Error(pos, "No return type provided in function declaration.");
            return NULL;
        }
-
+        if (dynamic_cast<const FunctionType *>(returnType) != NULL) {
+            Error(pos, "Illegal to return function type from function.");
+            return NULL;
+        }
+        
        bool isExported = ds && (ds->storageClass == SC_EXPORT);
        bool isExternC =  ds && (ds->storageClass == SC_EXTERN_C);
        bool isTask =     ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
@@ -418,38 +533,21 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
            return NULL;
        }

-        Type *functionType = 
-            new FunctionType(returnType, args, pos, argNames, argDefaults,
+        if (child == NULL) {
+            Assert(m->errorCount > 0);
+            return NULL;
+        }
+
+        const Type *functionType = 
+            new FunctionType(returnType, args, argNames, argDefaults,
                             argPos, isTask, isExported, isExternC);
+        functionType = functionType->ResolveUnboundVariability(Variability::Varying);
        return child->GetType(functionType, ds);
    }
    default:
        FATAL("Unexpected decl kind");
        return NULL;
    }
-
-#if 0
-            // Make sure we actually have an array of structs ..
-            const StructType *childStructType = 
-                dynamic_cast<const StructType *>(childType);
-            if (childStructType == NULL) {
-                Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
-                      "type \"%s\".", soaWidth, childType->GetString().c_str());
-                return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
-            }
-            else if ((soaWidth & (soaWidth - 1)) != 0) {
-                Error(pos, "soa<%d> width illegal.  Value must be power of two.",
-                      soaWidth);
-                return NULL;
-            }
-            else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
-                Error(pos, "soa<%d> width must evenly divide array size %d.",
-                      soaWidth, arraySize);
-                return NULL;
-            }
-            return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
-                                    soaWidth);
-#endif
 }


@@ -461,6 +559,35 @@ Declarator::GetType(DeclSpecs *ds) const {
 }


+Symbol *
+Declarator::GetSymbolForFunctionParameter(int paramNum) const {
+    Assert(paramNum < (int)functionParams.size());
+    Declaration *d = functionParams[paramNum];
+
+    char buf[32];
+    Symbol *sym;
+    if (d->declarators.size() == 0) {
+        // function declaration like foo(float), w/o a name for
+        // the parameter
+        sprintf(buf, "__anon_parameter_%d", paramNum);
+        sym = new Symbol(buf, pos);
+        sym->type = d->declSpecs->GetBaseType(pos);
+    }
+    else {
+        Assert(d->declarators.size() == 1);
+        sym = d->declarators[0]->GetSymbol();
+        if (sym == NULL) {
+            // Handle more complex anonymous declarations like
+            // float (float **).
+            sprintf(buf, "__anon_parameter_%d", paramNum);
+            sym = new Symbol(buf, d->declarators[0]->pos);
+            sym->type = d->declarators[0]->GetType(d->declSpecs);
+        }
+    }
+    return sym;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // Declaration

@@ -485,23 +612,28 @@ Declaration::Declaration(DeclSpecs *ds, Declarator *d) {

 std::vector<VariableDeclaration>
 Declaration::GetVariableDeclarations() const {
-    assert(declSpecs->storageClass != SC_TYPEDEF);
+    Assert(declSpecs->storageClass != SC_TYPEDEF);
    std::vector<VariableDeclaration> vars;

    for (unsigned int i = 0; i < declarators.size(); ++i) {
-        if (declarators[i] == NULL)
-            continue;
        Declarator *decl = declarators[i];
-        if (decl == NULL)
+        if (decl == NULL) {
            // Ignore earlier errors
+            Assert(m->errorCount > 0);
            continue;
+        }

        Symbol *sym = decl->GetSymbol();
-        if (dynamic_cast<const FunctionType *>(sym->type) != NULL) {
-            // function declaration
-            m->symbolTable->AddFunction(sym);
+        if (sym == NULL || sym->type == NULL) {
+            // Ignore errors
+            Assert(m->errorCount > 0);
+            continue;
        }
-        else {
+        sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
+
+        if (Type::Equal(sym->type, AtomicType::Void))
+            Error(sym->pos, "\"void\" type variable illegal in declaration.");
+        else if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
            m->symbolTable->AddVariable(sym);
            vars.push_back(VariableDeclaration(sym, decl->initExpr));
        }
@@ -511,16 +643,43 @@ Declaration::GetVariableDeclarations() const {


 void
-Declaration::Print() const {
-    printf("Declaration: specs [");
-    declSpecs->Print();
-    printf("], declarators [");
-    for (unsigned int i = 0 ; i < declarators.size(); ++i) {
-        declarators[i]->Print();
-        printf("%s", (i == declarators.size() - 1) ? "]" : ", ");
+Declaration::DeclareFunctions() {
+    Assert(declSpecs->storageClass != SC_TYPEDEF);
+
+    for (unsigned int i = 0; i < declarators.size(); ++i) {
+        Declarator *decl = declarators[i];
+        if (decl == NULL) {
+            // Ignore earlier errors
+            Assert(m->errorCount > 0);
+            continue;
+        }
+
+        Symbol *sym = decl->GetSymbol();
+        if (sym == NULL || sym->type == NULL) {
+            // Ignore errors
+            Assert(m->errorCount > 0);
+            continue;
+        }
+        sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
+
+        if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
+            continue;
+
+        bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
+        m->AddFunctionDeclaration(sym, isInline);
    }
 }

+
+void
+Declaration::Print(int indent) const {
+    printf("%*cDeclaration: specs [", indent, ' ');
+    declSpecs->Print();
+    printf("], declarators:\n");
+    for (unsigned int i = 0 ; i < declarators.size(); ++i)
+        declarators[i]->Print(indent+4);
+}
+
 ///////////////////////////////////////////////////////////////////////////

 void
@@ -539,7 +698,7 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
        DeclSpecs ds(type);
        if (type->IsUniformType()) 
            ds.typeQualifiers |= TYPEQUAL_UNIFORM;
-        else
+        else if (type->IsVaryingType())
            ds.typeQualifiers |= TYPEQUAL_VARYING;

        for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
@@ -548,6 +707,9 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,

            Symbol *sym = d->GetSymbol();

+            if (Type::Equal(sym->type, AtomicType::Void))
+                Error(d->pos, "\"void\" type illegal for struct member.");
+
            const ArrayType *arrayType = 
                dynamic_cast<const ArrayType *>(sym->type);
            if (arrayType != NULL && arrayType->GetElementCount() == 0) {
--- a/decl.h
+++ b/decl.h
@@ -153,10 +153,12 @@ public:
        declarator and symbols for its arguments in *args. */
    Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);

+    Symbol *GetSymbolForFunctionParameter(int paramNum) const;
+
    /** Returns the symbol associated with the declarator. */
    Symbol *GetSymbol() const;

-    void Print() const;
+    void Print(int indent) const;

    /** Position of the declarator in the source program. */
    const SourcePos pos;
@@ -199,7 +201,7 @@ public:
    Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
    Declaration(DeclSpecs *ds, Declarator *d);

-    void Print() const;
+    void Print(int indent) const;

    /** This method walks through all of the Declarators in a declaration
        and returns a fully-initialized Symbol and (possibly) and
@@ -208,6 +210,10 @@ public:
        Declarator representation.) */
    std::vector<VariableDeclaration> GetVariableDeclarations() const;

+    /** For any function declarations in the Declaration, add the
+        declaration to the module. */
+    void DeclareFunctions();
+
    DeclSpecs *declSpecs;
    std::vector<Declarator *> declarators;
 };
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,169 @@
+=== v1.2.0 === (20 March 2012)
+
+This is a major new release of ispc, with a number of significant
+improvements to functionality, performance, and compiler robustness.  It
+does, however, include three small changes to language syntax and semantics
+that may require changes to existing programs:
+
+* Syntax for the "launch" keyword has been cleaned up; it's now no longer
+  necessary to bracket the launched function call with angle brackets.
+  (In other words, now use "launch foo();", rather than "launch < foo() >;".)
+
+* When using pointers, the pointed-to data type is now "uniform" by
+  default.  Use the varying keyword to specify varying pointed-to types when
+  needed.  (i.e. "float *ptr" is a varying pointer to uniform float data,
+  whereas previously it was a varying pointer to varying float values.)
+  Use "varying float *" to specify a varying pointer to varying float data,
+  and so forth.
+
+* The details of "uniform" and "varying" and how they interact with struct
+  types have been cleaned up.  Now, when a struct type is declared, if the
+  struct elements don't have explicit "uniform" or "varying" qualifiers,
+  they are said to have "unbound" variability.  When a struct type is
+  instantiated, any unbound variability elements inherit the variability of
+  the parent struct type. See http://ispc.github.com/ispc.html#struct-types
+  for more details.
+
+ispc has a new language feature that makes it much easier to use the
+efficient "(array of) structure of arrays" (AoSoA, or SoA) memory layout of
+data.  A new "soa<n>" qualifier can be applied to structure types to
+specify an n-wide SoA version of the corresponding type.  Array indexing
+and pointer operations with arrays of SoA types automatically handle the
+two-stage indexing calculation to access the data.  See
+http://ispc.github.com/ispc.html#structure-of-array-types for more details.
+
+For more efficient access of data that is still in "array of structures"
+(AoS) format, ispc has a new "memory coalescing" optimization that
+automatically detects series of strided loads and/or gathers that can be
+transformed into a more efficient set of vector loads and shuffles.  A
+diagnostic is emitted when this optimization is successfully applied. 
+
+Smaller changes in this release:
+
+* The standard library now provides memcpy(), memmove() and memset()
+  functions, as well as single-precision asin() and acos() functions.
+
+* -I can now be specified on the command-line to specify a search path for
+  #include files.
+
+* A number of improvements have been made to error reporting from the
+  parser, and a number of cases where malformed programs could cause the
+  compiler to crash have been fixed.
+
+* A number of small improvements to the quality and performance of generated
+  code have been made, including finding more cases where 32-bit addressing
+  calculations can be safely done on 64-bit systems and generating better
+  code for initializer expressions.
+
+=== v1.1.4 === (4 February 2012)
+
+There are two major bugfixes for Windows in this release.  First, a number
+of failures in AVX code generation on Windows have been fixed; AVX on
+Windows now has no known issues.  Second, a longstanding bug in parsing 64-bit
+integer constants on Windows has been fixed.
+
+This release features a new experimental scalar target, contributed by Gabe
+Weisz <gweisz@cs.cmu.edu>.  This target ("--target=generic-1") compiles
+gangs of single program instances (i.e. programCount == 1); it can be
+useful for debugging ispc programs.
+
+The compiler now supports dynamic memory allocation in ispc programs (with
+"new" and "delete" operators based on C++).  See
+http://ispc.github.com/ispc.html#dynamic-memory-allocation in the
+documentation for more information.
+
+ispc now performs "short circuit" evaluation of the || and && logical
+operators and the ? : selection operator.  (This represents the correction
+of a major incompatibility with C.)  Code like "(index < arraySize &&
+array[index] == 1)" thus now executes as in C, where "array[index]" won't
+be evaluated unless "index" is less than "arraySize".
+
+The standard library now provides "local" atomic operations, which are
+atomic across the gang of program instances (but not across other gangs or
+other hardware threads).  See the updated documentation on atomics for more
+information:
+http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences.
+
+The standard library now offers a clock() function, which returns a uniform
+int64 value that counts processor cycles; it can be used for
+fine-resolution timing measurements.
+
+Finally (of limited interest now): ispc now supports the forthcoming AVX2
+instruction set, due with Haswell-generation CPUs.  All tests and examples
+compile and execute correctly with AVX2.  (Thanks specifically to Craig
+Topper and Nadav Rotem for work on AVX2 support in LLVM, which made this
+possible.)
+ 
+=== v1.1.3 === (20 January 2012)
+
+With this release, the language now supports "switch" statements, with the
+same semantics and syntax as in C.
+
+This release includes fixes for two important performance related issues:
+the quality of code generated for "foreach" statements has been
+substantially improved (https://github.com/ispc/ispc/issues/151), and a
+performance regression with code for "gathers" that was introduced in
+v1.1.2 has been fixed in this release. 
+
+A number of other small bugs were fixed in this release as well, including
+one where invalid memory would sometimes be incorrectly accessed
+(https://github.com/ispc/ispc/issues/160).
+
+Thanks to Jean-Luc Duprat for a number of patches that improve support for
+building on various platforms, and to Pierre-Antoine Lacaze for patches so
+that ispc builds under MinGW.
+
+=== v1.1.2 === (9 January 2012)
+
+The major new feature in this release is support for "generic" C++
+vectorized output; in other words, ispc can emit C++ code that corresponds
+to the vectorized computation that the ispc program represents.  See the
+examples/intrinsics directory in the ispc distribution for two example
+implementations of the set of functions that must be provided to map the
+vector calls generated by ispc to target specific functions.
+
+ispc now has partial support for 'goto' statements; specifically, goto is
+allowed if any enclosing control flow statements (if/for/while/do) have
+'uniform' test expressions, but not if they have 'varying' tests.
+
+A number of improvements have been made to the code generated for gathers
+and scatters--one of them (better matching x86's "free" scale by 2/4/8 for
+addressing calculations) improved the performance of the noise example by
+14%.
+
+Many small bugs have been fixed in this release as well, including issue
+numbers 138, 129, 135, 127, 149, and 142.
+
+=== v1.1.1 === (15 December 2011)
+
+This release doesn't include any significant new functionality, but does
+include small improvements in generated code and a number of bug fixes.
+
+The one user-visible language change is that integer constants may be
+specified with 'u' and 'l' suffixes, like in C.  For example, "1024llu"
+defines the constant with unsigned 64-bit type.
+
+More informative and useful error messages are printed when function
+overload resolution fails.
+
+Masking is avoided in additional cases when the mask can be
+statically-determined to be all on. 
+
+A number of small bugs have been fixed:
+- Under some circumstances, incorrect masks were used when assigning a
+  value to a reference and when doing gathers/scatters.
+- Incorrect code could be generated in some cases when some instances
+  returned part way through a function but others continued executing.
+- Type checking wasn't being performed for calls through function pointers;
+  now an error is issued if the arguments don't match up, etc.
+- Incorrect code was being generated for gather/scatter to structs that had
+  elements with varying short-vector types.
+- Typechecking wasn't being performed for "foreach" statements; this led to
+  problems like function overload resolution not being performed if an
+  overloaded function call was used to determine the iteration range.
+- A number of symbols would be multiply-defined when compiling to multiple
+  targets and using the sse2-x2 target as one of them (issue #131).
+
 === v1.1.0 === (5 December 2011)

 This is a major new release of the compiler, with significant additions to
--- a/docs/build.sh
+++ b/docs/build.sh
@@ -2,11 +2,14 @@

 for i in ispc perfguide faq; do
    rst2html.py --template=template.txt --link-stylesheet \
-        --stylesheet-path=css/style.css $i.txt > $i.html
+        --stylesheet-path=css/style.css $i.rst > $i.html
 done

+rst2html.py --template=template-news.txt --link-stylesheet \
+    --stylesheet-path=css/style.css news.rst > news.html
+
 rst2html.py --template=template-perf.txt --link-stylesheet \
-        --stylesheet-path=css/style.css perf.txt > perf.html
+        --stylesheet-path=css/style.css perf.rst > perf.html

 #rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
 #pdflatex ispc.tex
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -1,10 +1,10 @@
-=============================================================
-Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
-=============================================================
+=====================================
+Frequently Asked Questions About ispc
+=====================================

 This document includes a number of frequently (and not frequently) asked
 questions about ispc, the Intel® SPMD Program Compiler.  The source to this
-document is in the file ``docs/faq.txt`` in the ``ispc`` source
+document is in the file ``docs/faq.rst`` in the ``ispc`` source
 distribution.

 * Understanding ispc's Output
@@ -19,6 +19,7 @@ distribution.
  + `How can I supply an initial execution mask in the call from the application?`_
  + `How can I generate a single binary executable with support for multiple instruction sets?`_
  + `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
+  + `Is it possible to inline ispc functions in C/C++ code?`_

 * Programming Techniques

@@ -273,10 +274,10 @@ Then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
 ``foo_avx.o``, and ``foo.o``.[#]_  Link all of these into your executable, and
 when you call a function in ``foo.ispc`` from your application code,
 ``ispc`` will determine which instruction sets are supported by the CPU the
-code is running on and will call the most appropraite version of the
+code is running on and will call the most appropriate version of the
 function available.  

-.. [#] Similarly, if you choose to generate assembly langauage output or
+.. [#] Similarly, if you choose to generate assembly language output or
   LLVM bitcode output, multiple versions of those files will be created.

 In general, the version of the function that runs will be the one in the
@@ -346,6 +347,50 @@ In a similar fashion, it's possible to find out at run-time the value of
    export uniform int width() { return programCount; }


+Is it possible to inline ispc functions in C/C++ code?
+------------------------------------------------------
+
+If you're willing to use the ``clang`` C/C++ compiler that's part of the
+LLVM tool suite, then it is possible to inline ``ispc`` code with C/C++
+(and conversely, to inline C/C++ calls in ``ispc``).  Doing so can provide
+performance advantages when calling out to short functions written in the
+"other" language.  Note that you don't need to use ``clang`` to compile all
+of your C/C++ code, but only for the files where you want to be able to
+inline.  In order to do this, you must have a full installation of LLVM
+version 3.0 or later, including the ``clang`` compiler.
+
+The basic approach is to have the various compilers emit LLVM intermediate
+representation (IR) code and to then use tools from LLVM to link together
+the IR from the compilers and then re-optimize it, which gives the LLVM
+optimizer the opportunity to do additional inlining and cross-function
+optimizations.  If you have source files ``foo.ispc`` and ``foo.cpp``,
+first emit LLVM IR:
+
+::
+
+   ispc --emit-llvm -o foo_ispc.bc foo.ispc
+   clang -O2 -c -emit-llvm -o foo_cpp.bc foo.cpp
+
+Next, link the two IR files into a single file and run the LLVM optimizer
+on the result:
+
+::
+  
+    llvm-link foo_ispc.bc foo_cpp.bc -o - | opt -O3 -o foo_opt.bc
+
+And finally, generate a native object file:
+
+::
+
+   llc -filetype=obj foo_opt.bc -o foo.o
+
+This file can in turn be linked in with the rest of your object files when
+linking your application.
+
+(Note that if you're using the AVX instruction set, you must provide the
+``-mattr=+avx`` flag to ``llc``.)
+    
+
 Programming Techniques
 ======================

--- a/docs/ispc.rst
+++ b/docs/ispc.rst
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -0,0 +1,28 @@
+=========
+ispc News
+=========
+
+ispc 1.1.4 is Released
+----------------------
+
+On February 4, 2012, the 1.1.4 release of ``ispc`` was posted; new features
+include ``new`` and ``delete`` for dynamic memory allocation in ``ispc``
+programs, "local" atomic operations in the standard library, and a new
+scalar compilation target.  See the `1.1.4 release notes`_ for details.
+
+.. _1.1.4 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
+
+
+ispc 1.1.3 is Released
+----------------------
+
+With this release, the language now supports "switch" statements, with the same semantics and syntax as in C.
+
+This release includes fixes for two important performance related issues:
+the quality of code generated for "foreach" statements has been
+substantially improved, and a performance regression with code for "gathers"
+that was introduced in v1.1.2 has been fixed in this release.
+
+Thanks to Jean-Luc Duprat for a number of patches that improve support for
+building on various platforms, and to Pierre-Antoine Lacaze for patches so
+that ispc builds under MinGW.
--- a/docs/perf.rst
+++ b/docs/perf.rst
@@ -22,8 +22,8 @@ also included in the ``examples/`` directory.)
    - ``ispc``, 1 core
    - ``ispc``, 4 cores
  * - `AOBench`_ (512 x 512 resolution)
-    - 3.99x
-    - 19.32x
+    - 6.19x
+    - 28.06x
  * - `Binomial Options`_ (128k options)
    - 7.94x
    - 33.43x
@@ -31,23 +31,23 @@ also included in the ``examples/`` directory.)
    - 8.45x
    - 32.48x
  * - `Deferred Shading`_ (1280p)
-    - n/a
+    - 5.02x
    - 23.06x
  * - `Mandelbrot Set`_
    - 6.21x
-    - 19.90x
+    - 20.28x
  * - `Perlin Noise Function`_
    - 5.37x
    - n/a
  * - `Ray Tracer`_ (Sponza dataset)
-    - 3.99x
-    - 19.32x
+    - 4.31x
+    - 20.29x
  * - `3D Stencil`_
-    - 3.76x
-    - 13.79x
+    - 4.05x
+    - 15.53x
  * - `Volume Rendering`_
-    - 3.11x
-    - 15.80x
+    - 3.60x
+    - 17.53x


 .. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench
--- a/docs/perfguide.rst
+++ b/docs/perfguide.rst
@@ -13,6 +13,7 @@ the most out of ``ispc`` in practice.
  + `Improving Control Flow Coherence With "foreach_tiled"`_
  + `Using Coherent Control Flow Constructs`_
  + `Use "uniform" Whenever Appropriate`_
+  + `Use "Structure of Arrays" Layout When Possible`_

 * `Tips and Techniques`_

@@ -64,7 +65,7 @@ on each one:
 Depending on the specifics of the computation being performed, the code
 generated for this function could likely be improved by modifying the code 
 so that the loop only goes as far through the data as is possible to pack
-an entire gang of program instances with computation each time thorugh the
+an entire gang of program instances with computation each time through the
 loop.  Doing so enables the ``ispc`` compiler to generate more efficient
 code for cases where it knows that the execution mask is "all on".  Then,
 an ``if`` statement at the end handles processing the ragged extra bits of
@@ -153,7 +154,7 @@ processed, and so forth.

 Performance benefit can come from using ``foreach_tiled`` in that it
 essentially optimizes for the benefit of iterating over *compact* regions
-of the domian (while ``foreach`` iterates over the domain in a way that
+of the domain (while ``foreach`` iterates over the domain in a way that
 generally allows linear memory access.)  There are two benefits from
 processing compact regions of the domain.  

@@ -215,7 +216,7 @@ Use "uniform" Whenever Appropriate
 ----------------------------------

 For any variable that will always have the same value across all of the
-program instances in a gang, declare the variable with the  ``unfiorm``
+program instances in a gang, declare the variable with the  ``uniform``
 qualifier.  Doing so enables the ``ispc`` compiler to emit better code in
 many different ways.

@@ -229,7 +230,7 @@ number of iterations:

 If this is written with ``i`` as a ``varying`` variable, as above, there's
 additional overhead in the code generated for the loop as the compiler
-emits instructions to handle the possibilty of not all program instances
+emits instructions to handle the possibility of not all program instances
 following the same control flow path (as might be the case if the loop
 limit, 10, was itself a ``varying`` value.)

@@ -247,6 +248,76 @@ but it's always best to provide the compiler with as much help as possible
 to understand the actual form of your computation.


+Use "Structure of Arrays" Layout When Possible
+----------------------------------------------
+
+In general, memory access performance (for both reads and writes) is best
+when the running program instances access a contiguous region of memory; in
+this case efficient vector load and store instructions can often be used
+rather than gathers and scatters.  As an example of this issue, consider an
+array of a simple point datatype laid out and accessed in conventional
+"array of structures" (AOS) layout:
+
+::
+
+    struct Point { float x, y, z; };
+    uniform Point pts[...];
+    float v = pts[programIndex].x;
+
+In the above code, the access to ``pts[programIndex].x`` accesses
+non-sequential memory locations, due to the ``y`` and ``z`` values between
+the desired ``x`` values in memory.  A "gather" is required to get the
+value of ``v``, with a corresponding decrease in performance.
+
+If ``Point`` was defined as a "structure of arrays" (SOA) type, the access
+can be much more efficient:
+
+::
+
+    struct Point8 { float x[8], y[8], z[8]; };
+    uniform Point8 pts8[...];
+    int majorIndex = programIndex / 8;
+    int minorIndex = programIndex % 8;
+    float v = pts8[majorIndex].x[minorIndex];
+
+In this case, each ``Point8`` has 8 ``x`` values contiguous in memory
+before 8 ``y`` values and then 8 ``z`` values.  If the gang size is 8 or
+less, the access for ``v`` will have the same value of ``majorIndex`` for
+all program instances and will access consecutive elements of the ``x[8]``
+array with a vector load.  (For larger gang sizes, two 8-wide vector loads
+would be issued, which is also quite efficient.)
+
+However, the syntax in the above code is messy; accessing SOA data in this
+fashion is much less elegant than the corresponding code for accessing the
+data with AOS layout.  The ``soa`` qualifier in ``ispc`` can be used to
+cause the corresponding transformation to be made to the ``Point`` type,
+while preserving the clean syntax for data access that comes with AOS
+layout:
+
+::
+
+    soa<8> Point pts[...]; 
+    float v = pts[programIndex].x;
+
+Thanks to SOA layout being a first-class concept in the language's type
+system, it's easy to write functions that convert data between the
+layouts.  For example, the ``aos_to_soa`` function below converts ``count``
+elements of the given ``Point`` type from AOS to 8-wide SOA layout.  (It
+assumes that the caller has pre-allocated sufficient space in the
+``pts_soa`` output array.)
+
+::
+
+    void aos_to_soa(uniform Point pts_aos[], uniform int count,
+                    soa<8> Point pts_soa[]) {
+         foreach (i = 0 ... count)
+             pts_soa[i] = pts_aos[i];
+    }
+
+Analogously, a function could be written to convert back from SOA to AOS if
+needed.
+
+
 Tips and Techniques
 ===================

@@ -339,6 +410,12 @@ based on the index, it can be worth doing.  See the example
 ``examples/volume_rendering`` in the ``ispc`` distribution for the use of
 this technique in an instance where it is beneficial to performance.

+Understanding Memory Read Coalescing
+------------------------------------
+
+XXXX todo
+
+
 Avoid 64-bit Addressing Calculations When Possible
 --------------------------------------------------

@@ -568,7 +645,7 @@ mask of all lanes currently executing (assuming a four-wide gang size
 target machine).

 For a fuller example of the utility of this functionality, see
-``examples/aobench_instrumented`` in the ``ispc`` distribution.  Ths
+``examples/aobench_instrumented`` in the ``ispc`` distribution.  This
 example includes an implementation of the ``ISPCInstrument()`` function
 that collects aggregate data about the program's execution behavior.

--- a/docs/template-news.txt
+++ b/docs/template-news.txt
@@ -0,0 +1,65 @@
+%(head_prefix)s
+%(head)s
+<script type="text/javascript">
+
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-1486404-4']);
+  _gaq.push(['_trackPageview']);
+
+  (function() {
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+  })();
+
+</script>
+%(stylesheet)s
+%(body_prefix)s
+<div id="wrap">
+  <div id="wrap2">
+    <div id="header">
+      <h1 id="logo">Intel SPMD Program Compiler</h1>
+      <div id="slogan">An open-source compiler for high-performance SIMD programming on
+      the CPU</div>
+    </div>
+    <div id="nav">
+      <div id="nbar">
+        <ul>
+          <li><a href="index.html">Overview</a></li>
+          <li id="selected"><a href="news.html">News</a></li>
+          <li><a href="features.html">Features</a></li>
+          <li><a href="downloads.html">Downloads</a></li>
+          <li><a href="documentation.html">Documentation</a></li>
+          <li><a href="perf.html">Performance</a></li>
+        </ul>
+      </div>
+    </div>
+    <div id="content-wrap">
+      <div id="sidebar">
+          <div class="widgetspace">
+            <h1>Resources</h1>
+            <ul class="menu">
+              <li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
+              <li><a href="http://groups.google.com/group/ispc-users/">ispc
+              users mailing list</a></li>
+              <li><a href="http://groups.google.com/group/ispc-dev/">ispc
+              developers mailing list</a></li>
+              <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
+              <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
+              <li><a href="doxygen/index.html">Doxygen</a></li>
+            </ul>
+        </div>
+      </div>
+%(body_pre_docinfo)s
+%(docinfo)s
+<div id="content">
+%(body)s
+</div>
+    <div class="clearfix"></div>
+    <div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue  by: <a href="http://www.themebin.com/">ThemeBin</a>
+      <!-- Please Do Not remove this link, thank u -->
+      </div>
+      </div>
+      </div>
+      </div>
+%(body_suffix)s
--- a/docs/template-perf.txt
+++ b/docs/template-perf.txt
@@ -26,6 +26,7 @@
      <div id="nbar">
        <ul>
          <li><a href="index.html">Overview</a></li>
+          <li><a href="news.html">News</a></li>
          <li><a href="features.html">Features</a></li>
          <li><a href="downloads.html">Downloads</a></li>
          <li><a href="documentation.html">Documentation</a></li>
@@ -45,8 +46,7 @@
              developers mailing list</a></li>
              <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
              <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
-              <li><a href="doxygen/index.html">Doxygen documentation of
-              <tt>ispc</tt> source code</a></li>
+              <li><a href="doxygen/index.html">Doxygen</a></li>
            </ul>
        </div>
      </div>
@@ -56,7 +56,7 @@
 %(body)s
 </div>
    <div class="clearfix"></div>
-    <div id="footer"> &copy; 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue  by: <a href="http://www.themebin.com/">ThemeBin</a>
+    <div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue  by: <a href="http://www.themebin.com/">ThemeBin</a>
      <!-- Please Do Not remove this link, thank u -->
      </div>
      </div>
--- a/docs/template.txt
+++ b/docs/template.txt
@@ -26,6 +26,7 @@
      <div id="nbar">
        <ul>
          <li><a href="index.html">Overview</a></li>
+          <li><a href="news.html">News</a></li>
          <li><a href="features.html">Features</a></li>
          <li><a href="downloads.html">Downloads</a></li>
          <li id="selected"><a href="documentation.html">Documentation</a></li>
@@ -45,8 +46,7 @@
              developers mailing list</a></li>
              <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
              <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
-              <li><a href="doxygen/index.html">Doxygen documentation of
-              <tt>ispc</tt> source code</a></li>
+              <li><a href="doxygen/index.html">Doxygen</a></li>
            </ul>
        </div>
      </div>
@@ -56,7 +56,7 @@
 %(body)s
 </div>
    <div class="clearfix"></div>
-    <div id="footer"> &copy; 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue  by: <a href="http://www.themebin.com/">ThemeBin</a>
+    <div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue  by: <a href="http://www.themebin.com/">ThemeBin</a>
      <!-- Please Do Not remove this link, thank u -->
      </div>
      </div>
--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.

-PROJECT_NUMBER         = 1.1.0
+PROJECT_NUMBER         = 1.2.0

 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
--- a/examples/README.txt
+++ b/examples/README.txt
@@ -39,9 +39,6 @@ example implementation of this function that counts the number of times the
 callback is made and records some statistics about control flow coherence
 is provided in the instrument.cpp file.

-*** Note: on Linux, this example currently hits an assertion in LLVM during
-*** compilation
-

 Deferred
 ========
@@ -110,6 +107,13 @@ This program implements both the Black-Scholes and Binomial options pricing
 models in both ispc and regular serial C++ code.


+Perfbench
+=========
+
+This runs a number of microbenchmarks to measure system performance and
+code generation quality.
+
+
 RT
 ==

--- a/examples/aobench/Makefile
+++ b/examples/aobench/Makefile
@@ -1,39 +1,7 @@

-ARCH = $(shell uname)
+EXAMPLE=ao
+CPP_SRC=ao.cpp ao_serial.cpp
+ISPC_SRC=ao.ispc
+ISPC_TARGETS=sse2,sse4,avx

-TASK_CXX=../tasksys.cpp
-TASK_LIB=-lpthread
-TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
-
-CXX=g++
-CXXFLAGS=-Iobjs/ -O3 -Wall -m64
-ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4,avx --arch=x86-64
-
-ISPC_OBJS=objs/ao_ispc.o objs/ao_ispc_sse2.o objs/ao_ispc_sse4.o \
-	objs/ao_ispc_avx.o
-OBJS=objs/ao.o objs/ao_serial.o $(ISPC_OBJS) $(TASK_OBJ)
-
-default: ao
-
-.PHONY: dirs clean
-
-dirs:
-	/bin/mkdir -p objs/
-
-clean:
-	/bin/rm -rf objs *~ ao
-
-ao: dirs $(OBJS) $(TASK_OBJ)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
-
-objs/%.o: %.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/%.o: ../%.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/ao.o: objs/ao_ispc.h 
-
-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
-	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+include ../common.mk
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -50,7 +50,6 @@ struct Isect {
 struct Sphere {
    vec        center;
    float      radius;
-
 };

 struct Plane {
@@ -82,8 +81,8 @@ static inline void vnormalize(vec &v) {
 }


-static inline void
-ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
+static void
+ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);

@@ -103,7 +102,7 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {


 static inline void
-ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
+ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
    vec rs = ray.org - sphere.center;

    float B = dot(rs, ray.dir);
@@ -124,7 +123,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
 }


-static inline void
+static void
 orthoBasis(vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
@@ -147,8 +146,8 @@ orthoBasis(vec basis[3], vec n) {
 }


-static inline float
-ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3], 
+static float
+ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
                  RNGState &rngstate) {
    float eps = 0.0001f;
    vec p, n;
@@ -204,112 +203,52 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
 static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
                         uniform int h,  uniform int nsubsamples, 
                         uniform float image[]) {
-    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
-    static Sphere spheres[3] = {
+    static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
+    static uniform Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
        { { -0.5f, 0.0f, -3.0f }, 0.5f },
        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
    RNGState rngstate;

    seed_rng(&rngstate, y0);
+    float invSamples = 1.f / nsubsamples;

-    // Compute the mapping between the 'programCount'-wide program
-    // instances running in parallel and samples in the image.  
-    //
-    // For now, we'll always take four samples per pixel, so start by
-    // initializing du and dv with offsets into subpixel samples.  We'll
-    // take care of further updating du and dv for the case where we're
-    // doing more than 4 program instances in parallel shortly.
-    uniform float uSteps[4] = { 0, 1, 0, 1 };
-    uniform float vSteps[4] = { 0, 0, 1, 1 };
-    float du = uSteps[programIndex % 4] / nsubsamples;
-    float dv = vSteps[programIndex % 4] / nsubsamples;
+    foreach_tiled(y = y0 ... y1, x = 0 ... w, 
+                  u = 0 ... nsubsamples, v = 0 ... nsubsamples) {
+        float du = (float)u * invSamples, dv = (float)v * invSamples;

-    // Now handle the case where we are able to do more than one pixel's
-    // worth of work at once.  nx records the number of pixels in the x
-    // direction we do per iteration and ny the number in y.
-    uniform int nx = 1, ny = 1;
+        // Figure out x,y pixel in NDC
+        float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
+        float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
+        float ret = 0.f;
+        Ray ray;
+        Isect isect;

-    // FIXME: We actually need ny to be 1 regardless of the decomposition,
-    // since the task decomposition is one scanline high.
+        ray.org = 0.f;

-    if (programCount == 8) {
-        // Do two pixels at once in the x direction
-        nx = 2;
-        if (programIndex >= 4) 
-            // And shift the offsets for the second pixel's worth of work
-            ++du;
-    }
-    else if (programCount == 16) {
-        nx = 4;
-        ny = 1;
-        if (programIndex >= 4 && programIndex < 8)
-            ++du;
-        if (programIndex >= 8 && programIndex < 12)
-            du += 2;
-        if (programIndex >= 12)
-            du += 3;
-    }
+        // Poor man's perspective projection
+        ray.dir.x = px;
+        ray.dir.y = py;
+        ray.dir.z = -1.0;
+        vnormalize(ray.dir);

-    // Now loop over all of the pixels, stepping in x and y as calculated
-    // above.  (Assumes that ny divides y and nx divides x...)
-    for (uniform int y = y0; y < y1; y += ny) {
-        for (uniform int x = 0; x < w; x += nx)  {
-            // Figure out x,y pixel in NDC
-            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
-            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
-            float ret = 0.f;
-            Ray ray;
-            Isect isect;
+        isect.t   = 1.0e+17;
+        isect.hit = 0;

-            ray.org = 0.f;
+        for (uniform int snum = 0; snum < 3; ++snum)
+            ray_sphere_intersect(isect, ray, spheres[snum]);
+        ray_plane_intersect(isect, ray, plane);

-            // Poor man's perspective projection
-            ray.dir.x = px;
-            ray.dir.y = py;
-            ray.dir.z = -1.0;
-            vnormalize(ray.dir);
+        // Note use of 'coherent' if statement; the set of rays we
+        // trace will often all hit or all miss the scene
+        cif (isect.hit) {
+            ret = ambient_occlusion(isect, plane, spheres, rngstate);
+            ret *= invSamples * invSamples;

-            isect.t   = 1.0e+17;
-            isect.hit = 0;
-
-            for (uniform int snum = 0; snum < 3; ++snum)
-                ray_sphere_intersect(isect, ray, spheres[snum]);
-            ray_plane_intersect(isect, ray, plane);
-
-            // Note use of 'coherent' if statement; the set of rays we
-            // trace will often all hit or all miss the scene
-            cif (isect.hit)
-                ret = ambient_occlusion(isect, plane, spheres, rngstate);
-
-            // This is a little grungy; we have results for
-            // programCount-worth of values.  Because we're doing 2x2
-            // subsamples, we need to peel them off in groups of four,
-            // average the four values for each pixel, and update the
-            // output image.
-            //
-            // Store the varying value to a uniform array of the same size.
-            // See the discussion about communication among program
-            // instances in the ispc user's manual for more discussion on
-            // this idiom.
-            uniform float retArray[programCount];
-            retArray[programIndex] = ret;
-
-            // offset to the first pixel in the image
-            uniform int offset = 3 * (y * w + x);
-            for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
-                // Get the four sample values for this pixel
-                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
-                    retArray[p+3];
-
-                // Normalize by number of samples taken
-                sumret /= nsubsamples * nsubsamples; 
-                
-                // Store result in the image
-                image[offset+0] = sumret;
-                image[offset+1] = sumret;
-                image[offset+2] = sumret;
-            }
+            int offset = 3 * (y * w + x);
+            atomic_add_local(&image[offset], ret);
+            atomic_add_local(&image[offset+1], ret);
+            atomic_add_local(&image[offset+2], ret);
        }
    }
 }
@@ -329,5 +268,5 @@ static void task ao_task(uniform int width, uniform int height,

 export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
                          uniform float image[]) {
-    launch[h] < ao_task(w, h, nsubsamples, image) >;
+    launch[h] ao_task(w, h, nsubsamples, image);
 }
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -14,13 +14,13 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ ao

-ao: dirs objs/ao.o objs/instrument.o objs/ao_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread
+ao: objs/ao.o objs/instrument.o objs/ao_ispc.o ../tasksys.cpp
+	$(CXX) $(CXXFLAGS) -o $@ $^ -lm -lpthread

-objs/%.o: %.cpp
+objs/%.o: %.cpp dirs
 	$(CXX) $< $(CXXFLAGS) -c -o $@

 objs/ao.o: objs/ao_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
-	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+objs/%_ispc.h objs/%_ispc.o: %.ispc dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_instrumented_ispc.h
--- a/examples/aobench_instrumented/ao.ispc
+++ b/examples/aobench_instrumented/ao.ispc
@@ -329,5 +329,5 @@ static void task ao_task(uniform int width, uniform int height,

 export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
                          uniform float image[]) {
-    launch[h] < ao_task(w, h, nsubsamples, image) >;
+    launch[h] ao_task(w, h, nsubsamples, image);
 }
--- a/examples/common.mk
+++ b/examples/common.mk
@@ -0,0 +1,74 @@
+
+# Shared build rules for the ispc examples.  An example's Makefile sets
+# EXAMPLE (executable name), CPP_SRC (C++ sources), ISPC_SRC (the ispc
+# source file), ISPC_TARGETS and, optionally, ISPC_FLAGS, and then includes
+# this file.
+
+TASK_CXX=../tasksys.cpp
+TASK_LIB=-lpthread
+TASK_OBJ=tasksys.o
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O2 -m64
+LIBS=-lm $(TASK_LIB) -lstdc++
+ISPC=ispc -O2 --arch=x86-64 $(ISPC_FLAGS)
+ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
+	$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
+ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
+CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ))
+
+default: $(EXAMPLE)
+
+all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
+
+.PHONY: default all dirs clean
+
+# "dirs" is phony (always out of date), so every rule that writes into objs/
+# lists it as an order-only prerequisite (after "|"): the directory is
+# created first, but its creation never forces the target to be rebuilt.
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
+
+$(EXAMPLE): $(CPP_OBJS) $(ISPC_OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
+
+objs/%.o: %.cpp $(ISPC_HEADER) | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/%.o: ../%.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h
+
+# A single ispc run is declared to produce the header and all listed objects.
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc | dirs
+	$(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+
+# ispc emits C++ for the generic-4 target with sse4.h as its include file.
+objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) | dirs
+	$(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h
+
+objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp
+	$(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@
+
+$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
+
+# ispc emits C++ for the generic-16 target with generic-16.h as its include file.
+objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC) | dirs
+	$(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h
+
+objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
+	$(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@
+
+$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
+
+objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC) | dirs
+	$(ISPC) $< -o $@ --target=generic-1
+
+$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
--- a/examples/deferred/Makefile
+++ b/examples/deferred/Makefile
@@ -1,38 +1,8 @@

-ARCH = $(shell uname)
+EXAMPLE=deferred_shading
+CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
+ISPC_SRC=kernels.ispc
+ISPC_TARGETS=sse2,sse4-x2,avx-x2
+ISPC_FLAGS=--opt=fast-math

-TASK_CXX=../tasksys.cpp
-TASK_LIB=-lpthread
-TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
-
-CXX=g++
-CXXFLAGS=-Iobjs/ -O3 -Wall -m64
-ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64 --math-lib=fast
-
-OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/kernels_ispc_sse2.o \
-	objs/kernels_ispc_sse4.o objs/kernels_ispc_avx.o \
-	objs/dynamic_c.o objs/dynamic_cilk.o
-
-default: deferred_shading
-
-.PHONY: dirs clean
-.PRECIOUS: objs/kernels_ispc.h
-
-dirs:
-	/bin/mkdir -p objs/
-
-clean:
-	/bin/rm -rf objs *~ deferred_shading
-
-deferred_shading: dirs $(OBJS) $(TASK_OBJ)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
-
-objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/%.o: ../%.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
-	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+include ../common.mk
--- a/examples/deferred/common.cpp
+++ b/examples/deferred/common.cpp
@@ -204,6 +204,7 @@ void WriteFrame(const char *filename, const InputData *input,
    fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth, 
            input->header.framebufferHeight);
    fwrite(framebufferAOS, imageBytes, 1, out);
+    fclose(out);

    lAlignedFree(framebufferAOS);
 }
--- a/examples/deferred/kernels.ispc
+++ b/examples/deferred/kernels.ispc
@@ -35,35 +35,35 @@

 struct InputDataArrays
 {
-    uniform float * uniform zBuffer;
-    uniform unsigned int16 * uniform normalEncoded_x; // half float
-    uniform unsigned int16 * uniform normalEncoded_y; // half float
-    uniform unsigned int16 * uniform specularAmount; // half float
-    uniform unsigned int16 * uniform specularPower; // half float
-    uniform unsigned int8 * uniform albedo_x; // unorm8
-    uniform unsigned int8 * uniform albedo_y; // unorm8
-    uniform unsigned int8 * uniform albedo_z; // unorm8
-    uniform float * uniform lightPositionView_x;
-    uniform float * uniform lightPositionView_y;
-    uniform float * uniform lightPositionView_z;
-    uniform float * uniform lightAttenuationBegin;
-    uniform float * uniform lightColor_x;
-    uniform float * uniform lightColor_y;
-    uniform float * uniform lightColor_z;
-    uniform float * uniform lightAttenuationEnd;
+    float *zBuffer;
+    unsigned int16 *normalEncoded_x; // half float
+    unsigned int16 *normalEncoded_y; // half float
+    unsigned int16 *specularAmount; // half float
+    unsigned int16 *specularPower; // half float
+    unsigned int8 *albedo_x; // unorm8
+    unsigned int8 *albedo_y; // unorm8
+    unsigned int8 *albedo_z; // unorm8
+    float *lightPositionView_x;
+    float *lightPositionView_y;
+    float *lightPositionView_z;
+    float *lightAttenuationBegin;
+    float *lightColor_x;
+    float *lightColor_y;
+    float *lightColor_z;
+    float *lightAttenuationEnd;
 };

 struct InputHeader
 {
-    uniform float cameraProj[4][4];
-    uniform float cameraNear;
-    uniform float cameraFar;
+    float cameraProj[4][4];
+    float cameraNear;
+    float cameraFar;

-    uniform int32 framebufferWidth;
-    uniform int32 framebufferHeight;
-    uniform int32 numLights;
-    uniform int32 inputDataChunkSize;
-    uniform int32 inputDataArrayOffsets[idaNum];
+    int32 framebufferWidth;
+    int32 framebufferHeight;
+    int32 numLights;
+    int32 inputDataChunkSize;
+    int32 inputDataArrayOffsets[idaNum];
 };


@@ -158,38 +158,22 @@ IntersectLightsWithTileMinMax(
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
        
-    // Parallize across frustum planes.
-    // We really only have four side planes here, but write the code to
-    // handle programCount > 4 robustly
-    uniform float frustumPlanes_xy[programCount];
-    uniform float frustumPlanes_z[programCount];
+    uniform float frustumPlanes_xy[4] = {
+        -(cameraProj_11 * gBufferScale_x),
+         (cameraProj_11 * gBufferScale_x),
+         (cameraProj_22 * gBufferScale_y),
+        -(cameraProj_22 * gBufferScale_y) };
+    uniform float frustumPlanes_z[4] = {
+         tileEndX - gBufferScale_x,
+        -tileStartX + gBufferScale_x,
+         tileEndY - gBufferScale_y,
+        -tileStartY + gBufferScale_y };

-    // TODO: If programIndex < 4 here? Don't care about masking off the
-    // rest but if interleaving ("x2" modes) the other lanes should ideally
-    // not be emitted...
-    {
-        // This one is totally constant over the whole screen... worth pulling it up at all?
-        float frustumPlanes_xy_v;
-        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
-        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_11 * gBufferScale_x));
-        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2,  (cameraProj_22 * gBufferScale_y));
-        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));
-    
-        float frustumPlanes_z_v;
-        frustumPlanes_z_v = insert(frustumPlanes_z_v, 0,  tileEndX - gBufferScale_x);
-        frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
-        frustumPlanes_z_v = insert(frustumPlanes_z_v, 2,  tileEndY - gBufferScale_y);
-        frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);
-
-        // Normalize
-        float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
-                           frustumPlanes_z_v * frustumPlanes_z_v);
-            frustumPlanes_xy_v *= norm;
-            frustumPlanes_z_v *= norm;
-
-        // Save out for uniform use later
-        frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
-        frustumPlanes_z[programIndex] = frustumPlanes_z_v;
+    for (uniform int i = 0; i < 4; ++i) {
+        uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + 
+                                   frustumPlanes_z[i] * frustumPlanes_z[i]);
+        frustumPlanes_xy[i] *= norm;
+        frustumPlanes_z[i] *= norm;
    }

    uniform int32 tileNumLights = 0;
@@ -530,9 +514,9 @@ RenderStatic(uniform InputHeader &inputHeader,

    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
    // by MIN_TILE_HEIGHT pixels.
-    launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
-                                    inputHeader, inputData, visualizeLightCount,
-                                    framebuffer_r, framebuffer_g, framebuffer_b) >;
+    launch[num_groups] RenderTile(num_groups_x, num_groups_y,
+                                  inputHeader, inputData, visualizeLightCount,
+                                  framebuffer_r, framebuffer_g, framebuffer_b);
 }


@@ -591,8 +575,6 @@ SplitTileMinMax(
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Outputs
-    // TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the
-    // indexing math ourselves
    uniform int32 subtileIndices[],
    uniform int32 subtileIndicesPitch,
    uniform int32 subtileNumLights[]
@@ -601,30 +583,20 @@ SplitTileMinMax(
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
        
-    // Parallize across frustum planes
-    // Only have 2 frustum split planes here so may not be worth it, but
-    // we'll do it for now for consistency
-    uniform float frustumPlanes_xy[programCount];
-    uniform float frustumPlanes_z[programCount];
-
-    // This one is totally constant over the whole screen... worth pulling it up at all?
-    float frustumPlanes_xy_v;
-    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
-    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_22 * gBufferScale_y));
-    
-    float frustumPlanes_z_v;
-    frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
-    frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
+    uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
+                                           (cameraProj_22 * gBufferScale_y) };
+    uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
+                                         tileMidY - gBufferScale_y };

    // Normalize
-    float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
-                       frustumPlanes_z_v * frustumPlanes_z_v);
-    frustumPlanes_xy_v *= norm;
-    frustumPlanes_z_v *= norm;
-
-    // Save out for uniform use later
-    frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
-    frustumPlanes_z[programIndex] = frustumPlanes_z_v;
+    uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] + 
+                                    frustumPlanes_z[0] * frustumPlanes_z[0]),
+                              rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] + 
+                                    frustumPlanes_z[1] * frustumPlanes_z[1]) };
+    frustumPlanes_xy[0] *= norm[0];
+    frustumPlanes_xy[1] *= norm[1];
+    frustumPlanes_z[0] *= norm[0];
+    frustumPlanes_z[1] *= norm[1];

    // Initialize
    uniform int32 subtileLightOffset[4];
--- a/examples/examples.sln
+++ b/examples/examples.sln
@@ -23,6 +23,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "perfbench", "perfbench\perfbench.vcxproj", "{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
@@ -119,6 +121,14 @@ Global
 		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
 		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
 		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.ActiveCfg = Debug|Win32
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.Build.0 = Debug|Win32
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.ActiveCfg = Debug|x64
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.Build.0 = Debug|x64
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.ActiveCfg = Release|Win32
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.Build.0 = Release|Win32
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.ActiveCfg = Release|x64
+		{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
--- a/examples/mandelbrot/Makefile
+++ b/examples/mandelbrot/Makefile
@@ -1,30 +1,7 @@

-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
-ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
+EXAMPLE=mandelbrot
+CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
+ISPC_SRC=mandelbrot.ispc
+ISPC_TARGETS=sse2,sse4-x2,avx-x2

-default: mandelbrot
-
-.PHONY: dirs clean
-
-dirs:
-	/bin/mkdir -p objs/
-
-clean:
-	/bin/rm -rf objs *~ mandelbrot
-
-OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc_sse2.o \
-	objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o \
-	objs/mandelbrot_ispc.o
-
-mandelbrot: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
-
-objs/%.o: %.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/mandelbrot.o: objs/mandelbrot_ispc.h 
-
-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
-	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+include ../common.mk
--- a/examples/mandelbrot_tasks/Makefile
+++ b/examples/mandelbrot_tasks/Makefile
@@ -1,39 +1,7 @@

-ARCH = $(shell uname)
+EXAMPLE=mandelbrot
+CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
+ISPC_SRC=mandelbrot.ispc
+ISPC_TARGETS=sse2,sse4-x2,avx-x2

-TASK_CXX=../tasksys.cpp
-TASK_LIB=-lpthread
-TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
-
-CXX=g++
-CXXFLAGS=-Iobjs/ -O3 -Wall -m64
-ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
-
-OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o $(TASK_OBJ) \
-	objs/mandelbrot_ispc.o objs/mandelbrot_ispc_sse2.o \
-	objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o 
-
-default: mandelbrot
-
-.PHONY: dirs clean
-
-dirs:
-	/bin/mkdir -p objs/
-
-clean:
-	/bin/rm -rf objs *~ mandelbrot
-
-mandelbrot: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
-
-objs/%.o: %.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/%.o: ../%.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/mandelbrot.o: objs/mandelbrot_ispc.h 
-
-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
-	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+include ../common.mk
--- a/examples/mandelbrot_tasks/mandelbrot.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot.ispc
@@ -49,17 +49,16 @@ mandel(float c_re, float c_im, int count) {
 }


-/* Task to compute the Mandelbrot iterations for a span of scanlines from
-   [ystart,yend).
+/* Task to compute the Mandelbrot iterations for 'span' consecutive scanlines.
 */
 task void
-mandelbrot_scanlines(uniform int ybase, uniform int span,
-                     uniform float x0, uniform float dx, 
-                     uniform float y0, uniform float dy,
-                     uniform int width, uniform int maxIterations,
-                     uniform int output[]) {
-    uniform int ystart = ybase + taskIndex * span;
-    uniform int yend = ystart + span;
+mandelbrot_scanline(uniform float x0, uniform float dx, 
+                    uniform float y0, uniform float dy,
+                    uniform int width, uniform int height, 
+                    uniform int span,
+                    uniform int maxIterations, uniform int output[]) {
+    uniform int ystart = taskIndex * span;
+    uniform int yend = min((taskIndex+1) * span, (unsigned int)height);

    foreach (yi = ystart ... yend, xi = 0 ... width) {
        float x = x0 + xi * dx;
@@ -71,20 +70,6 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
 }
                               

-task void
-mandelbrot_chunk(uniform float x0, uniform float dx,
-                 uniform float y0, uniform float dy,
-                 uniform int width, uniform int height,
-                 uniform int maxIterations, uniform int output[]) {
-    uniform int ystart = taskIndex * (height/taskCount);
-    uniform int yend = (taskIndex+1) * (height/taskCount);
-    uniform int span = 1;
-
-    launch[(yend-ystart)/span] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
-                                                      width, maxIterations, output) >;
-}
-
-
 export void
 mandelbrot_ispc(uniform float x0, uniform float y0, 
                uniform float x1, uniform float y1,
@@ -92,7 +77,8 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
                uniform int maxIterations, uniform int output[]) {
    uniform float dx = (x1 - x0) / width;
    uniform float dy = (y1 - y0) / height;
+    uniform int span = 4;  // number of scanlines handled by each launched task

-    launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height,
-                                  maxIterations, output) >;
+    launch[(height + span - 1) / span]  // round up: the last task may cover a partial span
+        mandelbrot_scanline(x0, dx, y0, dy, width, height, span, maxIterations, output);
 }
--- a/examples/noise/Makefile
+++ b/examples/noise/Makefile
@@ -1,29 +1,7 @@

-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
-ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4,avx-x2 --arch=x86-64
+EXAMPLE=noise
+CPP_SRC=$(EXAMPLE).cpp $(EXAMPLE)_serial.cpp
+ISPC_SRC=noise.ispc
+ISPC_TARGETS=sse2,sse4,avx-x2

-OBJS=objs/noise.o objs/noise_serial.o objs/noise_ispc.o objs/noise_ispc_sse2.o \
-	objs/noise_ispc_sse4.o objs/noise_ispc_avx.o 
-
-default: noise
-
-.PHONY: dirs clean
-
-dirs:
-	/bin/mkdir -p objs/
-
-clean:
-	/bin/rm -rf objs *~ noise
-
-noise: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
-
-objs/%.o: %.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/noise.o: objs/noise_ispc.h 
-
-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
-	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+include ../common.mk
--- a/examples/options/Makefile
+++ b/examples/options/Makefile
@@ -1,38 +1,7 @@

-TASK_CXX=../tasksys.cpp
-TASK_LIB=-lpthread
-TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+EXAMPLE=options
+CPP_SRC=options.cpp options_serial.cpp
+ISPC_SRC=options.ispc
+ISPC_TARGETS=sse2,sse4-x2,avx-x2

-
-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -g -Wall
-ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
-
-OBJS=objs/options.o objs/options_serial.o objs/options_ispc.o \
-	objs/options_ispc_sse2.o objs/options_ispc_sse4.o \
-	objs/options_ispc_avx.o $(TASK_OBJ)
-
-default: options
-
-.PHONY: dirs clean
-
-dirs:
-	/bin/mkdir -p objs/
-
-clean:
-	/bin/rm -rf objs *~ options
-
-options: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
-
-objs/%.o: %.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/%.o: ../%.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/options.o: objs/options_ispc.h options_defs.h
-
-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc options_defs.h
-	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+include ../common.mk
--- a/examples/options/options.ispc
+++ b/examples/options/options.ispc
@@ -77,7 +77,7 @@ black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float T
                         uniform float ra[], uniform float va[], 
                         uniform float result[], uniform int count) {
    uniform int nTasks = max((int)64, (int)count/16384);
-    launch[nTasks] < bs_task(Sa, Xa, Ta, ra, va, result, count) >;
+    launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
 }


@@ -150,5 +150,5 @@ binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
                        uniform float va[], uniform float result[], 
                        uniform int count) {
    uniform int nTasks = max((int)64, (int)count/16384);
-    launch[nTasks] < binomial_task(Sa, Xa, Ta, ra, va, result, count) >;
+    launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
 }
--- a/examples/perfbench/Makefile
+++ b/examples/perfbench/Makefile
@@ -0,0 +1,8 @@
+
+# Per-example settings read by ../common.mk, which is included at the bottom.
+EXAMPLE=perfbench
+CPP_SRC=perfbench.cpp perfbench_serial.cpp
+ISPC_SRC=perfbench.ispc
+ISPC_TARGETS=sse2,sse4,avx
+
+include ../common.mk
--- a/examples/perfbench/perfbench.cpp
+++ b/examples/perfbench/perfbench.cpp
@@ -0,0 +1,108 @@
+/*
+  Copyright (c) 2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <algorithm>
+#include "../timing.h"
+
+#include "perfbench_ispc.h"
+
+typedef void (FuncType)(float *, int, float *, float *);
+
+struct PerfTest {
+    FuncType *aFunc;
+    const char *aName;
+    FuncType *bFunc;
+    const char *bName;
+    const char *testName;
+};
+
+extern void xyzSumAOS(float *a, int count, float *zeros, float *result);
+extern void xyzSumSOA(float *a, int count, float *zeros, float *result);
+
+
+static void
+lInitData(float *ptr, int count) {
+    for (int i = 0; i < count; ++i)
+        ptr[i] = float(i) / (1024.f * 1024.f);
+}
+
+static PerfTest tests[] = { 
+    { xyzSumAOS, "serial", ispc::xyzSumAOS, "ispc", "AOS vector element sum (with coalescing)" },
+    { xyzSumAOS, "serial", ispc::xyzSumAOSStdlib, "ispc", "AOS vector element sum (stdlib swizzle)" },
+    { xyzSumAOS, "serial", ispc::xyzSumAOSNoCoalesce, "ispc", "AOS vector element sum (no coalescing)" },
+    { xyzSumSOA, "serial", ispc::xyzSumSOA, "ispc", "SOA vector element sum" },
+    { ispc::gathers, "gather", ispc::loads, "vector load", "Memory reads" },
+    { ispc::scatters, "scatter", ispc::stores, "vector store", "Memory writes" },
+};
+
+int main() {
+    int count = 3*64*1024;  // total floats in the working array
+    float *a = new float[count];  // released with delete[] before returning
+    float zeros[32] = { 0 };  // all-zero array handed to every benchmark function
+
+    int nTests = sizeof(tests) / sizeof(tests[0]);
+    for (int i = 0; i < nTests; ++i) {
+        lInitData(a, count);  // refill before timing the "a" variant
+        reset_and_start_timer();
+        float resultA[3] = { 0, 0, 0 };
+        for (int j = 0; j < 100; ++j)
+            tests[i].aFunc(a, count, zeros, resultA);
+        double aTime = get_elapsed_mcycles();
+
+        lInitData(a, count);  // refill again so the "b" variant sees the same input
+        reset_and_start_timer();
+        float resultB[3] = { 0, 0, 0 };
+        for (int j = 0; j < 100; ++j)
+            tests[i].bFunc(a, count, zeros, resultB);
+        double bTime = get_elapsed_mcycles();
+
+        printf("%-40s: [%.2f] M cycles %s, [%.2f] M cycles %s (%.2fx speedup).\n",
+               tests[i].testName, aTime, tests[i].aName, bTime, tests[i].bName,
+               aTime/bTime);
+#if 0
+        printf("\t(%f %f %f) - (%f %f %f)\n", resultA[0], resultA[1],
+               resultA[2], resultB[0], resultB[1], resultB[2]);
+#endif
+    }
+    delete[] a;
+    return 0;
+}
+
--- a/examples/perfbench/perfbench.ispc
+++ b/examples/perfbench/perfbench.ispc
@@ -0,0 +1,170 @@
+/*
+  Copyright (c) 2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+export void xyzSumAOS(uniform float array[], uniform int count,
+                      uniform float zeros[], uniform float result[]) {
+    float xsum = 0, ysum = 0, zsum = 0;
+    foreach (i = 0 ... count/3) {
+        float x = array[3*i];
+        float y = array[3*i+1];
+        float z = array[3*i+2];
+
+        xsum += x;
+        ysum += y;
+        zsum += z;
+    }
+    result[0] = reduce_add(xsum);
+    result[1] = reduce_add(ysum);
+    result[2] = reduce_add(zsum);
+}
+
+export void xyzSumAOSStdlib(uniform float array[], uniform int count,
+                            uniform float zeros[], uniform float result[]) {
+    float xsum = 0, ysum = 0, zsum = 0;
+    for (uniform int i = 0; i < 64*1024 /*count/3*/; i += programCount) {
+        float x, y, z;
+        aos_to_soa3(&array[3*i], &x, &y, &z);
+
+        xsum += x;
+        ysum += y;
+        zsum += z;
+    }
+    result[0] = reduce_add(xsum);
+    result[1] = reduce_add(ysum);
+    result[2] = reduce_add(zsum);
+}
+
+export void xyzSumAOSNoCoalesce(uniform float array[], uniform int count,
+                                uniform float zerosArray[], uniform float result[]) {
+    int zeros = zerosArray[programIndex];
+    float xsum = 0, ysum = 0, zsum = 0;
+    foreach (i = 0 ... count/3) {
+        float x = array[3*i+zeros];
+        float y = array[3*i+1+zeros];
+        float z = array[3*i+2+zeros];
+
+        xsum += x;
+        ysum += y;
+        zsum += z;
+    }
+    result[0] = reduce_add(xsum);
+    result[1] = reduce_add(ysum);
+    result[2] = reduce_add(zsum);
+}
+
+export void xyzSumSOA(uniform float array[], uniform int count,
+                      uniform float zeros[], uniform float result[]) {
+    float xsum = 0, ysum = 0, zsum = 0;
+    uniform float * uniform ap = array;
+    assert(programCount <= 8);
+
+    for (uniform int i = 0; i < count/3; i += 8, ap += 24) {
+        for (uniform int j = 0; j < 8; j += programCount) {
+            float x = ap[j + programIndex];
+            float y = ap[8 + j + programIndex];
+            float z = ap[16 + j + programIndex];
+
+            xsum += x;
+            ysum += y;
+            zsum += z;
+        }
+    }
+    result[0] = reduce_add(xsum);
+    result[1] = reduce_add(ysum);
+    result[2] = reduce_add(zsum);
+}
+
+export void gathers(uniform float array[], uniform int count,
+                    uniform float zeros[], uniform float result[]) {
+    float sum = 0;
+    int zero = zeros[programIndex];
+    foreach (i = 0 ... count)
+        sum += array[i + zero];
+    result[0] = reduce_add(sum);
+}
+
+
+export void loads(uniform float array[], uniform int count,
+                  uniform float zeros[], uniform float result[]) {
+    float sum = 0;
+    foreach (i = 0 ... count)
+        sum += array[i];
+    result[0] = reduce_add(sum);
+}
+
+
+export void scatters(uniform float array[], uniform int count,
+                     uniform float zeros[], uniform float result[]) {
+    int zero = zeros[programIndex];
+    foreach (i = 0 ... count)
+        array[i + zero] = zero;
+}
+
+
+export void stores(uniform float array[], uniform int count,
+                   uniform float zeros[], uniform float result[]) {
+    int zero = zeros[programIndex];
+    foreach (i = 0 ... count)
+        array[i] = zero;
+}
+
+export void normalizeAOSNoCoalesce(uniform float array[], uniform int count,
+                                   uniform float zeroArray[]) {
+    int zeros = zeroArray[programIndex];  // per-lane index offset; int, as in xyzSumAOSNoCoalesce
+    foreach (i = 0 ... count/3) {
+        float x = array[3*i+zeros];
+        float y = array[3*i+1+zeros];
+        float z = array[3*i+2+zeros];
+
+        float l2 = x*x + y*y + z*z;
+        // Divide each component by l2 (the sum of squares); the stores use the plain index.
+        array[3*i] /= l2;
+        array[3*i+1] /= l2;
+        array[3*i+2] /= l2;
+    }
+}
+
+export void normalizeSOA(uniform float array[], uniform int count,
+                         uniform float zeros[]) {
+    foreach (i = 0 ... count/3) {
+        float x = array[3*i];
+        float y = array[3*i+1];
+        float z = array[3*i+2];
+
+        float l2 = x*x + y*y + z*z;
+        
+        array[3*i] /= l2;
+        array[3*i+1] /= l2;
+        array[3*i+2] /= l2;
+    }
+}
--- a/examples/perfbench/perfbench.vcxproj
+++ b/examples/perfbench/perfbench.vcxproj
@@ -0,0 +1,175 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{d923bb7e-a7c8-4850-8fcf-0eb9ce35b4e8}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>perfbench</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="perfbench.cpp" />
+    <ClCompile Include="perfbench_serial.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="perfbench.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/perfbench/perfbench_serial.cpp
+++ b/examples/perfbench/perfbench_serial.cpp
@@ -0,0 +1,78 @@
+/*
+  Copyright (c) 2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <math.h>
+
+// Serial version: sums the x, y and z components of points stored
+// interleaved (array-of-structures) as x0 y0 z0 x1 y1 z1 ...
+//   a       input floats, read as consecutive (x, y, z) triples
+//   count   number of floats in a (three per point)
+//   zeros   not referenced by this function
+//   result  receives the sums: result[0] = x, result[1] = y, result[2] = z
+void
+xyzSumAOS(float *a, int count, float *zeros, float *result) {
+    float xsum = 0, ysum = 0, zsum = 0;
+    // Only consume complete triples: when count is not a multiple of 3 the
+    // trailing one or two floats are skipped instead of reading a[i+1] /
+    // a[i+2] past the end of the array.
+    for (int i = 0; i + 2 < count; i += 3) {
+        xsum += a[i];
+        ysum += a[i+1];
+        zsum += a[i+2];
+    }
+    result[0] = xsum;
+    result[1] = ysum;
+    result[2] = zsum;
+}
+
+// Serial version of the same sum for a blocked (structure-of-arrays)
+// layout: each block of 24 floats holds 8 x values, then 8 y values, then
+// 8 z values.  count is the number of floats, so count/3 points are summed.
+// No bounds checking is done: for point i the z component is read at index
+// (i >> 3) * 24 + (i & 7) + 16, so a must extend at least that far.
+//   zeros   not referenced by this function
+//   result  receives the sums: result[0] = x, result[1] = y, result[2] = z
+void
+xyzSumSOA(float *a, int count, float *zeros, float *result) {
+    float xsum = 0, ysum = 0, zsum = 0;
+    for (int i = 0; i < count/3; ++i) {
+        // Block i/8 starts at (i >> 3) * 24; lane i%8 selects the point.
+        float *p = a + (i >> 3) * 24 + (i & 7);
+        xsum += p[0];
+        ysum += p[8];
+        zsum += p[16];
+    }
+    result[0] = xsum;
+    result[1] = ysum;
+    result[2] = zsum;
+}
--- a/examples/rt/Makefile
+++ b/examples/rt/Makefile
@@ -1,38 +1,7 @@

-ARCH = $(shell uname)
+EXAMPLE=rt
+CPP_SRC=rt.cpp rt_serial.cpp
+ISPC_SRC=rt.ispc
+ISPC_TARGETS=sse2,sse4-x2,avx

-TASK_CXX=../tasksys.cpp
-TASK_LIB=-lpthread
-TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
-
-CXX=g++
-CXXFLAGS=-Iobjs/ -O3 -Wall -m64
-ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
-
-OBJS=objs/rt.o objs/rt_serial.o $(TASK_OBJ) objs/rt_ispc.o objs/rt_ispc_sse2.o \
-	objs/rt_ispc_sse4.o objs/rt_ispc_avx.o
-
-default: rt
-
-.PHONY: dirs clean
-
-dirs:
-	/bin/mkdir -p objs/
-
-clean:
-	/bin/rm -rf objs *~ rt
-
-rt: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
-
-objs/%.o: %.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/%.o: ../%.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/rt.o: objs/rt_ispc.h 
-
-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
-	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+include ../common.mk
--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -43,17 +43,17 @@ struct Ray {
 };

 struct Triangle {
-    uniform float p[3][4];
-    uniform int id;
-    uniform int pad[3];
+    float p[3][4];
+    int id;
+    int pad[3];
 };

 struct LinearBVHNode {
-    uniform float bounds[2][3];
-    uniform unsigned int offset;     // num primitives for leaf, second child for interior
-    uniform unsigned int8 nPrimitives;
-    uniform unsigned int8 splitAxis;
-    uniform unsigned int16 pad;
+    float bounds[2][3];
+    unsigned int offset;     // num primitives for leaf, second child for interior
+    unsigned int8 nPrimitives;
+    unsigned int8 splitAxis;
+    unsigned int16 pad;
 };

 static inline float3 Cross(const float3 v1, const float3 v2) {
@@ -88,9 +88,12 @@ static void generateRay(uniform const float raster2camera[4][4],
    camy /= camw;
    camz /= camw;

-    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
-    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
-    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
+    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + 
+        camera2world[0][2] * camz;
+    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + 
+        camera2world[1][2] * camz;
+    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + 
+        camera2world[2][2] * camz;

    ray.origin.x = camera2world[0][3] / camera2world[3][3];
    ray.origin.y = camera2world[1][3] / camera2world[3][3];
@@ -104,8 +107,8 @@ static void generateRay(uniform const float raster2camera[4][4],
 }


-static inline bool BBoxIntersect(const uniform float bounds[2][3], 
-                                 const Ray &ray) {
+static bool BBoxIntersect(const uniform float bounds[2][3], 
+                          const Ray &ray) {
    uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
    uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
    float t0 = ray.mint, t1 = ray.maxt;
@@ -143,7 +146,7 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3],



-static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
+static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
    uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
    uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
    uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
@@ -183,8 +186,8 @@ static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
 }


-bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[], 
-                  Ray &r) {
+bool BVHIntersect(const uniform LinearBVHNode nodes[], 
+                  const uniform Triangle tris[], Ray &r) {
    Ray ray = r;
    bool hit = false;
    // Follow ray through BVH nodes to find primitive intersections
@@ -193,7 +196,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],

    while (true) {
        // Check ray against BVH node
-        LinearBVHNode node = nodes[nodeNum];
+        uniform LinearBVHNode node = nodes[nodeNum];
        if (any(BBoxIntersect(node.bounds, ray))) {
            uniform unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
@@ -239,8 +242,8 @@ static void raytrace_tile(uniform int x0, uniform int x1,
                          const uniform float raster2camera[4][4], 
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
-                          const LinearBVHNode nodes[],
-                          const Triangle triangles[]) {
+                          const uniform LinearBVHNode nodes[],
+                          const uniform Triangle triangles[]) {
    uniform float widthScale = (float)(baseWidth) / (float)(width);
    uniform float heightScale = (float)(baseHeight) / (float)(height);

@@ -262,8 +265,8 @@ export void raytrace_ispc(uniform int width, uniform int height,
                          const uniform float raster2camera[4][4], 
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
-                          const LinearBVHNode nodes[],
-                          const Triangle triangles[]) {
+                          const uniform LinearBVHNode nodes[],
+                          const uniform Triangle triangles[]) {
    raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
@@ -275,8 +278,8 @@ task void raytrace_tile_task(uniform int width, uniform int height,
                             const uniform float raster2camera[4][4], 
                             const uniform float camera2world[4][4],
                             uniform float image[], uniform int id[],
-                             const LinearBVHNode nodes[],
-                             const Triangle triangles[]) {
+                             const uniform LinearBVHNode nodes[],
+                             const uniform Triangle triangles[]) {
    uniform int dx = 16, dy = 16; // must match dx, dy below
    uniform int xBuckets = (width + (dx-1)) / dx;
    uniform int x0 = (taskIndex % xBuckets) * dx;
@@ -295,14 +298,14 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
                                const uniform float raster2camera[4][4], 
                                const uniform float camera2world[4][4],
                                uniform float image[], uniform int id[],
-                                const LinearBVHNode nodes[],
-                                const Triangle triangles[]) {
+                                const uniform LinearBVHNode nodes[],
+                                const uniform Triangle triangles[]) {
    uniform int dx = 16, dy = 16;
    uniform int xBuckets = (width + (dx-1)) / dx;
    uniform int yBuckets = (height + (dy-1)) / dy;
    uniform int nTasks = xBuckets * yBuckets;
-    launch[nTasks] < raytrace_tile_task(width, height, baseWidth, baseHeight, 
-                                        raster2camera, camera2world, 
-                                        image, id, nodes, triangles) >;
+    launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight, 
+                                      raster2camera, camera2world, 
+                                      image, id, nodes, triangles);
 }

--- a/examples/rt/rt_serial.cpp
+++ b/examples/rt/rt_serial.cpp
@@ -123,9 +123,12 @@ static void generateRay(const float raster2camera[4][4],
    camy /= camw;
    camz /= camw;

-    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
-    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
-    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
+    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
+        camera2world[0][2] * camz;
+    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
+        camera2world[1][2] * camz;
+    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
+        camera2world[2][2] * camz;

    ray.origin.x = camera2world[0][3] / camera2world[3][3];
    ray.origin.y = camera2world[1][3] / camera2world[3][3];
--- a/examples/stencil/Makefile
+++ b/examples/stencil/Makefile
@@ -1,39 +1,7 @@

-ARCH = $(shell uname)
+EXAMPLE=stencil
+CPP_SRC=stencil.cpp stencil_serial.cpp
+ISPC_SRC=stencil.ispc
+ISPC_TARGETS=sse2,sse4-x2,avx-x2

-TASK_CXX=../tasksys.cpp
-TASK_LIB=-lpthread
-TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
-
-CXX=g++
-CXXFLAGS=-Iobjs/ -O3 -Wall -m64
-ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
-
-OBJS=objs/stencil.o objs/stencil_serial.o $(TASK_OBJ) objs/stencil_ispc.o \
-	objs/stencil_ispc_sse2.o objs/stencil_ispc_sse4.o \
-	objs/stencil_ispc_avx.o
-
-default: stencil
-
-.PHONY: dirs clean
-
-dirs:
-	/bin/mkdir -p objs/
-
-clean:
-	/bin/rm -rf objs *~ stencil
-
-stencil: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
-
-objs/%.o: %.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/%.o: ../%.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/stencil.o: objs/stencil_ispc.h 
-
-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
-	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+include ../common.mk
--- a/examples/stencil/stencil.ispc
+++ b/examples/stencil/stencil.ispc
@@ -41,27 +41,23 @@ stencil_step(uniform int x0, uniform int x1,
             uniform const float Ain[], uniform float Aout[]) {
    const uniform int Nxy = Nx * Ny;

-    for (uniform int z = z0; z < z1; ++z) {
-        for (uniform int y = y0; y < y1; ++y) {
-            foreach (x = x0 ... x1) {
-                int index = (z * Nxy) + (y * Nx) + x;
+    foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
+        int index = (z * Nxy) + (y * Nx) + x;
 #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
 #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
-                float div = coef[0] * A_cur(0, 0, 0) +
-                            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
-                                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
-                                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
-                            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
-                                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
-                                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
-                            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
-                                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
-                                       A_cur(0, 0, +3) + A_cur(0, 0, -3));
+        float div = coef[0] * A_cur(0, 0, 0) +
+            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+                       A_cur(0, 0, +3) + A_cur(0, 0, -3));

-                A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + 
-                    vsq[index] * div;
-            }
-        }
+        A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + 
+            vsq[index] * div;
    }
 }

@@ -69,11 +65,12 @@ stencil_step(uniform int x0, uniform int x1,
 static task void
 stencil_step_task(uniform int x0, uniform int x1,
                  uniform int y0, uniform int y1,
-                  uniform int z0, uniform int z1,
+                  uniform int z0,
                  uniform int Nx, uniform int Ny, uniform int Nz,
                  uniform const float coef[4], uniform const float vsq[],
                  uniform const float Ain[], uniform float Aout[]) {
-    stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Ain, Aout);
+    stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,
+                 Nx, Ny, Nz, coef, vsq, Ain, Aout);
 }


@@ -89,17 +86,14 @@ loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
 {
    for (uniform int t = t0; t < t1; ++t) {
        // Parallelize across cores as well: each task will work on a slice
-        // of "dz" in the z extent of the volume.  (dz=1 seems to work
-        // better than any larger values.)
-        uniform int dz = 1;
-        for (uniform int z = z0; z < z1; z += dz) {
-            if ((t & 1) == 0)
-                launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, 
-                                           coef, vsq, Aeven, Aodd) >;
-            else
-                launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, 
-                                           coef, vsq, Aodd, Aeven) >;
-        }
+        // of 1 in the z extent of the volume.
+        if ((t & 1) == 0)
+            launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, 
+                                            coef, vsq, Aeven, Aodd);
+        else
+            launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, 
+                                            coef, vsq, Aodd, Aeven);
+
        // We need to wait for all of the launched tasks to finish before
        // starting the next iteration.
        sync;
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -273,7 +273,7 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
 #else
    void *result;
 #if (ISPC_POINTER_BYTES == 4)
-    __asm__ __volatile__("lock\ncmpxchgd %2,%1"
+    __asm__ __volatile__("lock\ncmpxchgl %2,%1"
                          : "=a"(result), "=m"(*v)
                          : "q"(newValue), "0"(oldValue)
                          : "memory");
--- a/examples/volume_rendering/Makefile
+++ b/examples/volume_rendering/Makefile
@@ -1,38 +1,7 @@

-ARCH = $(shell uname)
+EXAMPLE=volume
+CPP_SRC=volume.cpp volume_serial.cpp
+ISPC_SRC=volume.ispc
+ISPC_TARGETS=sse2,sse4-x2,avx

-TASK_CXX=../tasksys.cpp
-TASK_LIB=-lpthread
-TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
-
-CXX=g++
-CXXFLAGS=-Iobjs/ -O3 -Wall -m64
-ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64
-
-OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \
-	objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o
-
-default: volume
-
-.PHONY: dirs clean
-
-dirs:
-	/bin/mkdir -p objs/
-
-clean:
-	/bin/rm -rf objs *~ volume
-
-volume: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
-
-objs/%.o: %.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/%.o: ../%.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
-objs/volume.o: objs/volume_ispc.h 
-
-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o: %.ispc
-	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+include ../common.mk
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -124,24 +124,13 @@ static inline float D(int x, int y, int z, uniform int nVoxels[3],
 }


-static inline float Du(uniform int x, uniform int y, uniform int z, 
-                       uniform int nVoxels[3], uniform float density[]) {
-    x = clamp(x, 0, nVoxels[0]-1);
-    y = clamp(y, 0, nVoxels[1]-1);
-    z = clamp(z, 0, nVoxels[2]-1);
-
-    return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
-}
-
-
 static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
    return (p - pMin) / (pMax - pMin);
 }


-static inline float Density(float3 Pobj, float3 pMin, float3 pMax, 
-                            uniform float density[], uniform int nVoxels[3],
-                            uniform bool &checkForSameVoxel) {
+static float Density(float3 Pobj, float3 pMin, float3 pMax, 
+                     uniform float density[], uniform int nVoxels[3]) {
    if (!Inside(Pobj, pMin, pMax)) 
        return 0;
    // Compute voxel coordinates and offsets for _Pobj_
@@ -153,39 +142,14 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
    float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;

    // Trilinearly interpolate density values to compute local density
-    float d00, d10, d01, d11;
-    uniform int uvx, uvy, uvz;
-    if (checkForSameVoxel && reduce_equal(vx, &uvx) && reduce_equal(vy, &uvy) &&
-        reduce_equal(vz, &uvz)) {
-        // If all of the program instances are inside the same voxel, then
-        // we'll call the 'uniform' variant of the voxel density lookup
-        // function, thus doing a single load for each value rather than a
-        // gather.
-        d00 = Lerp(dx, Du(uvx, uvy, uvz, nVoxels, density),     
-                       Du(uvx+1, uvy, uvz, nVoxels, density));
-        d10 = Lerp(dx, Du(uvx, uvy+1, uvz, nVoxels, density),   
-                       Du(uvx+1, uvy+1, uvz, nVoxels, density));
-        d01 = Lerp(dx, Du(uvx, uvy, uvz+1, nVoxels, density),   
-                       Du(uvx+1, uvy, uvz+1, nVoxels, density));
-        d11 = Lerp(dx, Du(uvx, uvy+1, uvz+1, nVoxels, density), 
-                       Du(uvx+1, uvy+1, uvz+1, nVoxels, density));
-    }
-    else {
-        // Otherwise, we have to do an actual gather in the more general
-        // D() function.  Once the reduce_equal tests above fail, we stop
-        // checking in subsequent steps, since it's unlikely that this will
-        // be true in the future once they've diverged into different
-        // voxels.
-        checkForSameVoxel = false;
-        d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),     
-                       D(vx+1, vy, vz, nVoxels, density));
-        d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),   
-                       D(vx+1, vy+1, vz, nVoxels, density));
-        d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),   
-                       D(vx+1, vy, vz+1, nVoxels, density));
-        d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), 
-                       D(vx+1, vy+1, vz+1, nVoxels, density));
-    }
+    float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),     
+                     D(vx+1, vy, vz, nVoxels, density));
+    float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),   
+                     D(vx+1, vy+1, vz, nVoxels, density));
+    float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),   
+                     D(vx+1, vy, vz+1, nVoxels, density));
+    float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), 
+                     D(vx+1, vy+1, vz+1, nVoxels, density));
    float d0 = Lerp(dy, d00, d10);
    float d1 = Lerp(dy, d01, d11);
    return Lerp(dz, d0, d1);
@@ -221,10 +185,8 @@ transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
-    uniform bool checkForSameVoxel = true;
    while (t < rayT1) {
-        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels,
-                                            checkForSameVoxel);
+        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
        pos = pos + dirStep;
        t += stepT;
    }
@@ -268,9 +230,8 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
-    uniform bool checkForSameVoxel = true;
    cwhile (t < rayT1) {
-        float d = Density(pos, pMin, pMax, density, nVoxels, checkForSameVoxel);
+        float d = Density(pos, pMin, pMax, density, nVoxels);

        // terminate once attenuation is high
        float atten = exp(-tau);
@@ -375,6 +336,6 @@ volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
    // Launch tasks to work on (dx,dy)-sized tiles of the image
    uniform int dx = 8, dy = 8;
    uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
-    launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world, 
-                                 width, height, image) >;
+    launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world, 
+                               width, height, image);
 }
--- a/examples/volume_rendering/volume.vcxproj
+++ b/examples/volume_rendering/volume.vcxproj
@@ -156,18 +156,18 @@
  <ItemGroup>
    <CustomBuild Include="volume.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/expr.cpp
+++ b/expr.cpp
--- a/expr.h
+++ b/expr.h
@@ -299,7 +299,6 @@ public:
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
-    const Type *GetLValueType() const;
    Symbol *GetBaseSymbol() const;
    void Print() const;
    Expr *Optimize();
@@ -314,7 +313,6 @@ public:
    std::string identifier;
    const SourcePos identifierPos;

-protected:
    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
               SourcePos identifierPos, bool derefLValue);

@@ -389,6 +387,10 @@ public:
        with values given by the "vales" parameter. */
    ConstExpr(ConstExpr *old, double *values);

+    /** Create ConstExpr with the same type and values as the given one,
+        but at the given position. */
+    ConstExpr(ConstExpr *old, SourcePos pos);
+
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
    void Print() const;
@@ -493,8 +495,7 @@ private:
    probably-different type. */
 class TypeCastExpr : public Expr {
 public:
-    TypeCastExpr(const Type *t, Expr *e, bool preserveUniformity,
-                 SourcePos p);
+    TypeCastExpr(const Type *t, Expr *e, SourcePos p);

    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
@@ -507,7 +508,6 @@ public:

    const Type *type;
    Expr *expr;
-    bool preserveUniformity;
 };


@@ -634,13 +634,14 @@ public:
        being done just given type information without the parameter
        argument expressions being available.  It returns true on success.
     */
-    bool ResolveOverloads(const std::vector<const Type *> &argTypes,
+    bool ResolveOverloads(SourcePos argPos,
+                          const std::vector<const Type *> &argTypes,
                          const std::vector<bool> *argCouldBeNULL = NULL);
    Symbol *GetMatchingFunction();

 private:
    bool tryResolve(int (*matchFunc)(const Type *, const Type *),
-                    const std::vector<const Type *> &argTypes,
+                    SourcePos argPos, const std::vector<const Type *> &argTypes,
                    const std::vector<bool> *argCouldBeNULL);

    /** Name of the function that is being called. */
@@ -682,11 +683,44 @@ public:
    const Type *GetType() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    llvm::Constant *GetConstant(const Type *type) const;
    void Print() const;
    int EstimateCost() const;
 };


+/** An expression representing a "new" expression, used for dynamically
+    allocating memory. 
+*/
+class NewExpr : public Expr {
+public:
+    NewExpr(int typeQual, const Type *type, Expr *initializer, Expr *count, 
+            SourcePos tqPos, SourcePos p);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Expr *TypeCheck();
+    Expr *Optimize();
+    void Print() const;
+    int EstimateCost() const;
+
+    /** Type of object to allocate storage for. */
+    const Type *allocType;
+    /** Expression giving the number of elements to allocate, when the 
+        "new Foo[expr]" form is used.  This may be NULL, in which case a
+        single element of the given type will be allocated. */
+    Expr *countExpr;
+    /** Optional initializer expression used to initialize the allocated
+        memory. */
+    Expr *initExpr;
+    /** Indicates whether this is a "varying new" or "uniform new"
+        (i.e. whether a separate allocation is performed per program
+        instance, or whether a single allocation is performed for the
+        entire gang of program instances.) */
+    bool isVarying;
+};
+
+
 /** This function indicates whether it's legal to convert from fromType to
    toType.  If the optional errorMsgBase and source position parameters
    are provided, then an error message is issued if the type conversion
@@ -705,4 +739,20 @@ bool CanConvertTypes(const Type *fromType, const Type *toType,
 */
 Expr *TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase);

+/** Utility routine that emits code to initialize a symbol given an
+    initializer expression.
+
+    @param lvalue    Memory location of the storage that holds the
+                     symbol's data
+    @param symType   Type of variable being initialized
+    @param initExpr  Expression for the initializer
+    @param ctx       FunctionEmitContext to use for generating instructions
+    @param pos       Source file position of the variable being initialized
+*/
+void
+InitSymbol(llvm::Value *lvalue, const Type *symType, Expr *initExpr,
+           FunctionEmitContext *ctx, SourcePos pos);
+
+bool PossiblyResolveFunctionOverloads(Expr *expr, const Type *type);
+
 #endif // ISPC_EXPR_H
--- a/func.cpp
+++ b/func.cpp
@@ -72,17 +72,10 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
    code = c;

    maskSymbol = m->symbolTable->LookupVariable("__mask");
-    assert(maskSymbol != NULL);
+    Assert(maskSymbol != NULL);

    if (code != NULL) {
-        if (g->debugPrint) {
-            fprintf(stderr, "Creating function \"%s\".  Initial code:\n", 
-                    sym->name.c_str());
-            code->Print(0);
-            fprintf(stderr, "---------------------\n");
-        }
-
-        code = code->TypeCheck();
+        code = TypeCheck(code);

        if (code != NULL && g->debugPrint) {
            fprintf(stderr, "After typechecking function \"%s\":\n", 
@@ -92,7 +85,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
        }

        if (code != NULL) {
-            code = code->Optimize();
+            code = Optimize(code);
            if (g->debugPrint) {
                fprintf(stderr, "After optimizing function \"%s\":\n", 
                        sym->name.c_str());
@@ -109,7 +102,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
    }

    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
-    assert(type != NULL);
+    Assert(type != NULL);

    for (unsigned int i = 0; i < args.size(); ++i)
        if (dynamic_cast<const ReferenceType *>(args[i]->type) == NULL)
@@ -117,13 +110,13 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {

    if (type->isTask) {
        threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
-        assert(threadIndexSym);
+        Assert(threadIndexSym);
        threadCountSym = m->symbolTable->LookupVariable("threadCount");
-        assert(threadCountSym);
+        Assert(threadCountSym);
        taskIndexSym = m->symbolTable->LookupVariable("taskIndex");
-        assert(taskIndexSym);
+        Assert(taskIndexSym);
        taskCountSym = m->symbolTable->LookupVariable("taskCount");
-        assert(taskCountSym);
+        Assert(taskCountSym);
    }
    else
        threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL;
@@ -133,7 +126,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
 const Type *
 Function::GetReturnType() const {
    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
-    assert(type != NULL);
+    Assert(type != NULL);
    return type->GetReturnType();
 }

@@ -141,7 +134,7 @@ Function::GetReturnType() const {
 const FunctionType *
 Function::GetType() const {
    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
-    assert(type != NULL);
+    Assert(type != NULL);
    return type;
 }

@@ -157,9 +150,9 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
    // We expect the argument structure to come in as a poitner to a
    // structure.  Confirm and figure out its type here.
    const llvm::Type *structArgType = structArgPtr->getType();
-    assert(llvm::isa<llvm::PointerType>(structArgType));
+    Assert(llvm::isa<llvm::PointerType>(structArgType));
    const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(structArgType);
-    assert(llvm::isa<llvm::StructType>(pt->getElementType()));
+    Assert(llvm::isa<llvm::StructType>(pt->getElementType()));
    const llvm::StructType *argStructType = 
        llvm::dyn_cast<const llvm::StructType>(pt->getElementType());

@@ -189,10 +182,9 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
 void 
 Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, 
                   SourcePos firstStmtPos) {
-    llvm::Value *maskPtr = ctx->AllocaInst(LLVMTypes::MaskType, "mask_memory");
-    ctx->StoreInst(LLVMMaskAllOn, maskPtr);
-    maskSymbol->storagePtr = maskPtr;
-    ctx->SetMaskPointer(maskPtr);
+    // Connect the __mask builtin to the location in memory that stores its
+    // value
+    maskSymbol->storagePtr = ctx->GetFullMaskPointer();

    // add debugging info for __mask, programIndex, ...
    maskSymbol->pos = firstStmtPos;
@@ -202,7 +194,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
    llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
 #endif
    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
-    assert(type != NULL);
+    Assert(type != NULL);
    if (type->isTask == true) {
        // For tasks, we there should always be three parmeters: the
        // pointer to the structure that holds all of the arguments, the
@@ -267,38 +259,87 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
        else {
            // Otherwise use the mask to set the entry mask value
            argIter->setName("__mask");
-            assert(argIter->getType() == LLVMTypes::MaskType);
+            Assert(argIter->getType() == LLVMTypes::MaskType);
            ctx->SetFunctionMask(argIter);
-            assert(++argIter == function->arg_end());
+            Assert(++argIter == function->arg_end());
        }
    }

    // Finally, we can generate code for the function
    if (code != NULL) {
-        int costEstimate = code->EstimateCost();
+        ctx->SetDebugPos(code->pos);
+        ctx->AddInstrumentationPoint("function entry");
+
+        int costEstimate = EstimateCost(code);
+        Debug(code->pos, "Estimated cost for function \"%s\" = %d\n", 
+              sym->name.c_str(), costEstimate);
+
+        // If the body of the function is non-trivial, then we wrap the
+        // entire thing inside code that tests to see if the mask is all
+        // on, all off, or mixed.  If this is a simple function, then this
+        // isn't worth the code bloat / overhead.
        bool checkMask = (type->isTask == true) || 
            ((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
             costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
-        Debug(code->pos, "Estimated cost for function \"%s\" = %d\n", 
-              sym->name.c_str(), costEstimate);
-        // If the body of the function is non-trivial, then we wrap the
-        // entire thing around a varying "cif (true)" test in order to reap
-        // the side-effect benefit of checking to see if the execution mask
-        // is all on and thence having a specialized code path for that
-        // case.  If this is a simple function, then this isn't worth the
-        // code bloat / overhead.
-        if (checkMask) {
-            bool allTrue[ISPC_MAX_NVEC];
-            for (int i = 0; i < g->target.vectorWidth; ++i)
-                allTrue[i] = true;
-            Expr *trueExpr = new ConstExpr(AtomicType::VaryingBool, allTrue, 
-                                           code->pos);
-            code = new IfStmt(trueExpr, code, NULL, true, code->pos);
-        }
+        checkMask &= (g->target.maskingIsFree == false);
+        checkMask &= (g->opt.disableCoherentControlFlow == false);

-        ctx->SetDebugPos(code->pos);
-        ctx->AddInstrumentationPoint("function entry");
-        code->EmitCode(ctx);
+        if (checkMask) {
+            llvm::Value *mask = ctx->GetFunctionMask();
+            llvm::Value *allOn = ctx->All(mask);
+            llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on");
+            llvm::BasicBlock *bbNotAll = ctx->CreateBasicBlock("not_all_on");
+
+            // Set up basic blocks for goto targets
+            ctx->InitializeLabelMap(code);
+
+            ctx->BranchInst(bbAllOn, bbNotAll, allOn);
+            // all on: we've determined dynamically that the mask is all
+            // on.  Set the current mask to "all on" explicitly so that
+            // codegen for this path can be improved with this knowledge in
+            // hand...
+            ctx->SetCurrentBasicBlock(bbAllOn);
+            if (!g->opt.disableMaskAllOnOptimizations)
+                ctx->SetFunctionMask(LLVMMaskAllOn);
+            code->EmitCode(ctx);
+            if (ctx->GetCurrentBasicBlock())
+                ctx->ReturnInst();
+
+            // not all on: figure out if no instances are running, or if
+            // some of them are
+            ctx->SetCurrentBasicBlock(bbNotAll);
+            ctx->SetFunctionMask(mask);
+            llvm::BasicBlock *bbNoneOn = ctx->CreateBasicBlock("none_on");
+            llvm::BasicBlock *bbSomeOn = ctx->CreateBasicBlock("some_on");
+            llvm::Value *anyOn = ctx->Any(mask);
+            ctx->BranchInst(bbSomeOn, bbNoneOn, anyOn);
+            
+            // Everyone is off; get out of here.
+            ctx->SetCurrentBasicBlock(bbNoneOn);
+            ctx->ReturnInst();
+
+            // some on: reset the mask to the value it had at function
+            // entry and emit the code.  Resetting the mask here is
+            // important, due to the "all on" setting of it for the path
+            // above
+            ctx->SetCurrentBasicBlock(bbSomeOn);
+            ctx->SetFunctionMask(mask);
+
+            // Set up basic blocks for goto targets again; we want to have
+            // one set of them for gotos in the 'all on' case, and a
+            // distinct set for the 'mixed mask' case.
+            ctx->InitializeLabelMap(code);
+
+            code->EmitCode(ctx);
+            if (ctx->GetCurrentBasicBlock())
+                ctx->ReturnInst();
+        }
+        else {
+            // Set up basic blocks for goto targets
+            ctx->InitializeLabelMap(code);
+            // No check, just emit the code
+            code->EmitCode(ctx);
+        }
    }

    if (ctx->GetCurrentBasicBlock()) {
@@ -314,7 +355,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
        // issue a warning.  Also need to warn if it's the entry block for
        // the function (in which case it will not have predeccesors but is
        // still reachable.)
-        if (type->GetReturnType() != AtomicType::Void &&
+        if (Type::Equal(type->GetReturnType(), AtomicType::Void) == false &&
            (pred_begin(ec.bblock) != pred_end(ec.bblock) || (ec.bblock == entryBBlock)))
            Warning(sym->pos, "Missing return statement in function returning \"%s\".",
                    type->rType->GetString().c_str());
@@ -337,7 +378,7 @@ Function::GenerateIR() {
        return;

    llvm::Function *function = sym->function;
-    assert(function != NULL);
+    Assert(function != NULL);

    // But if that function has a definition, we don't want to redefine it.
    if (function->empty() == false) {
@@ -352,9 +393,8 @@ Function::GenerateIR() {
    SourcePos firstStmtPos = sym->pos;
    if (code) {
        StmtList *sl = dynamic_cast<StmtList *>(code);
-        if (sl && sl->GetStatements().size() > 0 && 
-            sl->GetStatements()[0] != NULL)
-            firstStmtPos = sl->GetStatements()[0]->pos;
+        if (sl && sl->stmts.size() > 0 && sl->stmts[0] != NULL)
+            firstStmtPos = sl->stmts[0]->pos;
        else
            firstStmtPos = code->pos;
    }
@@ -376,7 +416,7 @@ Function::GenerateIR() {
        // it without a mask parameter and without name mangling so that
        // the application can call it
        const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
-        assert(type != NULL);
+        Assert(type != NULL);
        if (type->isExported) {
            if (!type->isTask) {
                LLVM_TYPE_CONST llvm::FunctionType *ftype = 
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -50,6 +50,7 @@
 #include <llvm/Analysis/DIBuilder.h>
 #include <llvm/Analysis/DebugInfo.h>
 #include <llvm/Support/Dwarf.h>
+#include <llvm/Instructions.h>
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
@@ -129,24 +130,68 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->nativeVectorWidth = 4;
        t->vectorWidth = 4;
        t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
    }
    else if (!strcasecmp(isa, "sse2-x2")) {
        t->isa = Target::SSE2;
        t->nativeVectorWidth = 4;
        t->vectorWidth = 8;
        t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
    }
    else if (!strcasecmp(isa, "sse4")) {
        t->isa = Target::SSE4;
        t->nativeVectorWidth = 4;
        t->vectorWidth = 4;
        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
    }
    else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
        t->isa = Target::SSE4;
        t->nativeVectorWidth = 4;
        t->vectorWidth = 8;
        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
+    }
+    else if (!strcasecmp(isa, "generic-4")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 4;
+        t->vectorWidth = 4;
+        t->maskingIsFree = true;
+        t->allOffMaskIsSafe = true;
+        t->maskBitCount = 1;
+    }
+    else if (!strcasecmp(isa, "generic-8")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 8;
+        t->vectorWidth = 8;
+        t->maskingIsFree = true;
+        t->allOffMaskIsSafe = true;
+        t->maskBitCount = 1;
+    }
+    else if (!strcasecmp(isa, "generic-16")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 16;
+        t->vectorWidth = 16;
+        t->maskingIsFree = true;
+        t->allOffMaskIsSafe = true;
+        t->maskBitCount = 1;
+    }
+    else if (!strcasecmp(isa, "generic-1")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 1;
+        t->vectorWidth = 1;
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
    }
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
    else if (!strcasecmp(isa, "avx")) {
@@ -154,14 +199,40 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->nativeVectorWidth = 8;
        t->vectorWidth = 8;
        t->attributes = "+avx,+popcnt,+cmov";
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
    }
    else if (!strcasecmp(isa, "avx-x2")) {
        t->isa = Target::AVX;
        t->nativeVectorWidth = 8;
        t->vectorWidth = 16;
        t->attributes = "+avx,+popcnt,+cmov";
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
    }
-#endif // LLVM 3.0
+#endif // LLVM 3.0+
+#if defined(LLVM_3_1svn)
+    else if (!strcasecmp(isa, "avx2")) {
+        t->isa = Target::AVX2;
+        t->nativeVectorWidth = 8;
+        t->vectorWidth = 8;
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
+    }
+    else if (!strcasecmp(isa, "avx2-x2")) {
+        t->isa = Target::AVX2;
+        t->nativeVectorWidth = 16;
+        t->vectorWidth = 16;
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
+    }
+#endif // LLVM 3.1
    else {
        fprintf(stderr, "Target ISA \"%s\" is unknown.  Choices are: %s\n", 
                isa, SupportedTargetISAs());
@@ -201,10 +272,13 @@ Target::SupportedTargetArchs() {
 const char *
 Target::SupportedTargetISAs() {
    return "sse2, sse2-x2, sse4, sse4-x2"
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#ifndef LLVM_2_9
        ", avx, avx-x2"
-#endif
-        ;
+#endif // !LLVM_2_9
+#ifdef LLVM_3_1svn
+        ", avx2, avx2-x2"
+#endif // LLVM_3_1svn
+        ", generic-4, generic-8, generic-16, generic-1";
 }


@@ -241,11 +315,19 @@ Target::GetTargetMachine() const {

    llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ : 
                                                  llvm::Reloc::Default;
-#if defined(LLVM_3_0svn) || defined(LLVM_3_1svn) || defined(LLVM_3_0)
+#if defined(LLVM_3_1svn)
+    std::string featuresString = attributes;
+    llvm::TargetOptions options;
+    if (g->opt.fastMath == true)
+        options.UnsafeFPMath = 1;
+    llvm::TargetMachine *targetMachine = 
+        target->createTargetMachine(triple, cpu, featuresString, options,
+                                    relocModel);
+#elif defined(LLVM_3_0)
    std::string featuresString = attributes;
    llvm::TargetMachine *targetMachine = 
        target->createTargetMachine(triple, cpu, featuresString, relocModel);
-#else
+#else // LLVM 2.9
 #ifdef ISPC_IS_APPLE
    relocModel = llvm::Reloc::PIC_;
 #endif // ISPC_IS_APPLE
@@ -255,8 +337,9 @@ Target::GetTargetMachine() const {
 #ifndef ISPC_IS_WINDOWS
    targetMachine->setRelocationModel(relocModel);
 #endif // !ISPC_IS_WINDOWS
-#endif
-    assert(targetMachine != NULL);
+#endif // LLVM_2_9
+
+    Assert(targetMachine != NULL);

    targetMachine->setAsmVerbosityDefault(true);
    return targetMachine;
@@ -272,7 +355,10 @@ Target::GetISAString() const {
        return "sse4";
    case Target::AVX:
        return "avx";
-        break;
+    case Target::AVX2:
+        return "avx2";
+    case Target::GENERIC:
+        return "generic";
    default:
        FATAL("Unhandled target in GetISAString()");
    }
@@ -280,31 +366,113 @@ Target::GetISAString() const {
 }


+static bool  // consulted by Target::SizeOf() / Target::StructOffset() when isa == Target::GENERIC
+lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
+    if (type->isPrimitiveType() || type->isIntegerTy())
+        return false;  // primitive and integer types: not indeterminate
+
+    if (type == LLVMTypes::BoolVectorType ||
+        type == LLVMTypes::MaskType ||
+        type == LLVMTypes::Int1VectorType)
+        return true;  // bool-vector / mask / i1-vector types: indeterminate
+
+    LLVM_TYPE_CONST llvm::ArrayType *at = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
+    if (at != NULL)  // arrays: the answer is that of the element type
+        return lGenericTypeLayoutIndeterminate(at->getElementType());
+
+    LLVM_TYPE_CONST llvm::PointerType *pt = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
+    if (pt != NULL)
+        return false;  // pointers: not indeterminate; the pointee is not inspected
+
+    LLVM_TYPE_CONST llvm::StructType *st =
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
+    if (st != NULL) {  // structs: indeterminate if any member is
+        for (int i = 0; i < (int)st->getNumElements(); ++i)
+            if (lGenericTypeLayoutIndeterminate(st->getElementType(i)))
+                return true;
+        return false;
+    }
+
+    Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));  // only vector types should reach here
+    return true;  // all remaining (vector) types: indeterminate
+}
+
+
 llvm::Value *
-Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type) {
+Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type, 
+               llvm::BasicBlock *insertAtEnd) {
+    if (isa == Target::GENERIC &&
+        lGenericTypeLayoutIndeterminate(type)) {
+        llvm::Value *index[1] = { LLVMInt32(1) };
+        LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
+        llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+        llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
+        llvm::Instruction *gep = 
+            llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "sizeof_gep",
+                                            insertAtEnd);
+#else
+        llvm::Instruction *gep =
+            llvm::GetElementPtrInst::Create(voidPtr, &index[0], &index[1],
+                                            "sizeof_gep", insertAtEnd);
+#endif
+        if (is32Bit || g->opt.force32BitAddressing)
+            return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type, 
+                                          "sizeof_int", insertAtEnd);
+        else
+            return new llvm::PtrToIntInst(gep, LLVMTypes::Int64Type, 
+                                          "sizeof_int", insertAtEnd);
+    }
+
    const llvm::TargetData *td = GetTargetMachine()->getTargetData();
-    assert(td != NULL);
+    Assert(td != NULL);
    uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
    if (is32Bit || g->opt.force32BitAddressing)
-        return LLVMInt32(byteSize);
+        return LLVMInt32((int32_t)byteSize);
    else
        return LLVMInt64(byteSize);
 }


 llvm::Value *
-Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element) {
+Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
+                     llvm::BasicBlock *insertAtEnd) {
+    if (isa == Target::GENERIC && 
+        lGenericTypeLayoutIndeterminate(type) == true) {
+        llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(element) };
+        LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
+        llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+        llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
+        llvm::Instruction *gep = 
+            llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "offset_gep",
+                                            insertAtEnd);
+#else
+        llvm::Instruction *gep =
+            llvm::GetElementPtrInst::Create(voidPtr, &indices[0], &indices[2],
+                                            "offset_gep", insertAtEnd);
+#endif
+        if (is32Bit || g->opt.force32BitAddressing)
+            return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type, 
+                                          "offset_int", insertAtEnd);
+        else
+            return new llvm::PtrToIntInst(gep, LLVMTypes::Int64Type, 
+                                          "offset_int", insertAtEnd);
+    }
+
    const llvm::TargetData *td = GetTargetMachine()->getTargetData();
-    assert(td != NULL);
+    Assert(td != NULL);
    LLVM_TYPE_CONST llvm::StructType *structType = 
        llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
-    assert(structType != NULL);
+    Assert(structType != NULL);
    const llvm::StructLayout *sl = td->getStructLayout(structType);
-    assert(sl != NULL);
+    Assert(sl != NULL);

    uint64_t offset = sl->getElementOffset(element);
    if (is32Bit || g->opt.force32BitAddressing)
-        return LLVMInt32(offset);
+        return LLVMInt32((int32_t)offset);
    else
        return LLVMInt64(offset);
 }
@@ -320,6 +488,7 @@ Opt::Opt() {
    force32BitAddressing = true;
    unrollLoops = true;
    disableAsserts = false;
+    disableMaskAllOnOptimizations = false;
    disableHandlePseudoMemoryOps = false;
    disableBlendedMaskedStores = false;
    disableCoherentControlFlow = false;
@@ -328,7 +497,7 @@ Opt::Opt() {
    disableMaskedStoreToStore = false;
    disableGatherScatterFlattening = false;
    disableUniformMemoryOptimizations = false;
-    disableMaskedStoreOptimizations = false;
+    disableCoalescing = false;
 }

 ///////////////////////////////////////////////////////////////////////////
@@ -342,12 +511,15 @@ Globals::Globals() {
    debugPrint = false;
    disableWarnings = false;
    warningsAsErrors = false;
+    quiet = false;
    disableLineWrap = false;
    emitPerfWarnings = true;
    emitInstrumentation = false;
    generateDebuggingSymbols = false;
+    enableFuzzTest = false;
+    fuzzTestSeed = -1;
    mangleFunctionsWithTarget = false;
-
+    
    ctx = new llvm::LLVMContext;

 #ifdef ISPC_IS_WINDOWS
@@ -362,7 +534,13 @@ Globals::Globals() {
 // SourcePos

 SourcePos::SourcePos(const char *n, int fl, int fc, int ll, int lc) {
-    name = n ? n : m->module->getModuleIdentifier().c_str();
+    name = n;
+    if (name == NULL) {
+        if (m != NULL)
+            name = m->module->getModuleIdentifier().c_str();
+        else
+            name = "(unknown)";
+    }
    first_line = fl;
    first_column = fc;
    last_line = ll != 0 ? ll : fl;
--- a/ispc.h
+++ b/ispc.h
@@ -38,6 +38,8 @@
 #ifndef ISPC_H
 #define ISPC_H

+#define ISPC_VERSION "1.2.0dev"
+
 #if !defined(LLVM_2_9) && !defined(LLVM_3_0) && !defined(LLVM_3_0svn) && !defined(LLVM_3_1svn)
 #error "Only LLVM 2.9, 3.0, and the 3.1 development branch are supported"
 #endif
@@ -50,11 +52,22 @@
 #define ISPC_IS_APPLE
 #endif

-#include <assert.h>
 #include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
 #include <vector>
 #include <string>

+#define Assert(expr)                                            \
+    ((void)((expr) ? 0 : __Assert (#expr, __FILE__, __LINE__)))
+#define __Assert(expr, file, line)                                      \
+    ((void)fprintf(stderr, "%s:%u: Assertion failed: \"%s\"\n"          \
+                   "***\n*** Please file a bug report at "              \
+                   "https://github.com/ispc/ispc/issues\n*** (Including as much " \
+                   "information as you can about how to reproduce this error).\n" \
+                   "*** You have apparently encountered a bug in the compiler that " \
+                   "we'd like to fix!\n***\n", file, line, expr), abort(), 0)
+
 /** @def ISPC_MAX_NVEC maximum vector size of any of the compliation
    targets.
 */
@@ -87,6 +100,8 @@ namespace llvm {
 #endif

 class ArrayType;
+class AST;
+class ASTNode;
 class AtomicType;
 class FunctionEmitContext;
 class Expr;
@@ -94,6 +109,7 @@ class ExprList;
 class Function;
 class FunctionType;
 class Module;
+class PointerType;
 class Stmt;
 class Symbol;
 class SymbolTable;
@@ -166,12 +182,14 @@ struct Target {
    const char *GetISAString() const;

    /** Returns the size of the given type */
-    llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type);
+    llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type,
+                        llvm::BasicBlock *insertAtEnd);
+
    /** Given a structure type and an element number in the structure,
        returns a value corresponding to the number of bytes from the start
        of the structure where the element is located. */
    llvm::Value *StructOffset(LLVM_TYPE_CONST llvm::Type *type,
-                              int element);
+                              int element, llvm::BasicBlock *insertAtEnd);

    /** llvm Target object representing this target. */
    const llvm::Target *target;
@@ -182,7 +200,7 @@ struct Target {
        flexible/performant of them will apear last in the enumerant.  Note
        also that __best_available_isa() needs to be updated if ISAs are
        added or the enumerant values are reordered.  */
-    enum ISA { SSE2, SSE4, AVX, NUM_ISAS };
+    enum ISA { SSE2, SSE4, AVX, AVX2, GENERIC, NUM_ISAS };

    /** Instruction set being compiled to. */
    ISA isa;
@@ -211,6 +229,23 @@ struct Target {

    /** Indicates whether position independent code should be generated. */
    bool generatePIC;
+
+    /** Is masking free on the target architecture, i.e. is there no
+        overhead associated with it?  E.g. it isn't free on SSE, due to
+        extra blends and the like, but it is with an ISA that supports
+        masking natively. */
+    bool maskingIsFree;
+
+    /** Is it safe to run code with the mask all off?  E.g. on SSE it isn't:
+        the fast gather trick assumes that at least one program instance is
+        running (so that it can safely assume that the array base pointer
+        is valid). */
+    bool allOffMaskIsSafe;
+
+    /** How many bits are used to store each element of the mask: e.g. this
+        is 32 on SSE/AVX, since that matches the HW better, but it's 1 for
+        the generic target. */
+    int maskBitCount;
 };


@@ -247,10 +282,15 @@ struct Opt {
     */ 
    bool force32BitAddressing;

-    /** Indicates whether assert() statements should be ignored (for
+    /** Indicates whether assert() statements should be ignored (for
        performance in the generated code). */
    bool disableAsserts;
-    
+
+    /** If enabled, disables the various optimizations that kick in when
+        the execution mask can be determined to be "all on" at compile
+        time. */
+    bool disableMaskAllOnOptimizations;
+
    /** If enabled, the various __pseudo* memory ops (gather/scatter,
        masked load/store) are left in their __pseudo* form, for better
        understanding of the structure of generated code when reading
@@ -303,13 +343,9 @@ struct Opt {
        the impact of this optimization. */
    bool disableUniformMemoryOptimizations;

-    /** Disables optimizations for masked stores: masked stores with the
-        mask all on are transformed to regular stores, and masked stores
-        with the mask are all off are removed (which in turn can allow
-        eliminating additional dead code related to computing the value
-        stored).  This is likely only useful for measuring the impact of
-        this optimization. */
-    bool disableMaskedStoreOptimizations;
+    /** Disables optimizations that coalesce incoherent scalar memory
+        access from gathers into wider vector operations, when possible. */
+    bool disableCoalescing;
 };

 /** @brief This structure collects together a number of global variables. 
@@ -359,6 +395,9 @@ struct Globals {
        possible performance pitfalls. */
    bool emitPerfWarnings;

+    /** Indicates whether all printed output should be suppressed. */
+    bool quiet;
+
    /** Indicates whether calls should be emitted in the program to an
        externally-defined program instrumentation function. (See the
        "Instrumenting your ispc programs" section in the user's
@@ -373,6 +412,14 @@ struct Globals {
        vector width to them. */
    bool mangleFunctionsWithTarget;

+    /** If enabled, the lexer will randomly replace some tokens returned
+        with other tokens, in order to test error condition handling in the
+        compiler. */
+    bool enableFuzzTest;
+
+    /** Seed for random number generator used for fuzz testing. */
+    int fuzzTestSeed;
+
    /** Global LLVMContext object */
    llvm::LLVMContext *ctx;

@@ -383,18 +430,25 @@ struct Globals {
    /** Arguments to pass along to the C pre-processor, if it is run on the
        program before compilation. */
    std::vector<std::string> cppArgs;
+
+    /** Additional user-provided directories to search when processing
+        #include directives in the preprocessor. */
+    std::vector<std::string> includePath;
 };

 enum {
    COST_ASSIGN = 1,
    COST_COHERENT_BREAK_CONTINE = 4,
    COST_COMPLEX_ARITH_OP = 4,
+    COST_DELETE = 32,
    COST_DEREF = 4,
    COST_FUNCALL = 4,
    COST_FUNPTR_UNIFORM = 12,
    COST_FUNPTR_VARYING = 24,
    COST_GATHER = 8,
+    COST_GOTO = 4,
    COST_LOAD = 2,
+    COST_NEW = 32,
    COST_REGULAR_BREAK_CONTINUE = 2,
    COST_RETURN = 4,
    COST_SELECT = 4,
@@ -407,6 +461,8 @@ enum {
    COST_VARYING_IF = 3,
    COST_UNIFORM_LOOP = 4,
    COST_VARYING_LOOP = 6,
+    COST_UNIFORM_SWITCH = 4,
+    COST_VARYING_SWITCH = 12,
    COST_ASSERT = 8,

    CHECK_MASK_AT_FUNCTION_START_COST = 16,
--- a/ispc.sln
+++ b/ispc.sln
@@ -3,8 +3,6 @@ Microsoft Visual Studio Solution File, Format Version 11.00
 # Visual Studio 2010
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ispc", "ispc.vcxproj", "{9861F490-F516-480C-B63C-D62A77AFA9D5}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ispc_test", "ispc_test.vcxproj", "{92547BA8-BE86-4E78-8799-1D72A70E5831}"
-EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
@@ -15,9 +13,6 @@ Global
 		{9861F490-F516-480C-B63C-D62A77AFA9D5}.Debug|Win32.Build.0 = Debug|Win32
 		{9861F490-F516-480C-B63C-D62A77AFA9D5}.Release|Win32.ActiveCfg = Release|Win32
 		{9861F490-F516-480C-B63C-D62A77AFA9D5}.Release|Win32.Build.0 = Release|Win32
-		{92547BA8-BE86-4E78-8799-1D72A70E5831}.Debug|Win32.ActiveCfg = Debug|Win32
-		{92547BA8-BE86-4E78-8799-1D72A70E5831}.Debug|Win32.Build.0 = Debug|Win32
-		{92547BA8-BE86-4E78-8799-1D72A70E5831}.Release|Win32.ActiveCfg = Release|Win32
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -13,20 +13,28 @@
  <ItemGroup>
    <ClCompile Include="ast.cpp" />
    <ClCompile Include="builtins.cpp" />
+    <ClCompile Include="cbackend.cpp" />
    <ClCompile Include="ctx.cpp" />
    <ClCompile Include="decl.cpp" />
    <ClCompile Include="expr.cpp" />
    <ClCompile Include="func.cpp" />
-    <ClCompile Include="gen-bitcode-avx.cpp" />
-    <ClCompile Include="gen-bitcode-avx-x2.cpp" />
+    <ClCompile Include="gen-bitcode-avx1.cpp" />
+    <ClCompile Include="gen-bitcode-avx1-x2.cpp" />
+    <ClCompile Include="gen-bitcode-avx2.cpp" />
+    <ClCompile Include="gen-bitcode-avx2-x2.cpp" />
    <ClCompile Include="gen-bitcode-c-32.cpp" />
    <ClCompile Include="gen-bitcode-c-64.cpp" />
    <ClCompile Include="gen-bitcode-dispatch.cpp" />
+    <ClCompile Include="gen-bitcode-generic-1.cpp" />
+    <ClCompile Include="gen-bitcode-generic-4.cpp" />
+    <ClCompile Include="gen-bitcode-generic-8.cpp" />
+    <ClCompile Include="gen-bitcode-generic-16.cpp" />
    <ClCompile Include="gen-bitcode-sse2.cpp" />
    <ClCompile Include="gen-bitcode-sse2-x2.cpp" />
    <ClCompile Include="gen-bitcode-sse4.cpp" />
    <ClCompile Include="gen-bitcode-sse4-x2.cpp" />
-    <ClCompile Include="gen-stdlib.cpp" />
+    <ClCompile Include="gen-stdlib-generic.cpp" />
+    <ClCompile Include="gen-stdlib-x86.cpp" />
    <ClCompile Include="ispc.cpp" />
    <ClCompile Include="lex.cc">
      <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
@@ -40,15 +48,15 @@
      <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
      <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
    </ClCompile>
-    <CustomBuild Include="builtins-c.c">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp;
-%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c &gt; gen-bitcode-c-64.cpp</Command>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp;
-%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c &gt; gen-bitcode-c-64.cpp</Command>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
+    <CustomBuild Include="builtins\builtins.c">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 &gt; gen-bitcode-c-32.cpp;
+%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 &gt; gen-bitcode-c-64.cpp</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building builtins.c</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 &gt; gen-bitcode-c-32.cpp;
+%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 &gt; gen-bitcode-c-64.cpp</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building builtins.c</Message>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp</Outputs>
    </CustomBuild>
    <ClCompile Include="stmt.cpp" />
    <ClCompile Include="sym.cpp" />
@@ -75,103 +83,185 @@
  <ItemGroup>
    <CustomBuild Include="stdlib.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 &gt; gen-stdlib-x86.cpp;
+%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic &gt; gen-stdlib-generic.cpp;
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib-generic.cpp;gen-stdlib-x86.cpp</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 &gt; gen-stdlib-x86.cpp;
+%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic &gt; gen-stdlib-generic.cpp;
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib-generic.cpp;gen-stdlib-x86.cpp</Outputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib-{generic,x86}.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib-{generic,x86}.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="builtins-sse4.ll">
+    <CustomBuild Include="builtins\dispatch.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
-    </CustomBuild>
-  </ItemGroup>
-  <ItemGroup>
-    <CustomBuild Include="builtins-dispatch.ll">
-      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-dispatch.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-dispatch.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-dispatch.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-dispatch.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="builtins-sse4-x2.ll">
+    <CustomBuild Include="builtins\target-sse4.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-sse4-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-sse4-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-sse4-x2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-sse4-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-sse4-common.ll</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="builtins-sse2.ll">
+    <CustomBuild Include="builtins\target-sse2.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-sse2-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-sse2-common.ll</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="builtins-sse2-x2.ll">
+    <CustomBuild Include="builtins\target-sse2-x2.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-sse2-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-sse2-common.ll</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="builtins-avx.ll">
+    <CustomBuild Include="builtins\target-avx1.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll &gt; gen-bitcode-avx1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll &gt; gen-bitcode-avx1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="builtins-avx-x2.ll">
+    <CustomBuild Include="builtins\target-avx1-x2.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll &gt; gen-bitcode-avx1-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll &gt; gen-bitcode-avx1-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll &gt; gen-bitcode-avx2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll &gt; gen-bitcode-avx2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx2-x2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll &gt; gen-bitcode-avx2-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll &gt; gen-bitcode-avx2-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-generic-1.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll &gt; gen-bitcode-generic-1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll &gt; gen-bitcode-generic-1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-1.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-1.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-generic-4.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll &gt; gen-bitcode-generic-4.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-4.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll &gt; gen-bitcode-generic-4.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-4.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-4.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-4.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-generic-8.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll &gt; gen-bitcode-generic-8.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-8.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll &gt; gen-bitcode-generic-8.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-8.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-8.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-8.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-generic-16.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll &gt; gen-bitcode-generic-16.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-16.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll &gt; gen-bitcode-generic-16.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-16.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-16.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-16.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
--- a/ispc_test.cpp
+++ b/ispc_test.cpp
@@ -1,379 +0,0 @@
-/*
-  Copyright (c) 2010-2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-#define _CRT_SECURE_NO_WARNINGS
-
-#if defined(_WIN32) || defined(_WIN64)
-#define ISPC_IS_WINDOWS
-#elif defined(__linux__)
-#define ISPC_IS_LINUX
-#elif defined(__APPLE__)
-#define ISPC_IS_APPLE
-#endif
-
-#ifdef ISPC_IS_WINDOWS
-#define NOMINMAX
-#include <windows.h>
-#endif
-#include <stdio.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <memory.h>
-#ifdef ISPC_IS_LINUX
-#include <malloc.h>
-#endif
-
-#ifdef ISPC_HAVE_SVML
-#include <xmmintrin.h>
-extern "C" {
-    extern __m128 __svml_sinf4(__m128);
-    extern __m128 __svml_cosf4(__m128);
-    extern __m128 __svml_sincosf4(__m128 *,__m128);
-    extern __m128 __svml_tanf4(__m128);
-    extern __m128 __svml_atanf4(__m128);
-    extern __m128 __svml_atan2f4(__m128, __m128);
-    extern __m128 __svml_expf4(__m128);
-    extern __m128 __svml_logf4(__m128);
-    extern __m128 __svml_powf4(__m128, __m128);
-}
-#endif
-
-#include <llvm/LLVMContext.h>
-#include <llvm/Module.h>
-#include <llvm/Type.h>
-#include <llvm/DerivedTypes.h>
-#include <llvm/Instructions.h>
-#include <llvm/ExecutionEngine/ExecutionEngine.h>
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-  #include <llvm/Support/TargetRegistry.h>
-  #include <llvm/Support/TargetSelect.h>
-#else
-  #include <llvm/Target/TargetRegistry.h>
-  #include <llvm/Target/TargetSelect.h>
-#endif
-#include <llvm/ExecutionEngine/JIT.h>
-#include <llvm/Target/TargetOptions.h>
-#include <llvm/Target/TargetData.h>
-#include <llvm/Transforms/Scalar.h>
-#include <llvm/Transforms/IPO.h>
-#include <llvm/PassManager.h>
-#include <llvm/Support/CFG.h>
-#include <llvm/Analysis/Verifier.h>
-#include <llvm/Assembly/PrintModulePass.h>
-#include <llvm/Support/raw_ostream.h>
-#include <llvm/Bitcode/ReaderWriter.h>
-#include <llvm/Support/MemoryBuffer.h>
-#include <llvm/Support/system_error.h>
-
-bool shouldFail = false;
-
-extern "C" { 
-    void ISPCLaunch(void **, void *, void *, int32_t);
-    void ISPCSync(void *);
-    void *ISPCAlloc(void **, int64_t size, int32_t alignment);
-}
-
-void ISPCLaunch(void **handle, void *func, void *data, int32_t count) {
-    *handle = (void *)0xdeadbeef;
-    typedef void (*TaskFuncType)(void *, int, int, int, int);
-    TaskFuncType tft = (TaskFuncType)(func);
-    for (int i = 0; i < count; ++i)
-        tft(data, 0, 1, i, count);
-}
-
-
-void ISPCSync(void *) {
-}
-
-
-void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
-    *handle = (void *)0xdeadbeef;
-    // leak time!
-#ifdef ISPC_IS_WINDOWS
-    return _aligned_malloc((size_t)size, alignment);
-#endif
-#ifdef ISPC_IS_LINUX
-    return memalign(alignment, size);
-#endif
-#ifdef ISPC_IS_APPLE
-    void *mem = malloc(size + (alignment-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
-                                        (alignment - 1)));
-    ((void**)amem)[-1] = mem;
-    return amem;
-#endif
-}
-
-
-static void usage(int ret) {
-    fprintf(stderr, "usage: ispc_test\n");
-    fprintf(stderr, "\t[-h/--help]\tprint help\n");
-    fprintf(stderr, "\t[-f]\t\tindicates that test is expected to fail\n");
-    fprintf(stderr, "\t<files>\n");
-    exit(ret);
-}
-
-static void svml_missing() {
-    fprintf(stderr, "Program called unavailable SVML function!\n");
-    exit(1);
-}
-
-// On Windows, sin() is an overloaded function, so we need an unambiguous
-// function we can take the address of when wiring up the external references
-// below.
-
-double Sin(double x) { return sin(x); }
-double Cos(double x) { return cos(x); }
-double Tan(double x) { return tan(x); }
-double Atan(double x) { return atan(x); }
-double Atan2(double y, double x) { return atan2(y, x); }
-double Pow(double a, double b) { return pow(a, b); }
-double Exp(double x) { return exp(x); }
-double Log(double x) { return log(x); }
-
-static bool lRunTest(const char *fn) {
-    llvm::LLVMContext *ctx = new llvm::LLVMContext;
-
-    llvm::OwningPtr<llvm::MemoryBuffer> buf;
-    llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
-    if (err) {
-        fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.message().c_str());
-        delete ctx;
-        return false;
-    }
-    std::string bcErr;
-    llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
-
-    if (!module) {
-        fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
-        delete ctx;
-        return false;
-    }
-
-    std::string eeError;
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-    llvm::EngineBuilder engineBuilder(module);
-    engineBuilder.setErrorStr(&eeError);
-    engineBuilder.setEngineKind(llvm::EngineKind::JIT);
-#if 0
-    std::vector<std::string> attributes;
-    if (target != NULL && !strcmp(target, "avx"))
-        attributes.push_back("+avx");
-    engineBuilder.setMAttrs(attributes);
-    engineBuilder.setUseMCJIT(true);
-#endif
-    llvm::ExecutionEngine *ee = engineBuilder.create();
-#else
-    llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError);
-#endif
-    if (!ee) {
-        fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str());
-        return false;
-    }
-
-    llvm::Function *func;
-#define DO_FUNC(FUNC ,FUNCNAME)                           \
-    if ((func = module->getFunction(FUNCNAME)) != NULL)   \
-        ee->addGlobalMapping(func, (void *)FUNC)
-    DO_FUNC(ISPCLaunch, "ISPCLaunch");
-    DO_FUNC(ISPCSync, "ISPCSync");
-    DO_FUNC(ISPCAlloc, "ISPCAlloc");
-    DO_FUNC(putchar, "putchar");
-    DO_FUNC(printf, "printf");
-    DO_FUNC(fflush, "fflush");
-    DO_FUNC(sinf, "sinf");
-    DO_FUNC(cosf, "cosf");
-    DO_FUNC(tanf, "tanf");
-    DO_FUNC(atanf, "atanf");
-    DO_FUNC(atan2f, "atan2f");
-    DO_FUNC(powf, "powf");
-    DO_FUNC(expf, "expf");
-    DO_FUNC(logf, "logf");
-    DO_FUNC(Sin, "sin");
-    DO_FUNC(Cos, "cos");
-    DO_FUNC(Tan, "tan");
-    DO_FUNC(Atan, "atan");
-    DO_FUNC(Atan2, "atan2");
-    DO_FUNC(Pow, "pow");
-    DO_FUNC(Exp, "exp");
-    DO_FUNC(Log, "log");
-    DO_FUNC(memset, "memset");
-#ifdef ISPC_IS_APPLE
-    DO_FUNC(memset_pattern4, "memset_pattern4");
-    DO_FUNC(memset_pattern8, "memset_pattern8");
-    DO_FUNC(memset_pattern16, "memset_pattern16");
-#endif
-
-#ifdef ISPC_HAVE_SVML
-#define DO_SVML(FUNC ,FUNCNAME)                           \
-    if ((func = module->getFunction(FUNCNAME)) != NULL)   \
-        ee->addGlobalMapping(func, (void *)FUNC)
-#else
-#define DO_SVML(FUNC, FUNCNAME)                                         \
-    if ((func = module->getFunction(FUNCNAME)) != NULL)                 \
-        ee->addGlobalMapping(func, (void *)svml_missing)
-#endif
-
-    DO_SVML(__svml_sinf4, "__svml_sinf4");
-    DO_SVML(__svml_cosf4, "__svml_cosf4");
-    DO_SVML(__svml_sincosf4, "__svml_sincosf4");
-    DO_SVML(__svml_tanf4, "__svml_tanf4");
-    DO_SVML(__svml_atanf4, "__svml_atanf4");
-    DO_SVML(__svml_atan2f4, "__svml_atan2f4");
-    DO_SVML(__svml_expf4, "__svml_expf4");
-    DO_SVML(__svml_logf4, "__svml_logf4");
-    DO_SVML(__svml_powf4, "__svml_powf4");
-
-    // figure out the vector width in the compiled code
-    func = module->getFunction("width");
-    if (!func) {
-        fprintf(stderr, "No width() function found!\n");
-        return false;
-    }
-    int width;
-    {
-        typedef int (*PFN)();
-        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
-        width = pfn();
-        assert(width == 4 || width == 8 || width == 12 || width == 16);
-    }
-
-    // find the value that returns the desired result
-    func = module->getFunction("result");
-    bool foundResult = (func != NULL);
-    float result[16];
-    for (int i = 0; i < 16; ++i)
-        result[i] = 0;
-    if (foundResult) {
-        typedef void (*PFN)(float *);
-        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
-        pfn(result);
-    }
-    else
-        fprintf(stderr, "Warning: no result() function found.\n");
-
-    // try to find a function to run
-    float returned[16];
-    for (int i = 0; i < 16; ++i)
-        returned[i] = 0;
-    float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
-    double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
-    int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
-    int vint2[16] = { 5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
-
-    if ((func = module->getFunction("f_v")) != NULL) {
-        typedef void (*PFN)(float *);
-        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
-        pfn(returned);
-    }
-    else if ((func = module->getFunction("f_f")) != NULL) {
-        typedef void (*PFN)(float *, float *);
-        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
-        llvm::verifyFunction(*func);
-        pfn(returned, vfloat);
-    }
-    else if ((func = module->getFunction("f_fu")) != NULL) {
-        typedef void (*PFN)(float *, float *, float fu);
-        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
-        llvm::verifyFunction(*func);
-        pfn(returned, vfloat, 5.);
-    }
-    else if ((func = module->getFunction("f_fi")) != NULL) {
-        typedef void (*PFN)(float *, float *, int *);
-        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
-        pfn(returned, vfloat, vint);
-    }
-    else if ((func = module->getFunction("f_du")) != NULL) {
-        typedef void (*PFN)(float *, double *, double);
-        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
-        pfn(returned, vdouble, 5.);
-    }
-    else if ((func = module->getFunction("f_duf")) != NULL) {
-        typedef void (*PFN)(float *, double *, float);
-        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
-        pfn(returned, vdouble, 5.f);
-    }
-    else if ((func = module->getFunction("f_di")) != NULL) {
-        typedef void (*PFN)(float *, double *, int *);
-        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
-        pfn(returned, vdouble, vint2);
-    }
-    else {
-        fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn);
-        return false;
-    }
-
-    // see if we got the right result
-    bool resultsMatch = true;
-    if (foundResult) {
-        for (int i = 0; i < width; ++i)
-            if (returned[i] != result[i]) {
-                resultsMatch = false;
-                fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
-                        fn, i, returned[i], returned[i], result[i], result[i]);
-            }
-    }
-    else {
-        for (int i = 0; i < width; ++i)
-            fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
-                    fn, i, returned[i], returned[i]);
-    }
-    if (foundResult && shouldFail && resultsMatch)
-        fprintf(stderr, "Test %s unexpectedly passed\n", fn);
-
-    delete ee;
-    delete ctx;
-
-    return foundResult && resultsMatch;
-}
-
-
-int main(int argc, char *argv[]) {
-    llvm::InitializeNativeTarget();
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-    LLVMLinkInJIT();
-#endif
-
-    const char *filename = NULL;
-    for (int i = 1; i < argc; ++i) {
-        if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
-            usage(0);
-        if (!strcmp(argv[i], "-f"))
-            shouldFail = true;
-        else
-            filename = argv[i];
-    }
-
-    return (lRunTest(filename) == true) ? 0 : 1;
-}
--- a/ispc_test.vcxproj
+++ b/ispc_test.vcxproj
@@ -1,90 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="ispc_test.cpp" />
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{92547BA8-BE86-4E78-8799-1D72A70E5831}</ProjectGuid>
-    <Keyword>Win32Proj</Keyword>
-    <RootNamespace>ispc_test</RootNamespace>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <LinkIncremental>true</LinkIncremental>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <LinkIncremental>false</LinkIncremental>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
-      <DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
-      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
-      <DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
-      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-  </ImportGroup>
-</Project>
--- a/lex.ll
+++ b/lex.ll
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -42,7 +42,7 @@
 #include <stdlib.h>
 #include <stdint.h>

-static uint64_t lParseBinary(const char *ptr, SourcePos pos);
+static uint64_t lParseBinary(const char *ptr, SourcePos pos, char **endPtr);
 static void lCComment(SourcePos *);
 static void lCppComment(SourcePos *);
 static void lHandleCppHash(SourcePos *);
@@ -50,24 +50,279 @@ static void lStringConst(YYSTYPE *, SourcePos *);
 static double lParseHexFloat(const char *ptr);

 #define YY_USER_ACTION \
-    yylloc->first_line = yylloc->last_line; \
-    yylloc->first_column = yylloc->last_column; \
-    yylloc->last_column += yyleng;
+    yylloc.first_line = yylloc.last_line; /* the new token's span starts where the previous one ended */ \
+    yylloc.first_column = yylloc.last_column; \
+    yylloc.last_column += yyleng; /* ...and ends yyleng columns further on; yylloc is used as an object here, not through a pointer */

 #ifdef ISPC_IS_WINDOWS
 inline int isatty(int) { return 0; }
 #endif // ISPC_IS_WINDOWS

+static int allTokens[] = {  // pool of token ids the RT fuzz macro picks a random replacement from
+  TOKEN_ASSERT, TOKEN_BOOL, TOKEN_BREAK, TOKEN_CASE, TOKEN_CBREAK,
+  TOKEN_CCONTINUE, TOKEN_CDO, TOKEN_CFOR, TOKEN_CIF, TOKEN_CWHILE,
+  TOKEN_CONST, TOKEN_CONTINUE, TOKEN_CRETURN, TOKEN_DEFAULT, TOKEN_DO,
+  TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM,
+  TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR,
+  TOKEN_FOREACH, TOKEN_FOREACH_TILED, TOKEN_GOTO, TOKEN_IF, TOKEN_INLINE,
+  TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH,  // NOTE(review): TOKEN_INT is listed twice, so it is drawn twice as often -- presumably mirrors the "int"/"int32" keywords; confirm it is intended
+  TOKEN_NEW, TOKEN_NULL, TOKEN_PRINT, TOKEN_RETURN, TOKEN_SOA, TOKEN_SIGNED,
+  TOKEN_SIZEOF, TOKEN_STATIC, TOKEN_STRUCT, TOKEN_SWITCH, TOKEN_SYNC,
+  TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNSIGNED,
+  TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL,
+  TOKEN_DOTDOTDOT, 
+  TOKEN_FLOAT_CONSTANT,
+  TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, 
+  TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, 
+  TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP,
+  TOKEN_GE_OP, TOKEN_EQ_OP, TOKEN_NE_OP, TOKEN_AND_OP, TOKEN_OR_OP,
+  TOKEN_MUL_ASSIGN, TOKEN_DIV_ASSIGN, TOKEN_MOD_ASSIGN, TOKEN_ADD_ASSIGN,
+  TOKEN_SUB_ASSIGN, TOKEN_LEFT_ASSIGN, TOKEN_RIGHT_ASSIGN, TOKEN_AND_ASSIGN,
+  TOKEN_XOR_ASSIGN, TOKEN_OR_ASSIGN, TOKEN_PTR_OP,
+  ';', '{', '}', ',', ':', '=', '(', ')', '[', ']', '.', '&', '!', '~', '-',  // single-character tokens are represented by their character code
+  '+', '*', '/', '%', '<', '>', '^', '|', '?',
+};
+
+std::map<int, std::string> tokenToName;  // token id -> its spelling; populated by ParserInit() and read by the RT fuzz macro
+std::map<std::string, std::string> tokenNameRemap;  // "TOKEN_*" name -> readable text; populated by ParserInit() (its consumer is not visible here)
+
+void ParserInit() {  // Fills the tokenToName and tokenNameRemap tables; the RT fuzz macro asserts tokenToName is non-empty before using it.
+    tokenToName[TOKEN_ASSERT] = "assert";
+    tokenToName[TOKEN_BOOL] = "bool";
+    tokenToName[TOKEN_BREAK] = "break";
+    tokenToName[TOKEN_CASE] = "case";
+    tokenToName[TOKEN_CBREAK] = "cbreak";
+    tokenToName[TOKEN_CCONTINUE] = "ccontinue";
+    tokenToName[TOKEN_CDO] = "cdo";
+    tokenToName[TOKEN_CFOR] = "cfor";
+    tokenToName[TOKEN_CIF] = "cif";
+    tokenToName[TOKEN_CWHILE] = "cwhile";
+    tokenToName[TOKEN_CONST] = "const";
+    tokenToName[TOKEN_CONTINUE] = "continue";
+    tokenToName[TOKEN_CRETURN] = "creturn";
+    tokenToName[TOKEN_DEFAULT] = "default";
+    tokenToName[TOKEN_DO] = "do";
+    tokenToName[TOKEN_DELETE] = "delete";
+    tokenToName[TOKEN_DOUBLE] = "double";
+    tokenToName[TOKEN_ELSE] = "else";
+    tokenToName[TOKEN_ENUM] = "enum";
+    tokenToName[TOKEN_EXPORT] = "export";
+    tokenToName[TOKEN_EXTERN] = "extern";
+    tokenToName[TOKEN_FALSE] = "false";
+    tokenToName[TOKEN_FLOAT] = "float";
+    tokenToName[TOKEN_FOR] = "for";
+    tokenToName[TOKEN_FOREACH] = "foreach";
+    tokenToName[TOKEN_FOREACH_TILED] = "foreach_tiled";
+    tokenToName[TOKEN_GOTO] = "goto";
+    tokenToName[TOKEN_IF] = "if";
+    tokenToName[TOKEN_INLINE] = "inline";
+    tokenToName[TOKEN_INT] = "int";
+    tokenToName[TOKEN_INT8] = "int8";
+    tokenToName[TOKEN_INT16] = "int16";
+    // NOTE: no "int32" entry -- that keyword appears to share TOKEN_INT with "int", which is registered above.
+    tokenToName[TOKEN_INT64] = "int64";
+    tokenToName[TOKEN_LAUNCH] = "launch";
+    tokenToName[TOKEN_NEW] = "new";
+    tokenToName[TOKEN_NULL] = "NULL";
+    tokenToName[TOKEN_PRINT] = "print";
+    tokenToName[TOKEN_RETURN] = "return";
+    tokenToName[TOKEN_SOA] = "soa";
+    tokenToName[TOKEN_SIGNED] = "signed";
+    tokenToName[TOKEN_SIZEOF] = "sizeof";
+    tokenToName[TOKEN_STATIC] = "static";
+    tokenToName[TOKEN_STRUCT] = "struct";
+    tokenToName[TOKEN_SWITCH] = "switch";
+    tokenToName[TOKEN_SYNC] = "sync";
+    tokenToName[TOKEN_TASK] = "task";
+    tokenToName[TOKEN_TRUE] = "true";
+    tokenToName[TOKEN_TYPEDEF] = "typedef";
+    tokenToName[TOKEN_UNIFORM] = "uniform";
+    tokenToName[TOKEN_UNSIGNED] = "unsigned";
+    tokenToName[TOKEN_VARYING] = "varying";
+    tokenToName[TOKEN_VOID] = "void";
+    tokenToName[TOKEN_WHILE] = "while";
+    tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\"";
+    tokenToName[TOKEN_DOTDOTDOT] = "...";
+    tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT";
+    tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT";
+    tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT";
+    tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT";
+    tokenToName[TOKEN_UINT64_CONSTANT] = "TOKEN_UINT64_CONSTANT";
+    tokenToName[TOKEN_INC_OP] = "++";
+    tokenToName[TOKEN_DEC_OP] = "--";
+    tokenToName[TOKEN_LEFT_OP] = "<<";
+    tokenToName[TOKEN_RIGHT_OP] = ">>";
+    tokenToName[TOKEN_LE_OP] = "<=";
+    tokenToName[TOKEN_GE_OP] = ">=";
+    tokenToName[TOKEN_EQ_OP] = "==";
+    tokenToName[TOKEN_NE_OP] = "!=";
+    tokenToName[TOKEN_AND_OP] = "&&";
+    tokenToName[TOKEN_OR_OP] = "||";
+    tokenToName[TOKEN_MUL_ASSIGN] = "*=";
+    tokenToName[TOKEN_DIV_ASSIGN] = "/=";
+    tokenToName[TOKEN_MOD_ASSIGN] = "%=";
+    tokenToName[TOKEN_ADD_ASSIGN] = "+=";
+    tokenToName[TOKEN_SUB_ASSIGN] = "-=";
+    tokenToName[TOKEN_LEFT_ASSIGN] = "<<=";
+    tokenToName[TOKEN_RIGHT_ASSIGN] = ">>=";
+    tokenToName[TOKEN_AND_ASSIGN] = "&=";
+    tokenToName[TOKEN_XOR_ASSIGN] = "^=";
+    tokenToName[TOKEN_OR_ASSIGN] = "|=";
+    tokenToName[TOKEN_PTR_OP] = "->";
+    tokenToName[';'] = ";";
+    tokenToName['{'] = "{";
+    tokenToName['}'] = "}";
+    tokenToName[','] = ",";
+    tokenToName[':'] = ":";
+    tokenToName['='] = "=";
+    tokenToName['('] = "(";
+    tokenToName[')'] = ")";
+    tokenToName['['] = "[";
+    tokenToName[']'] = "]";
+    tokenToName['.'] = ".";
+    tokenToName['&'] = "&";
+    tokenToName['!'] = "!";
+    tokenToName['~'] = "~";
+    tokenToName['-'] = "-";
+    tokenToName['+'] = "+";
+    tokenToName['*'] = "*";
+    tokenToName['/'] = "/";
+    tokenToName['%'] = "%";
+    tokenToName['<'] = "<";
+    tokenToName['>'] = ">";
+    tokenToName['^'] = "^";
+    tokenToName['|'] = "|";
+    tokenToName['?'] = "?";
+    // (';' is already registered at the head of the single-character entries.)
+    // tokenNameRemap: readable replacements for the "TOKEN_*" names and "$end". NOTE(review): presumably used to tidy parser error messages; the consumer is not visible here.
+    tokenNameRemap["TOKEN_ASSERT"] = "\'assert\'";
+    tokenNameRemap["TOKEN_BOOL"] = "\'bool\'";
+    tokenNameRemap["TOKEN_BREAK"] = "\'break\'";
+    tokenNameRemap["TOKEN_CASE"] = "\'case\'";
+    tokenNameRemap["TOKEN_CBREAK"] = "\'cbreak\'";
+    tokenNameRemap["TOKEN_CCONTINUE"] = "\'ccontinue\'";
+    tokenNameRemap["TOKEN_CDO"] = "\'cdo\'";
+    tokenNameRemap["TOKEN_CFOR"] = "\'cfor\'";
+    tokenNameRemap["TOKEN_CIF"] = "\'cif\'";
+    tokenNameRemap["TOKEN_CWHILE"] = "\'cwhile\'";
+    tokenNameRemap["TOKEN_CONST"] = "\'const\'";
+    tokenNameRemap["TOKEN_CONTINUE"] = "\'continue\'";
+    tokenNameRemap["TOKEN_CRETURN"] = "\'creturn\'";
+    tokenNameRemap["TOKEN_DEFAULT"] = "\'default\'";
+    tokenNameRemap["TOKEN_DO"] = "\'do\'";
+    tokenNameRemap["TOKEN_DELETE"] = "\'delete\'";
+    tokenNameRemap["TOKEN_DOUBLE"] = "\'double\'";
+    tokenNameRemap["TOKEN_ELSE"] = "\'else\'";
+    tokenNameRemap["TOKEN_ENUM"] = "\'enum\'";
+    tokenNameRemap["TOKEN_EXPORT"] = "\'export\'";
+    tokenNameRemap["TOKEN_EXTERN"] = "\'extern\'";
+    tokenNameRemap["TOKEN_FALSE"] = "\'false\'";
+    tokenNameRemap["TOKEN_FLOAT"] = "\'float\'";
+    tokenNameRemap["TOKEN_FOR"] = "\'for\'";
+    tokenNameRemap["TOKEN_FOREACH"] = "\'foreach\'";
+    tokenNameRemap["TOKEN_FOREACH_TILED"] = "\'foreach_tiled\'";
+    tokenNameRemap["TOKEN_GOTO"] = "\'goto\'";
+    tokenNameRemap["TOKEN_IDENTIFIER"] = "identifier";
+    tokenNameRemap["TOKEN_IF"] = "\'if\'";
+    tokenNameRemap["TOKEN_INLINE"] = "\'inline\'";
+    tokenNameRemap["TOKEN_INT"] = "\'int\'";
+    tokenNameRemap["TOKEN_INT8"] = "\'int8\'";
+    tokenNameRemap["TOKEN_INT16"] = "\'int16\'";
+    // NOTE: no "int32" entry here either; "TOKEN_INT" is registered above.
+    tokenNameRemap["TOKEN_INT64"] = "\'int64\'";
+    tokenNameRemap["TOKEN_LAUNCH"] = "\'launch\'";
+    tokenNameRemap["TOKEN_NEW"] = "\'new\'";
+    tokenNameRemap["TOKEN_NULL"] = "\'NULL\'";
+    tokenNameRemap["TOKEN_PRINT"] = "\'print\'";
+    tokenNameRemap["TOKEN_RETURN"] = "\'return\'";
+    tokenNameRemap["TOKEN_SOA"] = "\'soa\'";
+    tokenNameRemap["TOKEN_SIGNED"] = "\'signed\'";
+    tokenNameRemap["TOKEN_SIZEOF"] = "\'sizeof\'";
+    tokenNameRemap["TOKEN_STATIC"] = "\'static\'";
+    tokenNameRemap["TOKEN_STRUCT"] = "\'struct\'";
+    tokenNameRemap["TOKEN_SWITCH"] = "\'switch\'";
+    tokenNameRemap["TOKEN_SYNC"] = "\'sync\'";
+    tokenNameRemap["TOKEN_TASK"] = "\'task\'";
+    tokenNameRemap["TOKEN_TRUE"] = "\'true\'";
+    tokenNameRemap["TOKEN_TYPEDEF"] = "\'typedef\'";
+    tokenNameRemap["TOKEN_UNIFORM"] = "\'uniform\'";
+    tokenNameRemap["TOKEN_UNSIGNED"] = "\'unsigned\'";
+    tokenNameRemap["TOKEN_VARYING"] = "\'varying\'";
+    tokenNameRemap["TOKEN_VOID"] = "\'void\'";
+    tokenNameRemap["TOKEN_WHILE"] = "\'while\'";
+    tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\"";
+    tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'";
+    tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant";
+    tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant";
+    tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant";
+    tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant";
+    tokenNameRemap["TOKEN_UINT64_CONSTANT"] = "unsigned int64 constant";
+    tokenNameRemap["TOKEN_INC_OP"] = "\'++\'";
+    tokenNameRemap["TOKEN_DEC_OP"] = "\'--\'";
+    tokenNameRemap["TOKEN_LEFT_OP"] = "\'<<\'";
+    tokenNameRemap["TOKEN_RIGHT_OP"] = "\'>>\'";
+    tokenNameRemap["TOKEN_LE_OP"] = "\'<=\'";
+    tokenNameRemap["TOKEN_GE_OP"] = "\'>=\'";
+    tokenNameRemap["TOKEN_EQ_OP"] = "\'==\'";
+    tokenNameRemap["TOKEN_NE_OP"] = "\'!=\'";
+    tokenNameRemap["TOKEN_AND_OP"] = "\'&&\'";
+    tokenNameRemap["TOKEN_OR_OP"] = "\'||\'";
+    tokenNameRemap["TOKEN_MUL_ASSIGN"] = "\'*=\'";
+    tokenNameRemap["TOKEN_DIV_ASSIGN"] = "\'/=\'";
+    tokenNameRemap["TOKEN_MOD_ASSIGN"] = "\'%=\'";
+    tokenNameRemap["TOKEN_ADD_ASSIGN"] = "\'+=\'";
+    tokenNameRemap["TOKEN_SUB_ASSIGN"] = "\'-=\'";
+    tokenNameRemap["TOKEN_LEFT_ASSIGN"] = "\'<<=\'";
+    tokenNameRemap["TOKEN_RIGHT_ASSIGN"] = "\'>>=\'";
+    tokenNameRemap["TOKEN_AND_ASSIGN"] = "\'&=\'";
+    tokenNameRemap["TOKEN_XOR_ASSIGN"] = "\'^=\'";
+    tokenNameRemap["TOKEN_OR_ASSIGN"] = "\'|=\'";
+    tokenNameRemap["TOKEN_PTR_OP"] = "\'->\'";
+    tokenNameRemap["$end"] = "end of file";
+}
+
+
+inline int ispcRand() {  // random-number source for the RT fuzz macro; no seeding is done in this function
+#ifdef ISPC_IS_WINDOWS
+    return rand();  // Windows builds use rand() in place of lrand48()
+#else
+    return lrand48();  // the result is converted to int by the return type
+#endif
+}
+
+#define RT /* fuzz-test hook, meant to be written as `RT;` (the trailing else absorbs that semicolon) */ \
+    if (g->enableFuzzTest) { \
+        int r = ispcRand() % 40; /* only 0, 1 and 2 are acted on; every other value leaves the token untouched */ \
+        if (r == 0) { \
+            Warning(yylloc, "Fuzz test dropping token"); \
+        } /* NOTE(review): this branch only warns; nothing in the macro skips the token -- confirm the use sites really drop it */ \
+        else if (r == 1) { \
+            Assert (tokenToName.size() > 0); /* tokenToName is filled in by ParserInit() */ \
+            int nt = sizeof(allTokens) / sizeof(allTokens[0]); /* number of entries in allTokens */ \
+            int tn = ispcRand() % nt; \
+            yylval.stringVal = new std::string(yytext); /* just in case */\
+            Warning(yylloc, "Fuzz test replaced token with \"%s\"", tokenToName[allTokens[tn]].c_str()); \
+            return allTokens[tn]; /* hand back a randomly chosen token id in place of the real one */ \
+        } \
+        else if (r == 2) { \
+            Symbol *sym = m->symbolTable->RandomSymbol(); \
+            if (sym != NULL) { /* when NULL, fall through without substituting anything */ \
+                yylval.stringVal = new std::string(sym->name); \
+                Warning(yylloc, "Fuzz test replaced with identifier \"%s\".", sym->name.c_str()); \
+                return TOKEN_IDENTIFIER; /* the token becomes an identifier carrying sym's name */ \
+            } \
+        } \
+        /*  TOKEN_TYPE_NAME */ \
+     } else /* swallow semicolon */
+
 %}

 %option nounput
 %option noyywrap
-%option bison-bridge
-%option bison-locations
 %option nounistd

 WHITESPACE [ \t\r]+
-INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))
+INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[kMG]?
 FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
 HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)

@@ -75,200 +330,202 @@ IDENT [a-zA-Z_][a-zA-Z_0-9]*
 ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+

 %%
-"/*"            { lCComment(yylloc); }
-"//"            { lCppComment(yylloc); }
+"/*"            { lCComment(&yylloc); }
+"//"            { lCppComment(&yylloc); }

-__assert { return TOKEN_ASSERT; }
-bool { return TOKEN_BOOL; }
-break { return TOKEN_BREAK; }
-case { return TOKEN_CASE; }
-cbreak { return TOKEN_CBREAK; }
-ccontinue { return TOKEN_CCONTINUE; }
-cdo { return TOKEN_CDO; }
-cfor { return TOKEN_CFOR; }
-cif { return TOKEN_CIF; }
-cwhile { return TOKEN_CWHILE; }
-const { return TOKEN_CONST; }
-continue { return TOKEN_CONTINUE; }
-creturn { return TOKEN_CRETURN; }
-default { return TOKEN_DEFAULT; }
-do { return TOKEN_DO; }
-double { return TOKEN_DOUBLE; }
-else { return TOKEN_ELSE; }
-enum { return TOKEN_ENUM; }
-export { return TOKEN_EXPORT; }
-extern { return TOKEN_EXTERN; }
-false { return TOKEN_FALSE; }
-float { return TOKEN_FLOAT; }
-for { return TOKEN_FOR; }
-foreach { return TOKEN_FOREACH; }
-foreach_tiled { return TOKEN_FOREACH_TILED; }
-goto { return TOKEN_GOTO; }
-if { return TOKEN_IF; }
-inline { return TOKEN_INLINE; }
-int { return TOKEN_INT; }
-int8 { return TOKEN_INT8; }
-int16 { return TOKEN_INT16; }
-int32 { return TOKEN_INT; }
-int64 { return TOKEN_INT64; }
-launch { return TOKEN_LAUNCH; }
-NULL { return TOKEN_NULL; }
-print { return TOKEN_PRINT; }
-reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
-                           "please use C++-style '&' syntax for references "
-                           "instead."); }
-return { return TOKEN_RETURN; }
-soa { return TOKEN_SOA; }
-signed { return TOKEN_SIGNED; }
-sizeof { return TOKEN_SIZEOF; }
-static { return TOKEN_STATIC; }
-struct { return TOKEN_STRUCT; }
-switch { return TOKEN_SWITCH; }
-sync { return TOKEN_SYNC; }
-task { return TOKEN_TASK; }
-true { return TOKEN_TRUE; }
-typedef { return TOKEN_TYPEDEF; }
-uniform { return TOKEN_UNIFORM; }
-unsigned { return TOKEN_UNSIGNED; }
-varying { return TOKEN_VARYING; }
-void { return TOKEN_VOID; }
-while { return TOKEN_WHILE; }
-\"C\" { return TOKEN_STRING_C_LITERAL; }
-\.\.\. { return TOKEN_DOTDOTDOT; }
+__assert { RT; return TOKEN_ASSERT; }
+bool { RT; return TOKEN_BOOL; }
+break { RT; return TOKEN_BREAK; }
+case { RT; return TOKEN_CASE; }
+cbreak { RT; return TOKEN_CBREAK; }
+ccontinue { RT; return TOKEN_CCONTINUE; }
+cdo { RT; return TOKEN_CDO; }
+cfor { RT; return TOKEN_CFOR; }
+cif { RT; return TOKEN_CIF; }
+cwhile { RT; return TOKEN_CWHILE; }
+const { RT; return TOKEN_CONST; }
+continue { RT; return TOKEN_CONTINUE; }
+creturn { RT; return TOKEN_CRETURN; }
+default { RT; return TOKEN_DEFAULT; }
+do { RT; return TOKEN_DO; }
+delete { RT; return TOKEN_DELETE; }
+delete\[\] { RT; return TOKEN_DELETE; }
+double { RT; return TOKEN_DOUBLE; }
+else { RT; return TOKEN_ELSE; }
+enum { RT; return TOKEN_ENUM; }
+export { RT; return TOKEN_EXPORT; }
+extern { RT; return TOKEN_EXTERN; }
+false { RT; return TOKEN_FALSE; }
+float { RT; return TOKEN_FLOAT; }
+for { RT; return TOKEN_FOR; }
+__foreach_active { RT; return TOKEN_FOREACH_ACTIVE; }
+foreach { RT; return TOKEN_FOREACH; }
+foreach_tiled { RT; return TOKEN_FOREACH_TILED; }
+goto { RT; return TOKEN_GOTO; }
+if { RT; return TOKEN_IF; }
+inline { RT; return TOKEN_INLINE; }
+int { RT; return TOKEN_INT; }
+int8 { RT; return TOKEN_INT8; }
+int16 { RT; return TOKEN_INT16; }
+int32 { RT; return TOKEN_INT; }
+int64 { RT; return TOKEN_INT64; }
+launch { RT; return TOKEN_LAUNCH; }
+new { RT; return TOKEN_NEW; }
+NULL { RT; return TOKEN_NULL; }
+print { RT; return TOKEN_PRINT; }
+return { RT; return TOKEN_RETURN; }
+soa { RT; return TOKEN_SOA; }
+signed { RT; return TOKEN_SIGNED; }
+sizeof { RT; return TOKEN_SIZEOF; }
+static { RT; return TOKEN_STATIC; }
+struct { RT; return TOKEN_STRUCT; }
+switch { RT; return TOKEN_SWITCH; }
+sync { RT; return TOKEN_SYNC; }
+task { RT; return TOKEN_TASK; }
+true { RT; return TOKEN_TRUE; }
+typedef { RT; return TOKEN_TYPEDEF; }
+uniform { RT; return TOKEN_UNIFORM; }
+unsigned { RT; return TOKEN_UNSIGNED; }
+varying { RT; return TOKEN_VARYING; }
+void { RT; return TOKEN_VOID; }
+while { RT; return TOKEN_WHILE; }
+\"C\" { RT; return TOKEN_STRING_C_LITERAL; }
+\.\.\. { RT; return TOKEN_DOTDOTDOT; }

-L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }
+L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERAL; }

 {IDENT} { 
+    RT;
    /* We have an identifier--is it a type name or an identifier?
       The symbol table will straighten us out... */
-    yylval->stringVal = new std::string(yytext);
+    yylval.stringVal = new std::string(yytext);
    if (m->symbolTable->LookupType(yytext) != NULL)
        return TOKEN_TYPE_NAME;
    else
        return TOKEN_IDENTIFIER; 
 }

-{INT_NUMBER} { 
-    char *endPtr = NULL;
-    int64_t val;
+{INT_NUMBER}+(u|U|l|L)*? { 
+    RT;
+    int ls = 0, us = 0;

+    char *endPtr = NULL;
    if (yytext[0] == '0' && yytext[1] == 'b')
-        val = lParseBinary(yytext+2, *yylloc);
+        yylval.intVal = lParseBinary(yytext+2, yylloc, &endPtr);
    else {
-#ifdef ISPC_IS_WINDOWS
-        val = _strtoi64(yytext, &endPtr, 0);
+#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
+        yylval.intVal = _strtoui64(yytext, &endPtr, 0);
 #else
        // FIXME: should use strtouq and then issue an error if we can't
        // fit into 64 bits...
-        val = strtoull(yytext, &endPtr, 0);
+        yylval.intVal = strtoull(yytext, &endPtr, 0);
 #endif
    }

+    bool kilo = false, mega = false, giga = false;
+    for (; *endPtr; endPtr++) {
+        if (*endPtr == 'k')
+            kilo = true;
+        else if (*endPtr == 'M')
+            mega = true;
+        else if (*endPtr == 'G')
+            giga = true;        
+        else if (*endPtr == 'l' || *endPtr == 'L')
+            ls++;
+        else if (*endPtr == 'u' || *endPtr == 'U')
+            us++;
+    }
+    if (kilo)
+        yylval.intVal *= 1024;
+    if (mega)
+        yylval.intVal *= 1024*1024;
+    if (giga)
+        yylval.intVal *= 1024*1024*1024;
+
+    if (ls >= 2)
+        return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
+    else if (ls == 1)
+        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
+
    // See if we can fit this into a 32-bit integer...
-    if ((val & 0xffffffff) == val) {
-        yylval->int32Val = (int32_t)val;
-        return TOKEN_INT32_CONSTANT; 
-    }
-    else {
-        yylval->int64Val = val;
-        return TOKEN_INT64_CONSTANT; 
-    }
+    if ((yylval.intVal & 0xffffffff) == yylval.intVal)
+        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
+    else
+        return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
 }

-{INT_NUMBER}[uU] {
-    char *endPtr = NULL;
-    uint64_t val;
-
-    if (yytext[0] == '0' && yytext[1] == 'b')
-        val = lParseBinary(yytext+2, *yylloc);
-    else {
-#ifdef ISPC_IS_WINDOWS
-        val = _strtoui64(yytext, &endPtr, 0);
-#else
-        val = strtoull(yytext, &endPtr, 0);
-#endif
-    }
-
-    if ((val & 0xffffffff) == val) {
-        // we can represent it in a 32-bit value
-        yylval->int32Val = (int32_t)val;
-        return TOKEN_UINT32_CONSTANT; 
-    }
-    else {
-        yylval->int64Val = val;
-        return TOKEN_UINT64_CONSTANT; 
-    }
-}

 {FLOAT_NUMBER} { 
-    yylval->floatVal = atof(yytext); 
+    RT;
+    yylval.floatVal = (float)atof(yytext);
    return TOKEN_FLOAT_CONSTANT; 
 }

 {HEX_FLOAT_NUMBER} {
-    yylval->floatVal = lParseHexFloat(yytext); 
+    RT;
+    yylval.floatVal = (float)lParseHexFloat(yytext); 
    return TOKEN_FLOAT_CONSTANT; 
 }

-"++" { return TOKEN_INC_OP; }
-"--" { return TOKEN_DEC_OP; }
-"<<" { return TOKEN_LEFT_OP; }
-">>" { return TOKEN_RIGHT_OP; }
-"<=" { return TOKEN_LE_OP; }
-">=" { return TOKEN_GE_OP; }
-"==" { return TOKEN_EQ_OP; }
-"!=" { return TOKEN_NE_OP; }
-"&&" { return TOKEN_AND_OP; }
-"||" { return TOKEN_OR_OP; }
-"*=" { return TOKEN_MUL_ASSIGN; }
-"/=" { return TOKEN_DIV_ASSIGN; }
-"%=" { return TOKEN_MOD_ASSIGN; }
-"+=" { return TOKEN_ADD_ASSIGN; }
-"-=" { return TOKEN_SUB_ASSIGN; }
-"<<=" { return TOKEN_LEFT_ASSIGN; }
-">>=" { return TOKEN_RIGHT_ASSIGN; }
-"&=" { return TOKEN_AND_ASSIGN; }
-"^=" { return TOKEN_XOR_ASSIGN; }
-"|=" { return TOKEN_OR_ASSIGN; }
-"->" { return TOKEN_PTR_OP; }
-";"             { return ';'; }
-("{"|"<%")      { return '{'; }
-("}"|"%>")      { return '}'; }
-","             { return ','; }
-":"             { return ':'; }
-"="             { return '='; }
-"("             { return '('; }
-")"             { return ')'; }
-("["|"<:")      { return '['; }
-("]"|":>")      { return ']'; }
-"."             { return '.'; }
-"&"             { return '&'; }
-"!"             { return '!'; }
-"~"             { return '~'; }
-"-"             { return '-'; }
-"+"             { return '+'; }
-"*"             { return '*'; }
-"/"             { return '/'; }
-"%"             { return '%'; }
-"<"             { return '<'; }
-">"             { return '>'; }
-"^"             { return '^'; }
-"|"             { return '|'; }
-"?"             { return '?'; }
+"++" { RT; return TOKEN_INC_OP; }
+"--" { RT; return TOKEN_DEC_OP; }
+"<<" { RT; return TOKEN_LEFT_OP; }
+">>" { RT; return TOKEN_RIGHT_OP; }
+"<=" { RT; return TOKEN_LE_OP; }
+">=" { RT; return TOKEN_GE_OP; }
+"==" { RT; return TOKEN_EQ_OP; }
+"!=" { RT; return TOKEN_NE_OP; }
+"&&" { RT; return TOKEN_AND_OP; }
+"||" { RT; return TOKEN_OR_OP; }
+"*=" { RT; return TOKEN_MUL_ASSIGN; }
+"/=" { RT; return TOKEN_DIV_ASSIGN; }
+"%=" { RT; return TOKEN_MOD_ASSIGN; }
+"+=" { RT; return TOKEN_ADD_ASSIGN; }
+"-=" { RT; return TOKEN_SUB_ASSIGN; }
+"<<=" { RT; return TOKEN_LEFT_ASSIGN; }
+">>=" { RT; return TOKEN_RIGHT_ASSIGN; }
+"&=" { RT; return TOKEN_AND_ASSIGN; }
+"^=" { RT; return TOKEN_XOR_ASSIGN; }
+"|=" { RT; return TOKEN_OR_ASSIGN; }
+"->" { RT; return TOKEN_PTR_OP; }
+";"             { RT; return ';'; }
+("{"|"<%")      { RT; return '{'; }
+("}"|"%>")      { RT; return '}'; }
+","             { RT; return ','; }
+":"             { RT; return ':'; }
+"="             { RT; return '='; }
+"("             { RT; return '('; }
+")"             { RT; return ')'; }
+("["|"<:")      { RT; return '['; }
+("]"|":>")      { RT; return ']'; }
+"."             { RT; return '.'; }
+"&"             { RT; return '&'; }
+"!"             { RT; return '!'; }
+"~"             { RT; return '~'; }
+"-"             { RT; return '-'; }
+"+"             { RT; return '+'; }
+"*"             { RT; return '*'; }
+"/"             { RT; return '/'; }
+"%"             { RT; return '%'; }
+"<"             { RT; return '<'; }
+">"             { RT; return '>'; }
+"^"             { RT; return '^'; }
+"|"             { RT; return '|'; }
+"?"             { RT; return '?'; }

 {WHITESPACE} { }

 \n {
-    yylloc->last_line++; 
-    yylloc->last_column = 1; 
+    yylloc.last_line++; 
+    yylloc.last_column = 1; 
 }

 #(line)?[ ][0-9]+[ ]\"(\\.|[^\\"])*\"[^\n]* { 
-    lHandleCppHash(yylloc); 
+    lHandleCppHash(&yylloc); 
 }

 . {
-    Error(*yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0]));
+    Error(yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0]));
    YY_USER_ACTION 
 }

@@ -285,14 +542,11 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 /** Return the integer version of a binary constant from a string.
 */
 static uint64_t
-lParseBinary(const char *ptr, SourcePos pos) {
+lParseBinary(const char *ptr, SourcePos pos, char **endPtr) {
    uint64_t val = 0;
    bool warned = false;

-    while (*ptr != '\0') {
-        /* if this hits, the regexp for 0b... constants is broken */
-        assert(*ptr == '0' || *ptr == '1');
-
+    while (*ptr == '0' || *ptr == '1') {
        if ((val & (((int64_t)1)<<63)) && warned == false) {
            // We're about to shift out a set bit
            Warning(pos, "Can't represent binary constant with a 64-bit integer type");
@@ -302,6 +556,7 @@ lParseBinary(const char *ptr, SourcePos pos) {
        val = (val << 1) | (*ptr == '0' ? 0 : 1);
        ++ptr;
    }
+    *endPtr = (char *)ptr;
    return val;
 }

@@ -311,8 +566,10 @@ lParseBinary(const char *ptr, SourcePos pos) {
 static void
 lCComment(SourcePos *pos) {
    char c, prev = 0;
-  
+
    while ((c = yyinput()) != 0) {
+        ++pos->last_column;
+
        if (c == '\n') {
            pos->last_line++;
            pos->last_column = 1;
@@ -346,7 +603,7 @@ static void lHandleCppHash(SourcePos *pos) {
    char *ptr, *src;

    // Advance past the opening stuff on the line.
-    assert(yytext[0] == '#');
+    Assert(yytext[0] == '#');
    if (yytext[1] == ' ')
        // On Linux/OSX, the preprocessor gives us lines like
        // # 1234 "foo.c"
@@ -354,7 +611,7 @@ static void lHandleCppHash(SourcePos *pos) {
    else {
        // On windows, cl.exe's preprocessor gives us lines of the form:
        // #line 1234 "foo.c"
-        assert(!strncmp(yytext+1, "line ", 5));
+        Assert(!strncmp(yytext+1, "line ", 5));
        ptr = yytext + 6;
    }

@@ -364,13 +621,13 @@ static void lHandleCppHash(SourcePos *pos) {
    pos->last_column = 1;
    // Make sure that the character after the integer is a space and that
    // then we have open quotes
-    assert(src != ptr && src[0] == ' ' && src[1] == '"');
+    Assert(src != ptr && src[0] == ' ' && src[1] == '"');
    src += 2;

    // And the filename is everything up until the closing quotes
    std::string filename;
    while (*src != '"') {
-        assert(*src && *src != '\n');
+        Assert(*src && *src != '\n');
        filename.push_back(*src);
        ++src;
    }
@@ -471,13 +728,13 @@ ipow2(int exponent) {
 */
 static double
 lParseHexFloat(const char *ptr) {
-    assert(ptr != NULL);
+    Assert(ptr != NULL);

-    assert(ptr[0] == '0' && ptr[1] == 'x');
+    Assert(ptr[0] == '0' && ptr[1] == 'x');
    ptr += 2;

    // Start initializing the mantissa
-    assert(*ptr == '0' || *ptr == '1');
+    Assert(*ptr == '0' || *ptr == '1');
    double mantissa = (*ptr == '1') ? 1. : 0.;
    ++ptr;

@@ -497,7 +754,7 @@ lParseHexFloat(const char *ptr) {
            else if (*ptr >= 'a' && *ptr <= 'f')
                digit = 10 + *ptr - 'a';
            else {
-                assert(*ptr >= 'A' && *ptr <= 'F');
+                Assert(*ptr >= 'A' && *ptr <= 'F');
                digit = 10 + *ptr - 'A';
            }

@@ -510,7 +767,7 @@ lParseHexFloat(const char *ptr) {
    else
        // If there's not a '.', then we better be going straight to the
        // exponent
-        assert(*ptr == 'p');
+        Assert(*ptr == 'p');

    ++ptr; // skip the 'p'

--- a/llvmutil.cpp
+++ b/llvmutil.cpp
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -38,12 +38,23 @@
 #ifndef ISPC_LLVMUTIL_H
 #define ISPC_LLVMUTIL_H 1

-#include "ispc.h"
 #include <llvm/LLVMContext.h>
 #include <llvm/Type.h>
 #include <llvm/DerivedTypes.h>
 #include <llvm/Constants.h>

+namespace llvm {
+    class PHINode;
+    class InsertElementInst;
+}
+
+// llvm::Type *s are no longer const in llvm 3.0
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#define LLVM_TYPE_CONST
+#else
+#define LLVM_TYPE_CONST const
+#endif
+

 /** This structure holds pointers to a variety of LLVM types; code
    elsewhere can use them from here, ratherthan needing to make more
@@ -99,6 +110,7 @@ extern llvm::Constant *LLVMTrue, *LLVMFalse;
    of LLVMTypes and the LLVMTrue/LLVMFalse constants.  However, it can't
    be called until the compilation target is known.
 */
+struct Target;
 extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);

 /** Returns an LLVM i8 constant of the given value */
@@ -161,6 +173,14 @@ extern llvm::Constant *LLVMFloatVector(float f);
    across all elements */
 extern llvm::Constant *LLVMDoubleVector(double f);

+/** Returns a constant integer or vector (according to the given type) of
+    the given signed integer value. */
+extern llvm::Constant *LLVMIntAsType(int64_t, LLVM_TYPE_CONST llvm::Type *t);
+
+/** Returns a constant integer or vector (according to the given type) of
+    the given unsigned integer value. */
+extern llvm::Constant *LLVMUIntAsType(uint64_t, LLVM_TYPE_CONST llvm::Type *t);
+
 /** Returns an LLVM boolean vector based on the given array of values.
    The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMBoolVector(const bool *v);
@@ -205,4 +225,77 @@ extern llvm::Constant *LLVMMaskAllOn;
 /** LLVM constant value representing an 'all off' SIMD lane mask */
 extern llvm::Constant *LLVMMaskAllOff;

+/** Tests to see if all of the elements of the vector in the 'v' parameter
+    are equal.  Like lValuesAreEqual(), this is a conservative test and may
+    return false for arrays where the values are actually all equal.  */
+extern bool LLVMVectorValuesAllEqual(llvm::Value *v);
+
+/** Given vector of integer-typed values, this function returns true if it
+    can determine that the elements of the vector have a step of 'stride'
+    between their values and false otherwise.  This function tries to
+    handle as many possibilities as possible, including things like all
+    elements equal to some non-constant value plus an integer offset, etc.
+    Needless to say (the halting problem and all that), it may return false
+    for some vectors that are in fact linear.
+    */
+extern bool LLVMVectorIsLinear(llvm::Value *v, int stride);
+
+/** Given a vector-typed value v, if the vector is a vector with constant
+    element values, this function extracts those element values into the
+    ret[] array and returns the number of elements (i.e. the vector type's
+    width) in *nElts.  It returns true if successful and false if the given
+    vector is not in fact a vector of constants. */
+extern bool LLVMExtractVectorInts(llvm::Value *v, int64_t ret[], int *nElts);
+
+/** This function takes chains of InsertElement instructions along the
+    lines of:
+
+    %v0 = insertelement undef, value_0, i32 index_0
+    %v1 = insertelement %v0,   value_1, i32 index_1
+    ...
+    %vn = insertelement %vn-1, value_n-1, i32 index_n-1
+
+    and initializes the provided elements array such that the i'th
+    llvm::Value * in the array is the element that was inserted into the
+    i'th element of the vector.  
+
+    When the chain of insertelement instructions comes to an end, the only
+    base case that this function handles is the initial value being a
+    constant vector.  For anything more complex (e.g. some other arbitrary
+    value), it doesn't try to extract element values into the returned
+    array.
+ */
+extern void LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
+                                   llvm::Value **elements);
+
+/** This is a utility routine for debugging that dumps out the given LLVM
+    value as well as (recursively) all of the other values that it depends
+    on. */
+extern void LLVMDumpValue(llvm::Value *v);
+
+/** Given a vector-typed value, this function returns the value of its
+    first element.  Rather than just doing the straightforward thing of
+    using a single extractelement instruction to do this, this function
+    tries to rewrite the computation for the first element in scalar form;
+    this is generally more efficient than computing the entire vector's
+    worth of values just to extract the first element, in cases where only
+    the first element's value is needed.
+  */
+extern llvm::Value *LLVMExtractFirstVectorElement(llvm::Value *v, 
+                                              llvm::Instruction *insertBefore);
+
+/** This function takes two vectors, expected to be the same length, and
+    returns a new vector of twice the length that represents concatenating
+    the two of them. */
+extern llvm::Value *LLVMConcatVectors(llvm::Value *v1, llvm::Value *v2, 
+                                      llvm::Instruction *insertBefore);
+
+/** This is a utility function for vector shuffling; it takes two vectors
+    v1 and v2, and a compile-time constant set of integer permutations in
+    shuf[] and returns a new vector of length shufSize that represents the
+    corresponding shufflevector operation. */
+extern llvm::Value *LLVMShuffleVectors(llvm::Value *v1, llvm::Value *v2,
+                                       int32_t shuf[], int shufSize,
+                                       llvm::Instruction *insertBefore);
+
 #endif // ISPC_LLVMUTIL_H
--- a/main.cpp
+++ b/main.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -37,8 +37,13 @@

 #include "ispc.h"
 #include "module.h"
+#include "util.h"
+#include "type.h"
 #include <stdio.h>
 #include <stdlib.h>
+#ifdef ISPC_IS_WINDOWS
+  #include <time.h>
+#endif // ISPC_IS_WINDOWS
 #include <llvm/Support/PrettyStackTrace.h>
 #include <llvm/Support/Signals.h>
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
@@ -52,29 +57,53 @@

 #ifdef ISPC_IS_WINDOWS
 #define strcasecmp stricmp
+#ifndef BUILD_DATE
 #define BUILD_DATE __DATE__
+#endif
 #define BUILD_VERSION ""
 #endif // ISPC_IS_WINDOWS

-static void usage(int ret) {
-    printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", 
-           BUILD_DATE, BUILD_VERSION);
-    printf("usage: ispc\n");
+/** Prints a one-line banner to stdout giving ISPC_VERSION, BUILD_VERSION,
+    BUILD_DATE and the LLVM release selected by the LLVM_* preprocessor
+    symbols.  Compilation stops with an #error if none of the recognized
+    LLVM_* symbols is defined. */
+static void
+lPrintVersion() {
+#if defined(LLVM_2_9)
+    const char *llvmVersion = "2.9";
+#elif defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    const char *llvmVersion = "3.0";
+#elif defined(LLVM_3_1) || defined(LLVM_3_1svn)
+    const char *llvmVersion = "3.1";
+#else
+#error "Unhandled LLVM version"
+#endif 
+    printf("Intel(r) SPMD Program Compiler (ispc), %s (build %s @ %s, LLVM %s)\n", 
+           ISPC_VERSION, BUILD_VERSION, BUILD_DATE, llvmVersion);
+}
+
+
+static void
+usage(int ret) {
+    lPrintVersion();
+    printf("\nusage: ispc\n");
    printf("    [--addressing={32,64}]\t\tSelect 32- or 64-bit addressing. (Note that 32-bit\n");
    printf("                          \t\taddressing calculations are done by default, even\n");
    printf("                          \t\ton 64-bit target architectures.)\n");
    printf("    [--arch={%s}]\t\tSelect target architecture\n", 
           Target::SupportedTargetArchs());
+    printf("    [--c++-include-file=<name>]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n");
    printf("    [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
    printf("         <cpu>={%s}\n", Target::SupportedTargetCPUs());
    printf("    [-D<foo>]\t\t\t\t#define given value when running preprocessor\n");
-    printf("    [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
    printf("    [--emit-asm]\t\t\tGenerate assembly language file as output\n");
+#ifndef LLVM_2_9
+    printf("    [--emit-c++]\t\t\tEmit a C++ source file as output\n");
+#endif // !LLVM_2_9
    printf("    [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
    printf("    [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
    printf("    [-g]\t\t\t\tGenerate debugging information\n");
    printf("    [--help]\t\t\t\tPrint help\n");
+    printf("    [--help-dev]\t\t\tPrint help for developer options\n");
    printf("    [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
+    printf("    [-I <path>]\t\t\t\tAdd <path> to #include file search path\n");
    printf("    [--instrument]\t\t\tEmit instrumentation to gather performance data\n");
    printf("    [--math-lib=<option>]\t\tSelect math library\n");
    printf("        default\t\t\t\tUse ispc's built-in math functions\n");
@@ -90,20 +119,10 @@ static void usage(int ret) {
    printf("        disable-loop-unroll\t\tDisable loop unrolling.\n");
    printf("        fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
    printf("        fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
-#if 0
-    printf("        disable-handle-pseudo-memory-ops\n");
-    printf("        disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
-    printf("        disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
-    printf("        disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
-    printf("        disable-gather-scatter-optimizations\tDisable improvements to gather/scatter\n");
-    printf("        disable-blending-removal\t\tDisable eliminating blend at same scope\n");
-    printf("        disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
-    printf("        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
-    printf("        disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
-#endif
 #ifndef ISPC_IS_WINDOWS
    printf("    [--pic]\t\t\t\tGenerate position-independent code\n");
 #endif // !ISPC_IS_WINDOWS
+    printf("    [--quiet]\t\t\t\tSuppress all output\n");
    printf("    [--target=<isa>]\t\t\tSelect target ISA. <isa>={%s}\n", Target::SupportedTargetISAs());
    printf("    [--version]\t\t\t\tPrint ispc version\n");
    printf("    [--werror]\t\t\t\tTreat warnings as errors\n");
@@ -114,11 +133,33 @@ static void usage(int ret) {
 }


+/** Prints the version banner followed by the help text for the
+    developer-oriented command-line options, then ends the process with
+    exit status 'ret'. */
+static void
+devUsage(int ret) {
+    // One entry per output line, printed in order after the banner.
+    static const char *const helpText[] = {
+        "\nusage (developer options): ispc\n",
+        "    [--debug]\t\t\t\tPrint information useful for debugging ispc\n",
+        "    [--fuzz-test]\t\t\tRandomly perturb program input to test error conditions\n",
+        "    [--fuzz-seed=<value>]\t\tSeed value for RNG for fuzz testing\n",
+        "    [--opt=<option>]\t\t\tSet optimization option\n",
+        "        disable-all-on-optimizations\t\tDisable optimizations that take advantage of \"all on\" mask\n",
+        "        disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n",
+        "        disable-blending-removal\t\tDisable eliminating blend at same scope\n",
+        "        disable-coalescing\t\t\tDisable gather coalescing\n",
+        "        disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n",
+        "        disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n",
+        "        disable-gather-scatter-optimizations\tDisable improvements to gather/scatter\n",
+        "        disable-handle-pseudo-memory-ops\tLeave __pseudo_* calls for gather/scatter/etc. in final IR\n",
+        "        disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n",
+        "        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n",
+        "    [--yydebug]\t\t\t\tPrint debugging information during parsing\n",
+    };
+    const int nLines = sizeof(helpText) / sizeof(helpText[0]);
+
+    lPrintVersion();
+    for (int i = 0; i < nLines; ++i)
+        printf("%s", helpText[i]);
+    exit(ret);
+}
+

 /** We take arguments from both the command line as well as from the
    ISPC_ARGS environment variable.  This function returns a new set of
    arguments representing the ones from those two sources merged together.
- */ 
+*/ 
 static void lGetAllArgs(int Argc, char *Argv[], int &argc, char *argv[128]) {
    // Copy over the command line arguments (passed in)
    for (int i = 0; i < Argc; ++i)
@@ -166,10 +207,12 @@ int main(int Argc, char *Argv[]) {
    char *argv[128];
    lGetAllArgs(Argc, Argv, argc, argv);

+#if 0
    // Use LLVM's little utility function to print out nice stack traces if
    // we crash
    llvm::sys::PrintStackTraceOnErrorSignal();
    llvm::PrettyStackTraceProgram X(argc, argv);
+#endif

    // initialize available LLVM targets
    LLVMInitializeX86TargetInfo();
@@ -184,6 +227,7 @@ int main(int Argc, char *Argv[]) {
    char *file = NULL;
    const char *headerFileName = NULL;
    const char *outFileName = NULL;
+    const char *includeFileName = NULL;

    // Initiailize globals early so that we can set various option values
    // as we're parsing below
@@ -197,13 +241,15 @@ int main(int Argc, char *Argv[]) {
    for (int i = 1; i < argc; ++i) {
        if (!strcmp(argv[i], "--help"))
            usage(0);
+        if (!strcmp(argv[i], "--help-dev"))
+            devUsage(0);
        else if (!strncmp(argv[i], "-D", 2))
            g->cppArgs.push_back(argv[i]);
        else if (!strncmp(argv[i], "--addressing=", 13)) {
            if (atoi(argv[i] + 13) == 64)
                g->opt.force32BitAddressing = false;
            else if (atoi(argv[i] + 13) == 32)
-                g->opt.force32BitAddressing = 32;
+                g->opt.force32BitAddressing = true;
            else {
                fprintf(stderr, "Addressing width \"%s\" invalid--only 32 and "
                        "64 are allowed.\n", argv[i]+13);
@@ -233,13 +279,33 @@ int main(int Argc, char *Argv[]) {
        }
        else if (!strcmp(argv[i], "--emit-asm"))
            ot = Module::Asm;
+#ifndef LLVM_2_9
+        else if (!strcmp(argv[i], "--emit-c++"))
+            ot = Module::CXX;
+#endif // !LLVM_2_9
        else if (!strcmp(argv[i], "--emit-llvm"))
            ot = Module::Bitcode;
        else if (!strcmp(argv[i], "--emit-obj"))
            ot = Module::Object;
+        else if (!strcmp(argv[i], "-I")) {
+            if (++i == argc) {
+                fprintf(stderr, "No path specified after -I option.\n");
+                usage(1);
+            }
+            g->includePath.push_back(argv[i]);
+        }
+        else if (!strncmp(argv[i], "-I", 2))
+            g->includePath.push_back(argv[i]+2);
+        else if (!strcmp(argv[i], "--fuzz-test"))
+            g->enableFuzzTest = true;
+        else if (!strncmp(argv[i], "--fuzz-seed=", 12))
+            g->fuzzTestSeed = atoi(argv[i] + 12);
        else if (!strcmp(argv[i], "--target")) {
            // FIXME: should remove this way of specifying the target...
-            if (++i == argc) usage(1);
+            if (++i == argc) {
+                fprintf(stderr, "No target specified after --target option.\n");
+                usage(1);
+            }
            target = argv[i];
        }
        else if (!strncmp(argv[i], "--target=", 9))
@@ -254,8 +320,10 @@ int main(int Argc, char *Argv[]) {
                g->mathLib = Globals::Math_SVML;
            else if (!strcmp(lib, "system"))
                g->mathLib = Globals::Math_System;
-            else
+            else {
+                fprintf(stderr, "Unknown --math-lib= option \"%s\".\n", lib);
                usage(1);
+            }
        }
        else if (!strncmp(argv[i], "--opt=", 6)) {
            const char *opt = argv[i] + 6;
@@ -270,6 +338,10 @@ int main(int Argc, char *Argv[]) {

            // These are only used for performance tests of specific
            // optimizations
+            else if (!strcmp(opt, "disable-all-on-optimizations"))
+                g->opt.disableMaskAllOnOptimizations = true;
+            else if (!strcmp(opt, "disable-coalescing"))
+                g->opt.disableCoalescing = true;
            else if (!strcmp(opt, "disable-handle-pseudo-memory-ops"))
                g->opt.disableHandlePseudoMemoryOps = true;
            else if (!strcmp(opt, "disable-blended-masked-stores"))
@@ -286,10 +358,10 @@ int main(int Argc, char *Argv[]) {
                g->opt.disableGatherScatterFlattening = true;
            else if (!strcmp(opt, "disable-uniform-memory-optimizations"))
                g->opt.disableUniformMemoryOptimizations = true;
-            else if (!strcmp(opt, "disable-masked-store-optimizations"))
-                g->opt.disableMaskedStoreOptimizations = true;
-            else 
+            else {
+                fprintf(stderr, "Unknown --opt= option \"%s\".\n", opt);
                usage(1);
+            }
        }
        else if (!strcmp(argv[i], "--woff") || !strcmp(argv[i], "-woff")) {
            g->disableWarnings = true;
@@ -302,18 +374,27 @@ int main(int Argc, char *Argv[]) {
        else if (!strcmp(argv[i], "--wno-perf") || !strcmp(argv[i], "-wno-perf"))
            g->emitPerfWarnings = false;
        else if (!strcmp(argv[i], "-o")) {
-            if (++i == argc) usage(1);
+            if (++i == argc) {
+                fprintf(stderr, "No output file specified after -o option.\n");
+                usage(1);
+            }
            outFileName = argv[i];
        }
-        else if (!strcmp(argv[i], "--outfile="))
+        else if (!strncmp(argv[i], "--outfile=", 10))
            outFileName = argv[i] + strlen("--outfile=");
        else if (!strcmp(argv[i], "-h")) {
-            if (++i == argc) usage(1);
+            if (++i == argc) {
+                fprintf(stderr, "No header file name specified after -h option.\n");
+                usage(1);
+            }
            headerFileName = argv[i];
        }
-        else if (!strcmp(argv[i], "--header-outfile=")) {
+        else if (!strncmp(argv[i], "--header-outfile=", 17)) {
            headerFileName = argv[i] + strlen("--header-outfile=");
        }
+        else if (!strncmp(argv[i], "--c++-include-file=", 19)) {
+            includeFileName = argv[i] + strlen("--c++-include-file=");
+        }
        else if (!strcmp(argv[i], "-O0")) {
            g->opt.level = 0;
            optSet = true;
@@ -333,16 +414,26 @@ int main(int Argc, char *Argv[]) {
        else if (!strcmp(argv[i], "--pic"))
            generatePIC = true;
 #endif // !ISPC_IS_WINDOWS
+        else if (!strcmp(argv[i], "--quiet"))
+            g->quiet = true;
+        else if (!strcmp(argv[i], "--yydebug")) {
+            extern int yydebug;
+            yydebug = 1;
+        }
        else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
-            printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n", 
-                   BUILD_DATE, BUILD_VERSION);
+            lPrintVersion();
            return 0;
        }
-        else if (argv[i][0] == '-')
+        else if (argv[i][0] == '-') {
+            fprintf(stderr, "Unknown option \"%s\".\n", argv[i]);
            usage(1);
+        }
        else {
-            if (file != NULL)
+            if (file != NULL) {
+                fprintf(stderr, "Multiple input files specified on command "
+                        "line: \"%s\" and \"%s\".\n", file, argv[i]);
                usage(1);
+            }
            else
                file = argv[i];
        }
@@ -354,6 +445,30 @@ int main(int Argc, char *Argv[]) {
    if (debugSet && !optSet)
        g->opt.level = 0;

+    if (g->enableFuzzTest) {
+        if (g->fuzzTestSeed == -1) {
+#ifdef ISPC_IS_WINDOWS
+            int seed = (unsigned)time(NULL);
+#else
+            int seed = getpid();
+#endif
+            g->fuzzTestSeed = seed;
+            Warning(SourcePos(), "Using seed %d for fuzz testing", 
+                    g->fuzzTestSeed);
+        }
+#ifdef ISPC_IS_WINDOWS
+        srand(g->fuzzTestSeed);
+#else
+        srand48(g->fuzzTestSeed);
+#endif
+    }
+
+    if (outFileName == NULL && headerFileName == NULL)
+        Warning(SourcePos(), "No output file or header file name specified. "
+                "Program will be compiled and warnings/errors will "
+                "be issued, but no output will be generated.");
+
    return Module::CompileAndOutput(file, arch, cpu, target, generatePIC,
-                                    ot, outFileName, headerFileName);
+                                    ot, outFileName, headerFileName, 
+                                    includeFileName);
 }
--- a/module.cpp
+++ b/module.cpp
@@ -49,7 +49,6 @@
 #include "llvmutil.h"

 #include <stdio.h>
-#include <assert.h>
 #include <stdarg.h>
 #include <ctype.h>
 #include <sys/types.h>
@@ -77,7 +76,6 @@
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
-#include <llvm/PassManager.h>
 #include <llvm/Analysis/Verifier.h>
 #include <llvm/Support/CFG.h>
 #include <clang/Frontend/CompilerInstance.h>
@@ -150,8 +148,13 @@ extern void yy_delete_buffer(YY_BUFFER_STATE);

 int
 Module::CompileFile() {
+#ifndef LLVM_3_1svn
    if (g->opt.fastMath == true)
        llvm::UnsafeFPMath = true;
+#endif // !LLVM_3_1svn
+
+    extern void ParserInit();
+    ParserInit();

    // FIXME: it'd be nice to do this in the Module constructor, but this
    // function ends up calling into routines that expect the global
@@ -222,18 +225,24 @@ Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
    if (sym == NULL || sym->type == NULL) {
        // But if these are NULL and there haven't been any previous
        // errors, something surprising is going on
-        assert(errorCount > 0);
+        Assert(errorCount > 0);
        return;
    }

    if (symbolTable->LookupFunction(sym->name.c_str())) {
-        Error(sym->pos, "Global variable \"%s\" shadows previously-declared function.",
-              sym->name.c_str());
+        Error(sym->pos, "Global variable \"%s\" shadows previously-declared "
+              "function.", sym->name.c_str());
        return;
    }

    if (sym->storageClass == SC_EXTERN_C) {
-        Error(sym->pos, "extern \"C\" qualifier can only be used for functions.");
+        Error(sym->pos, "extern \"C\" qualifier can only be used for "
+              "functions.");
+        return;
+    }
+
+    if (Type::Equal(sym->type, AtomicType::Void)) {
+        Error(sym->pos, "\"void\" type global variable is illegal.");
        return;
    }

@@ -262,7 +271,7 @@ Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
                  "global variable \"%s\".", sym->name.c_str());
    }
    else if (initExpr != NULL) {
-        initExpr = initExpr->TypeCheck();
+        initExpr = TypeCheck(initExpr);
        if (initExpr != NULL) {
            // We need to make sure the initializer expression is
            // the same type as the global.  (But not if it's an
@@ -272,7 +281,7 @@ Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
                initExpr = TypeConvertExpr(initExpr, sym->type, "initializer");
            
            if (initExpr != NULL) {
-                initExpr = initExpr->Optimize();
+                initExpr = Optimize(initExpr);
                // Fingers crossed, now let's see if we've got a
                // constant value..
                llvmInitializer = initExpr->GetConstant(sym->type);
@@ -319,23 +328,38 @@ Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
 }


+/** Given an arbitrary type, see if it or any of the leaf types contained
+    in it has a type that's illegal to have exported to C/C++
+    code--specifically, that it has a varying value in memory, or a pointer
+    to SOA data (which has a different representation than a regular
+    pointer).

+    (Note that it's fine for the original struct or a contained struct to
+    be varying, so long as all of its members have bound 'uniform'
+    variability.) 

-/** Given an arbitrary type, see if it or any of the types contained in it
-    are varying.  Returns true if so, false otherwise. 
+    This function returns true if any illegal types are found and false
+    otherwise; reporting the error is left to the caller.
 */
 static bool
-lRecursiveCheckVarying(const Type *t) {
-    t = t->GetBaseType();
-    if (t->IsVaryingType()) return true;
-
+lRecursiveCheckValidParamType(const Type *t) {
    const StructType *st = dynamic_cast<const StructType *>(t);
-    if (st) {
+    if (st != NULL) {
        for (int i = 0; i < st->GetElementCount(); ++i)
-            if (lRecursiveCheckVarying(st->GetElementType(i)))
+            if (lRecursiveCheckValidParamType(st->GetElementType(i)))
                return true;
+        return false;
    }
-    return false;
+
+    const SequentialType *seqt = dynamic_cast<const SequentialType *>(t);
+    if (seqt != NULL)
+        return lRecursiveCheckValidParamType(seqt->GetElementType());
+
+    const PointerType *pt = dynamic_cast<const PointerType *>(t);
+    if (pt != NULL)
+        return (pt->IsSlice() || pt->IsVaryingType());
+
+    return t->IsVaryingType();
 }


@@ -347,7 +371,7 @@ lRecursiveCheckVarying(const Type *t) {
 static void
 lCheckForVaryingParameter(const Type *type, const std::string &name, 
                          SourcePos pos) {
-    if (lRecursiveCheckVarying(type)) {
+    if (lRecursiveCheckValidParamType(type)) {
        const Type *t = type->GetBaseType();
        if (dynamic_cast<const StructType *>(t))
            Error(pos, "Struct parameter \"%s\" with varying member(s) is illegal "
@@ -360,10 +384,8 @@ lCheckForVaryingParameter(const Type *type, const std::string &name,


 /** Given a function type, loop through the function parameters and see if
-    any are StructTypes.  If so, issue an error (this seems to be broken
-    currently).
-
-    @todo Fix passing structs from C/C++ to ispc functions.
+    any are StructTypes.  If so, issue an error: passing structs to/from
+    application functions is currently broken (https://github.com/ispc/ispc/issues/3).
 */
 static void
 lCheckForStructParameters(const FunctionType *ftype, SourcePos pos) {
@@ -371,7 +393,7 @@ lCheckForStructParameters(const FunctionType *ftype, SourcePos pos) {
        const Type *type = ftype->GetParameterType(i);
        if (dynamic_cast<const StructType *>(type) != NULL) {
            Error(pos, "Passing structs to/from application functions is "
-                  "currently broken. Use a pointer or const pointer to the "
+                  "currently broken.  Use a pointer or const pointer to the "
                  "struct instead for now.");
            return;
        }
@@ -389,7 +411,7 @@ void
 Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
    const FunctionType *functionType = 
        dynamic_cast<const FunctionType *>(funSym->type);
-    assert(functionType != NULL);
+    Assert(functionType != NULL);

    // If a global variable with the same name has already been declared
    // issue an error.
@@ -416,7 +438,7 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
            // allowed.
            const FunctionType *ofType = 
                dynamic_cast<const FunctionType *>(overloadFunc->type);
-            assert(ofType != NULL);
+            Assert(ofType != NULL);
            if (ofType->GetNumParameters() == functionType->GetNumParameters()) {
                int i;
                for (i = 0; i < functionType->GetNumParameters(); ++i) {
@@ -479,10 +501,14 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
    llvm::GlobalValue::LinkageTypes linkage = (funSym->storageClass == SC_STATIC ||
                                               isInline) ?
        llvm::GlobalValue::InternalLinkage : llvm::GlobalValue::ExternalLinkage;
-    std::string functionName = ((funSym->storageClass == SC_EXTERN_C) ?
-                                funSym->name : funSym->MangledName());
-    if (g->mangleFunctionsWithTarget)
-        functionName += g->target.GetISAString();
+    std::string functionName;
+    if (funSym->storageClass == SC_EXTERN_C)
+        functionName = funSym->name;
+    else {
+        functionName = funSym->MangledName();
+        if (g->mangleFunctionsWithTarget)
+            functionName += g->target.GetISAString();
+    }
    llvm::Function *function = 
        llvm::Function::Create(llvmFunctionType, linkage, functionName.c_str(), 
                               module);
@@ -500,11 +526,12 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
    // Make sure that the return type isn't 'varying' if the function is
    // 'export'ed.
    if (funSym->storageClass == SC_EXPORT && 
-        lRecursiveCheckVarying(functionType->GetReturnType()))
+        lRecursiveCheckValidParamType(functionType->GetReturnType()))
        Error(funSym->pos, "Illegal to return a \"varying\" type from exported "
              "function \"%s\"", funSym->name.c_str());

-    if (functionType->isTask && (functionType->GetReturnType() != AtomicType::Void))
+    if (functionType->isTask && 
+        Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false)
        Error(funSym->pos, "Task-qualified functions must have void return type.");

    if (functionType->isExported || functionType->isExternC)
@@ -571,7 +598,7 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
    // Finally, we know all is good and we can add the function to the
    // symbol table
    bool ok = symbolTable->AddFunction(funSym);
-    assert(ok);
+    Assert(ok);
 }


@@ -583,7 +610,8 @@ Module::AddFunctionDefinition(Symbol *sym, const std::vector<Symbol *> &args,


 bool
-Module::writeOutput(OutputType outputType, const char *outFileName) {
+Module::writeOutput(OutputType outputType, const char *outFileName,
+                    const char *includeFileName) {
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
    if (diBuilder != NULL && outputType != Header)
        diBuilder->finalize();
@@ -609,6 +637,14 @@ Module::writeOutput(OutputType outputType, const char *outFileName) {
            if (strcasecmp(suffix, "o") && strcasecmp(suffix, "obj"))
                fileType = "object";
            break;
+#ifndef LLVM_2_9
+        case CXX:
+            if (strcasecmp(suffix, "c") && strcasecmp(suffix, "cc") &&
+                strcasecmp(suffix, "c++") && strcasecmp(suffix, "cxx") &&
+                strcasecmp(suffix, "cpp"))
+                fileType = "c++";
+            break;
+#endif // !LLVM_2_9
        case Header:
            if (strcasecmp(suffix, "h") && strcasecmp(suffix, "hh") &&
                strcasecmp(suffix, "hpp"))
@@ -622,12 +658,18 @@ Module::writeOutput(OutputType outputType, const char *outFileName) {

    if (outputType == Header)
        return writeHeader(outFileName);
-    else {
-        if (outputType == Bitcode)
-            return writeBitcode(module, outFileName);
-        else
-            return writeObjectFileOrAssembly(outputType, outFileName);
+    else if (outputType == Bitcode)
+        return writeBitcode(module, outFileName);
+#ifndef LLVM_2_9
+    else if (outputType == CXX) {
+        extern bool WriteCXXFile(llvm::Module *module, const char *fn, 
+                                 int vectorWidth, const char *includeName);
+        return WriteCXXFile(module, outFileName, g->target.vectorWidth,
+                            includeFileName);
    }
+#endif // !LLVM_2_9
+    else
+        return writeObjectFileOrAssembly(outputType, outFileName);
 }


@@ -729,7 +771,7 @@ static void
 lVisitNode(const StructType *structType, 
           std::map<const StructType *, StructDAGNode *> &structToNode,
           std::vector<const StructType *> &sortedTypes) {
-    assert(structToNode.find(structType) != structToNode.end());
+    Assert(structToNode.find(structType) != structToNode.end());
    // Get the node that encodes the structs that this one is immediately
    // dependent on.
    StructDAGNode *node = structToNode[structType];
@@ -793,13 +835,19 @@ lEmitStructDecls(std::vector<const StructType *> &structTypes, FILE *file) {
        if (hasIncomingEdges.find(structType) == hasIncomingEdges.end())
            lVisitNode(structType, structToNode, sortedTypes);
    }
-    assert(sortedTypes.size() == structTypes.size());
+    Assert(sortedTypes.size() == structTypes.size());

    // And finally we can emit the struct declarations by going through the
    // sorted ones in order.
    for (unsigned int i = 0; i < sortedTypes.size(); ++i) {
        const StructType *st = sortedTypes[i];
-        fprintf(file, "struct %s {\n", st->GetStructName().c_str());
+        fprintf(file, "struct %s", st->GetStructName().c_str());
+        if (st->GetSOAWidth() > 0)
+            // This has to match the naming scheme in
+            // StructType::GetCDeclaration().
+            fprintf(file, "_SOA%d", st->GetSOAWidth());
+        fprintf(file, " {\n");
+
        for (int j = 0; j < st->GetElementCount(); ++j) {
            const Type *type = st->GetElementType(j)->GetAsNonConstType();
            std::string d = type->GetCDeclaration(st->GetElementName(j));
@@ -828,10 +876,10 @@ lEmitEnumDecls(const std::vector<const EnumType *> &enumTypes, FILE *file) {
        // Print the individual enumerators 
        for (int j = 0; j < enumTypes[i]->GetEnumeratorCount(); ++j) {
            const Symbol *e = enumTypes[i]->GetEnumerator(j);
-            assert(e->constValue != NULL);
+            Assert(e->constValue != NULL);
            unsigned int enumValue;
            int count = e->constValue->AsUInt32(&enumValue);
-            assert(count == 1);
+            Assert(count == 1);

            // Always print an initializer to set the value.  We could be
            // 'clever' here and detect whether the implicit value given by
@@ -897,7 +945,7 @@ lAddTypeIfNew(const Type *type, std::vector<const T *> *exportedTypes) {
            return;

    const T *castType = dynamic_cast<const T *>(type);
-    assert(castType != NULL);
+    Assert(castType != NULL);
    exportedTypes->push_back(castType);
 }

@@ -934,7 +982,7 @@ lGetExportedTypes(const Type *type,
    else if (dynamic_cast<const VectorType *>(type) != NULL)
        lAddTypeIfNew(type, exportedVectorTypes);
    else
-        assert(dynamic_cast<const AtomicType *>(type) != NULL);
+        Assert(dynamic_cast<const AtomicType *>(type) != NULL);
 }


@@ -965,7 +1013,7 @@ lPrintFunctionDeclarations(FILE *file, const std::vector<Symbol *> &funcs) {
    fprintf(file, "#ifdef __cplusplus\nextern \"C\" {\n#endif // __cplusplus\n");
    for (unsigned int i = 0; i < funcs.size(); ++i) {
        const FunctionType *ftype = dynamic_cast<const FunctionType *>(funcs[i]->type);
-        assert(ftype);
+        Assert(ftype);
        std::string decl = ftype->GetCDeclaration(funcs[i]->name);
        fprintf(file, "    extern %s;\n", decl.c_str());
    }
@@ -977,9 +1025,10 @@ static void
 lPrintExternGlobals(FILE *file, const std::vector<Symbol *> &externGlobals) {
    for (unsigned int i = 0; i < externGlobals.size(); ++i) {
        Symbol *sym = externGlobals[i];
-        if (lRecursiveCheckVarying(sym->type))
-            Warning(sym->pos, "Not emitting declaration for symbol \"%s\" into generated "
-                    "header file since it (or some of its members) are varying.",
+        if (lRecursiveCheckValidParamType(sym->type))
+            Warning(sym->pos, "Not emitting declaration for symbol \"%s\" into "
+                    "generated header file since it (or some of its members) "
+                    "has types that are illegal in exported symbols.",
                    sym->name.c_str());
        else
            fprintf(file, "extern %s;\n", sym->type->GetCDeclaration(sym->name).c_str());
@@ -990,7 +1039,7 @@ lPrintExternGlobals(FILE *file, const std::vector<Symbol *> &externGlobals) {
 static bool
 lIsExported(const Symbol *sym) {
    const FunctionType *ft = dynamic_cast<const FunctionType *>(sym->type);
-    assert(ft);
+    Assert(ft);
    return ft->isExported;
 }

@@ -998,7 +1047,7 @@ lIsExported(const Symbol *sym) {
 static bool
 lIsExternC(const Symbol *sym) {
    const FunctionType *ft = dynamic_cast<const FunctionType *>(sym->type);
-    assert(ft);
+    Assert(ft);
    return ft->isExternC;
 }

@@ -1150,6 +1199,24 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
    inst.createSourceManager(inst.getFileManager());
    inst.InitializeSourceManager(infilename);

+    // Don't remove comments in the preprocessor, so that we can accurately
+    // track the source file position by handling them ourselves.
+    inst.getPreprocessorOutputOpts().ShowComments = 1;
+
+    clang::HeaderSearchOptions &headerOpts = inst.getHeaderSearchOpts();
+    headerOpts.UseBuiltinIncludes = 0;
+#ifndef LLVM_2_9
+    headerOpts.UseStandardSystemIncludes = 0;
+#endif // !LLVM_2_9
+    headerOpts.UseStandardCXXIncludes = 0;
+    if (g->debugPrint)
+        headerOpts.Verbose = 1;
+    for (int i = 0; i < (int)g->includePath.size(); ++i)
+        headerOpts.AddPath(g->includePath[i], clang::frontend::Angled,
+                           true /* is user supplied */,
+                           false /* not a framework */,
+                           true /* ignore sys root */);
+
    clang::PreprocessorOptions &opts = inst.getPreprocessorOpts();

    // Add defs for ISPC and PI
@@ -1157,19 +1224,19 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
    opts.addMacroDef("PI=3.1415926535");

    // Add #define for current compilation target
-    switch (g->target.isa) {
-    case Target::SSE2:
-        opts.addMacroDef("ISPC_TARGET_SSE2");
-        break;
-    case Target::SSE4:
-        opts.addMacroDef("ISPC_TARGET_SSE4");
-        break;
-    case Target::AVX:
-        opts.addMacroDef("ISPC_TARGET_AVX");
-        break;
-    default:
-        FATAL("Unhandled target ISA in preprocessor symbol definition");
+    char targetMacro[128];
+    sprintf(targetMacro, "ISPC_TARGET_%s", g->target.GetISAString());
+    char *p = targetMacro;
+    while (*p) {
+        *p = toupper(*p);
+        ++p;
    }
+    opts.addMacroDef(targetMacro);
+
+    if (g->target.is32Bit)
+        opts.addMacroDef("ISPC_POINTER_SIZE=32");
+    else
+        opts.addMacroDef("ISPC_POINTER_SIZE=64");

    opts.addMacroDef("ISPC_MAJOR_VERSION=1");
    opts.addMacroDef("ISPC_MINOR_VERSION=1");
@@ -1317,7 +1384,7 @@ lExtractAndRewriteGlobals(llvm::Module *module,

            Symbol *sym = 
                m->symbolTable->LookupVariable(gv->getName().str().c_str());
-            assert(sym != NULL);
+            Assert(sym != NULL);
            globals->push_back(RewriteGlobalInfo(gv, init, sym->pos));
        }
    }
@@ -1366,9 +1433,9 @@ lAddExtractedGlobals(llvm::Module *module,
            if (globals[j].size() > 0) {
                // There should be the same number of globals in the other
                // vectors, in the same order.
-                assert(globals[firstActive].size() == globals[j].size());
+                Assert(globals[firstActive].size() == globals[j].size());
                llvm::GlobalVariable *gv2 = globals[j][i].gv;
-                assert(gv2->getName() == gv->getName());
+                Assert(gv2->getName() == gv->getName());

                // It is possible that the types may not match, though--for
                // example, this happens with varying globals if we compile
@@ -1422,7 +1489,7 @@ lCreateDispatchFunction(llvm::Module *module, llvm::Function *setISAFunc,

        // Grab the type of the function as well.
        if (ftype != NULL)
-            assert(ftype == funcs.func[i]->getFunctionType());
+            Assert(ftype == funcs.func[i]->getFunctionType());
        else
            ftype = funcs.func[i]->getFunctionType();

@@ -1510,7 +1577,7 @@ lCreateDispatchFunction(llvm::Module *module, llvm::Function *setISAFunc,
    // or some such, but we don't want to start imposing too much of a
    // runtime library requirement either...
    llvm::Function *abortFunc = module->getFunction("abort");
-    assert(abortFunc);
+    Assert(abortFunc);
    llvm::CallInst::Create(abortFunc, "", bblock);

    // Return an undef value from the function here; we won't get to this
@@ -1542,10 +1609,10 @@ lCreateDispatchModule(std::map<std::string, FunctionTargetVariants> &functions)

    // Get pointers to things we need below
    llvm::Function *setFunc = module->getFunction("__set_system_isa");
-    assert(setFunc != NULL);
+    Assert(setFunc != NULL);
    llvm::Value *systemBestISAPtr = 
        module->getGlobalVariable("__system_best_isa", true);
-    assert(systemBestISAPtr != NULL);
+    Assert(systemBestISAPtr != NULL);

    // For each exported function, create the dispatch function
    std::map<std::string, FunctionTargetVariants>::iterator iter;
@@ -1567,7 +1634,8 @@ lCreateDispatchModule(std::map<std::string, FunctionTargetVariants> &functions)
 int
 Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu, 
                         const char *target, bool generatePIC, OutputType outputType, 
-                         const char *outFileName, const char *headerFileName) {
+                         const char *outFileName, const char *headerFileName,
+                         const char *includeFileName) {
    if (target == NULL || strchr(target, ',') == NULL) {
        // We're only compiling to a single target
        if (!Target::GetTarget(arch, cpu, target, generatePIC, &g->target))
@@ -1576,12 +1644,15 @@ Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu,
        m = new Module(srcFile);
        if (m->CompileFile() == 0) {
            if (outFileName != NULL)
-                if (!m->writeOutput(outputType, outFileName))
+                if (!m->writeOutput(outputType, outFileName, includeFileName))
                    return 1;
            if (headerFileName != NULL)
                if (!m->writeOutput(Module::Header, headerFileName))
                    return 1;
        }
+        else
+            ++m->errorCount;
+
        int errorCount = m->errorCount;
        delete m;
        m = NULL;
@@ -1589,9 +1660,17 @@ Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu,
        return errorCount > 0;
    }
    else {
+#ifndef LLVM_2_9
+        if (outputType == CXX) {
+            Error(SourcePos(), "Illegal to specify more then one target when "
+                  "compiling C++ output.");
+            return 1;
+        }
+#endif // !LLVM_2_9
+
        // The user supplied multiple targets
        std::vector<std::string> targets = lExtractTargets(target);
-        assert(targets.size() > 1);
+        Assert(targets.size() > 1);

        if (outFileName != NULL && strcmp(outFileName, "-") == 0) {
            Error(SourcePos(), "Multi-target compilation can't generate output "
@@ -1668,7 +1747,7 @@ Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu,
        int i = 1;
        while (i < Target::NUM_ISAS && firstTargetMachine == NULL)
            firstTargetMachine = targetMachines[i++];
-        assert(firstTargetMachine != NULL);
+        Assert(firstTargetMachine != NULL);

        if (outFileName != NULL) {
            if (outputType == Bitcode)
--- a/module.h
+++ b/module.h
@@ -80,6 +80,9 @@ public:
    enum OutputType { Asm,      /** Generate text assembly language output */
                      Bitcode,  /** Generate LLVM IR bitcode output */
                      Object,   /** Generate a native object file */
+#ifndef LLVM_2_9
+                      CXX,      /** Generate a C++ file */
+#endif // !LLVM_2_9
                      Header    /** Generate a C/C++ header file with 
                                    declarations of 'export'ed functions, global
                                    variables, and the types used by them. */
@@ -108,6 +111,10 @@ public:
                              inclusion from C/C++ code with declarations of
                              types and functions exported from the given ispc
                              source file.
+        @param includeFileName If non-NULL, gives the filename for the C++ 
+                               backend to emit in an #include statement to
+                               get definitions of the builtins for the generic
+                               target.
        @return             Number of errors encountered when compiling
                            srcFile.
     */
@@ -115,7 +122,8 @@ public:
                                const char *cpu, const char *targets, 
                                bool generatePIC, OutputType outputType, 
                                const char *outFileName, 
-                                const char *headerFileName);
+                                const char *headerFileName, 
+                                const char *includeFileName);

    /** Total number of errors encountered during compilation. */
    int errorCount;
@@ -138,7 +146,8 @@ private:
        true on success, false if there has been an error.  The given
        filename may be NULL, indicating that output should go to standard
        output. */
-    bool writeOutput(OutputType ot, const char *filename);
+    bool writeOutput(OutputType ot, const char *filename,
+                     const char *includeFileName = NULL);
    bool writeHeader(const char *filename);
    bool writeObjectFileOrAssembly(OutputType outputType, const char *filename);
    static bool writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine,
--- a/opt.cpp
+++ b/opt.cpp
--- a/parse.yy
+++ b/parse.yy
--- a/run_tests.py
+++ b/run_tests.py
@@ -2,9 +2,6 @@

 # test-running driver for ispc

-# TODO: windows support (mostly should be calling CL.exe rather than gcc
-#   for static linking?)
-
 from optparse import OptionParser
 import multiprocessing
 from ctypes import c_int
@@ -15,35 +12,95 @@ import re
 import signal
 import random
 import string
-import mutex
 import subprocess
 import shlex
 import platform
+import tempfile
+
+# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5
+# Leopard; the git history has a workaround for that issue.
+
+is_windows = (platform.system() == 'Windows' or
+              'CYGWIN_NT' in platform.system())

 parser = OptionParser()
 parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests",
                  default=False, action="store_true")
-parser.add_option("-s", "--static-exe", dest="static_exe", 
-                  help="Create and run a regular executable for each test (rather than using the LLVM JIT).",
-                  default=False, action="store_true")
+parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics",
+                  default=None)
 parser.add_option('-t', '--target', dest='target',
-                  help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2)',
+                  help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16)',
                  default="sse4")
 parser.add_option('-a', '--arch', dest='arch',
                  help='Set architecture (x86, x86-64)',
                  default="x86-64")
+parser.add_option("-c", "--compiler", dest="compiler_exe", help="Compiler binary to use to run tests",
+                  default=None)
 parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization',
                  default=False, action="store_true")
+parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel',
+                  default="1024", type="int")
+parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output',
+                  default=False, action="store_true")
+parser.add_option('--wrap-exe', dest='wrapexe',
+                  help='Executable to wrap test runs with (e.g. "valgrind")',
+                  default="")

 (options, args) = parser.parse_args()

-# if no specific test files are specified, run all of the tests in tests/
-# and failing_tests/
+if not is_windows:
+    ispc_exe = "./ispc"
+else:
+    ispc_exe = "../Release/ispc.exe"
+
+is_generic_target = (options.target.find("generic-") != -1 and
+                     options.target != "generic-1")
+if is_generic_target and options.include_file == None:
+    if options.target == "generic-4":
+        sys.stderr.write("No generics #include specified; using examples/intrinsics/sse4.h\n")
+        options.include_file = "examples/intrinsics/sse4.h"
+    elif options.target == "generic-8":
+        sys.stderr.write("No generics #include specified and no default available for \"generic-8\" target.\n")
+        sys.exit(1)
+    elif options.target == "generic-16":
+        sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n")
+        options.include_file = "examples/intrinsics/generic-16.h"
+
+if options.compiler_exe == None:
+    if is_windows:
+        options.compiler_exe = "cl"
+    else:
+        options.compiler_exe = "g++"
+
+def fix_windows_paths(files):
+    """Return a new list of the given paths with each backslash replaced by '/'."""
+    # Use the str.replace method: the string.replace() module function
+    # only exists on Python 2.
+    return [ fn.replace('\\', '/') for fn in files ]
+
+    
+# if no specific test files are specified, run all of the tests in tests/,
+# failing_tests/, and tests_errors/
 if len(args) == 0:
    files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc") + \
        glob.glob("tests_errors/*ispc")
+    files = fix_windows_paths(files)
 else:
-    files = args
+    if is_windows:
+        argfiles = [ ]
+        for f in args:
+            # we have to glob ourselves if this is being run under a DOS
+            # shell..
+            argfiles += glob.glob(f)
+    else:
+        argfiles = args
+    # keep only .ispc files; f.lower() also works where string.lower() is gone
+    files = [ ]
+    for f in argfiles:
+        if os.path.splitext(f.lower())[1] != ".ispc":
+            sys.stdout.write("Ignoring file %s, which doesn't have an .ispc extension.\n" % f)
+        else:
+            files += [ f ]

 # randomly shuffle the tests if asked to do so
 if (options.random):
@@ -52,19 +109,14 @@ if (options.random):

 # counter
 total_tests = 0
-finished_tests_counter = multiprocessing.Value(c_int)

-# We'd like to use the Lock class from the multiprocessing package to
-# serialize accesses to finished_tests_counter.  Unfortunately, the version of
-# python that ships with OSX 10.5 has this bug:
-# http://bugs.python.org/issue5261.  Therefore, we use the (deprecated but
-# still available) mutex class.
-#finished_tests_counter_lock = multiprocessing.Lock()
-finished_tests_mutex = mutex.mutex()
+finished_tests_counter = multiprocessing.Value(c_int)
+finished_tests_counter_lock = multiprocessing.Lock()

 # utility routine to print an update on the number of tests that have been
-# finished.  Should be called with the mutex (or lock) held..
+# finished.  Should be called with the lock held..
 def update_progress(fn):
+    global total_tests
    finished_tests_counter.value = finished_tests_counter.value + 1
    progress_str = " Done %d / %d [%s]" % (finished_tests_counter.value, total_tests, fn)
    # spaces to clear out detrius from previous printing...
@@ -73,138 +125,196 @@ def update_progress(fn):
    progress_str += '\r'
    sys.stdout.write(progress_str)
    sys.stdout.flush()
-    finished_tests_mutex.unlock()

-fnull = open(os.devnull, 'w')
+def run_command(cmd):
+    # runs cmd without a shell; returns (return code, stdout + stderr text)
+    if options.verbose:
+        sys.stdout.write("Running: %s\n" % cmd)
+    sp = subprocess.Popen(shlex.split(cmd), stdin=None,
+                          stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE)
+    (out, err) = sp.communicate()
+    # "replace" keeps non-UTF-8 output bytes from raising UnicodeDecodeError
+    output = out.decode("utf-8", "replace") + err.decode("utf-8", "replace")
+
+    return (sp.returncode, output)

 # run the commands in cmd_list
-def run_cmds(cmd_list, filename, expect_failure):
-    for cmd in cmd_list:
-        if expect_failure:
-            failed = (subprocess.call(cmd, shell = True, stdout = fnull, stderr = fnull) != 0)
-        else:
-            failed = (os.system(cmd) != 0)
-        if failed:
-            break
+def run_cmds(compile_cmds, run_cmd, filename, expect_failure):
+    for cmd in compile_cmds:
+        (return_code, output) = run_command(cmd)
+        compile_failed = (return_code != 0)
+        if compile_failed:
+            sys.stdout.write("Compilation of test %s failed            \n" % filename)
+            if output != "":
+                sys.stdout.write("%s" % output)
+            return (1, 0)

-    surprise = ((expect_failure and not failed) or (not expect_failure and failed))
+    (return_code, output) = run_command(run_cmd)
+    run_failed = (return_code != 0)
+
+    surprise = ((expect_failure and not run_failed) or
+                (not expect_failure and run_failed))
    if surprise == True:
-        print "Test %s %s                 " % \
-            (filename, "unexpectedly passed" if expect_failure else "failed")
-    return surprise
+        sys.stderr.write("Test %s %s (return code %d)            \n" % \
+            (filename, "unexpectedly passed" if expect_failure else "failed",
+             return_code))
+    if output != "":
+        sys.stdout.write("%s\n" % output)
+    if surprise == True:
+        return (0, 1)
+    else:
+        return (0, 0)


-# pull tests to run from the given queue and run them.  Multiple copies of
-# this function will be running in parallel across all of the CPU cores of
-# the system.
-def run_tasks_from_queue(queue):
-    error_count = 0
-    while True:
-        filename = queue.get()
-        if (filename == 'STOP'):
-            sys.exit(error_count)
+def run_test(filename):
+    global is_windows
+    if is_windows:
+        input_prefix = "../"
+    else:
+        input_prefix = ""
+        
+    # is this a test to make sure an error is issued?
+    want_error = (filename.find("tests_errors") != -1)
+    if want_error == True:
+        ispc_cmd = ispc_exe + " --werror --nowrap %s --arch=%s --target=%s" % \
+            (input_prefix + filename, options.arch, options.target)
+        (return_code, output) = run_command(ispc_cmd)
+        got_error = (return_code != 0)

-        # is this a test to make sure an error is issued?
-        want_error = (filename.find("tests_errors") != -1)
-        if want_error == True:
-            ispc_cmd = "ispc --werror --nowrap %s --arch=%s --target=%s" % \
-                (filename, options.arch, options.target)
-            sp = subprocess.Popen(shlex.split(ispc_cmd), stdin=None, stdout=subprocess.PIPE,
-                                  stderr=subprocess.PIPE)
-            output = sp.communicate()[1]
-            got_error = (sp.returncode != 0)
-
-            # figure out the error message we're expecting
-            file = open(filename, 'r')
-            firstline = file.readline()
-            firstline = string.replace(firstline, "//", "")
-            firstline = string.lstrip(firstline)
-            firstline = string.rstrip(firstline)
-            file.close()
-
-            if (output.find(firstline) == -1):
-                print "Didn't see expected error message \"%s\" from test %s.\nActual outout: %s" % \
-                    (firstline, filename, output)
-                error_count += 1
-            elif got_error == False:
-                print "Unexpectedly no errors issued from test %s" % filename
-                error_count += 1
-            continue
+        # figure out the error message we're expecting
+        file = open(input_prefix + filename, 'r')
+        firstline = file.readline()
+        firstline = firstline.replace("//", "")
+        firstline = firstline.lstrip()
+        firstline = firstline.rstrip()
+        file.close()

+        if (output.find(firstline) == -1):
+            sys.stderr.write("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \
+                (firstline, filename, output))
+            return (1, 0)
+        elif got_error == False:
+            sys.stderr.write("Unexpectedly no errors issued from test %s\n" % filename)
+            return (1, 0)
+        else:
+            return (0, 0)
+    else:
        # do we expect this test to fail?
        should_fail = (filename.find("failing_") != -1)

-        if options.static_exe == True:
-            # if the user wants us to build a static executable to run for
-            # this test, we need to figure out the signature of the test
-            # function that this test has.
-            sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3, 
-                        "f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }
-            file = open(filename, 'r')
-            match = -1
-            for line in file:
-                # look for lines with 'export'...
-                if line.find("export") == -1:
-                    continue
-                # one of them should have a function with one of the
-                # declarations in sig2def
-                for pattern, ident in sig2def.items():
-                    if line.find(pattern) != -1:
-                        match = ident
-                        break
-            file.close()
-            if match == -1:
-                print "Fatal error: unable to find function signature in test %s" % filename
-                error_count += 1
+        # We need to figure out the signature of the test
+        # function that this test has.
+        sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3, 
+                    "f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }
+        file = open(input_prefix + filename, 'r')
+        match = -1
+        for line in file:
+            # look for lines with 'export'...
+            if line.find("export") == -1:
+                continue
+            # one of them should have a function with one of the
+            # declarations in sig2def
+            for pattern, ident in list(sig2def.items()):
+                if line.find(pattern) != -1:
+                    match = ident
+                    break
+        file.close()
+        if match == -1:
+            sys.stderr.write("Fatal error: unable to find function signature " + \
+                  "in test %s\n" % filename)
+            return (1, 0)
+        else:
+            global is_generic_target
+            if is_generic_target:
+                obj_name = "%s.cpp" % filename
+
+            if is_windows:
+                if not is_generic_target:
+                    obj_name = "%s%s.obj" % (input_prefix, filename)
+                exe_name = "%s%s.exe" % (input_prefix, filename)
+
+                cc_cmd = "%s /I. /I../winstuff /Zi /nologo /DTEST_SIG=%d %stest_static.cpp %s /Fe%s" % \
+                         (options.compiler_exe, match, input_prefix, obj_name, exe_name)
+                if should_fail:
+                    cc_cmd += " /DEXPECT_FAILURE"
            else:
-                obj_name = "%s.o" % filename
+                if not is_generic_target:
+                    obj_name = "%s.o" % filename
                exe_name = "%s.run" % filename
-                ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
-                    (filename, obj_name, options.arch, options.target)
-                if options.no_opt:
-                    ispc_cmd += " -O0" 
+
                if options.arch == 'x86':
                    gcc_arch = '-m32'
                else:
                    gcc_arch = '-m64'
-                gcc_cmd = "g++ %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
-                    (gcc_arch, match, filename, exe_name)
+                cc_cmd = "%s -O2 -msse4.2 -I. %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \
+                         (options.compiler_exe, gcc_arch, match, obj_name, exe_name)
                if platform.system() == 'Darwin':
-                    gcc_cmd += ' -Wl,-no_pie'
+                    cc_cmd += ' -Wl,-no_pie'
                if should_fail:
-                    gcc_cmd += " -DEXPECT_FAILURE"
-                    
-                # compile the ispc code, make the executable, and run it...
-                error_count += run_cmds([ispc_cmd, gcc_cmd, exe_name], filename, should_fail)
+                    cc_cmd += " -DEXPECT_FAILURE"

-                # clean up after running the test
+            ispc_cmd = ispc_exe + " --woff %s -o %s --arch=%s --target=%s" % \
+                       (input_prefix+filename, obj_name, options.arch, options.target)
+            if options.no_opt:
+                ispc_cmd += " -O0" 
+            if is_generic_target:
+                ispc_cmd += " --emit-c++ --c++-include-file=%s" % options.include_file
+
+        # compile the ispc code, make the executable, and run it...
+        (compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd], 
+                                              options.wrapexe + " " + exe_name, \
+                                              filename, should_fail)
+
+        # clean up after running the test
+        try:
+            if not run_error:
+                os.unlink(exe_name)
+                if is_windows:
+                    os.unlink("%s%s.pdb" % (input_prefix, filename))
+                    os.unlink("%s%s.ilk" % (input_prefix, filename))
+            os.unlink(obj_name)
+        except:
+            None
+
+        return (compile_error, run_error)
+
+# pull tests to run from the given queue and run them.  Multiple copies of
+# this function will be running in parallel across all of the CPU cores of
+# the system.
+def run_tasks_from_queue(queue, queue_ret):
+    if is_windows:
+        tmpdir = "tmp%d" % os.getpid()
+        os.mkdir(tmpdir)
+        os.chdir(tmpdir)
+    else:
+        olddir = ""
+        
+    compile_error_files = [ ]
+    run_error_files = [ ]
+    while True:
+        filename = queue.get()
+        if (filename == 'STOP'):
+            queue_ret.put((compile_error_files, run_error_files))
+            if is_windows:
                try:
-                    os.unlink(exe_name)
-                    os.unlink(obj_name)
+                    os.remove("test_static.obj")
+                    os.remove("/vc100.pdb")
+                    os.chdir("..")
+                    os.rmdir(tmpdir)
                except:
                    None
-        else:
-            # otherwise we'll use ispc_test + the LLVM JIT to run the test
-            bitcode_file = "%s.bc" % filename
-            compile_cmd = "ispc --woff --emit-llvm %s --target=%s -o %s" % \
-                (filename, options.target, bitcode_file)
-            if options.no_opt:
-                compile_cmd += " -O0"
-            test_cmd = "ispc_test %s" % bitcode_file
+                
+            sys.exit(0)

-            error_count += run_cmds([compile_cmd, test_cmd], filename, should_fail)
-
-            try:
-                os.unlink(bitcode_file)
-            except:
-                None
-
-        # If not for http://bugs.python.org/issue5261 on OSX, we'd like to do this:
-        #with finished_tests_counter_lock:
-            #update_progress(filename)
-        # but instead we do this...
-        finished_tests_mutex.lock(update_progress, filename)
+        (compile_error, run_error) = run_test(filename)
+        if compile_error != 0:
+            compile_error_files += [ filename ]
+        if run_error != 0:
+            run_error_files += [ filename ]

+        with finished_tests_counter_lock:
+            update_progress(filename)

 task_threads = []

@@ -214,16 +324,23 @@ def sigint(signum, frame):
    sys.exit(1)

 if __name__ == '__main__':
-    nthreads = multiprocessing.cpu_count()
    total_tests = len(files)
-    print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)
+
+    compile_error_files = [ ]
+    run_error_files = [ ]
+
+    nthreads = min(multiprocessing.cpu_count(), options.num_jobs)
+    sys.stdout.write("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests))

    # put each of the test filenames into a queue
    q = multiprocessing.Queue()
    for fn in files:
+        if is_windows:
+            fn = fn.replace("\\",'/')
        q.put(fn)
    for x in range(nthreads):
        q.put('STOP')
+    qret = multiprocessing.Queue()

    # need to catch sigint so that we can terminate all of the tasks if
    # we're interrupted
@@ -231,17 +348,30 @@ if __name__ == '__main__':

    # launch jobs to run tests
    for x in range(nthreads):
-        t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,))
+        t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,qret))
        task_threads.append(t)
        t.start()

    # wait for them to all finish and then return the number that failed
    # (i.e. return 0 if all is ok)
-    error_count = 0
    for t in task_threads:
        t.join()
-        error_count += t.exitcode
-    print
-    if error_count > 0:
-        print "%d / %d tests FAILED!" % (error_count, total_tests)
-    sys.exit(error_count)
+    sys.stdout.write("\n")
+
+    while not qret.empty():
+        (c, r) = qret.get()
+        compile_error_files += c
+        run_error_files += r
+
+    if len(compile_error_files) > 0:
+        compile_error_files.sort()
+        sys.stdout.write("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests))
+        for f in compile_error_files:
+            sys.stdout.write("\t%s\n" % f)
+    if len(run_error_files) > 0:
+        run_error_files.sort()
+        sys.stdout.write("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests))
+        for f in run_error_files:
+            sys.stdout.write("\t%s\n" % f)
+    # POSIX keeps only the low 8 bits of the status: cap it so 256 failures != success
+    sys.exit(min(len(compile_error_files) + len(run_error_files), 255))
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -1,95 +0,0 @@
-#!/bin/bash
-
-surprises=0
-verbose=false
-number=$(ls -1 tests/*.ispc|wc -l)
-counter=1
-target=sse4
-
-while getopts ":vt:h" opt;do
-    case $opt in
-        v) verbose=true
-            ;;
-        t) target=$OPTARG
-            ;;
-        h) cat <<EOF
-           usage: run_tests.sh [-v] [-t target] [filenames]
-                  -v           # verbose output
-                  -t           # specify compilation target (SSE4 is the default).
-                  [filenames]  # (optional) files to run through testing infrastructure
-                               # if none are provided, all in tests/ will be run.
-EOF
-            exit 1
-    esac
-done
-
-ISPC_ARCH=x86-64
-if [[ $OS == "Windows_NT" ]]; then
-  ISPC_ARCH=x86
-fi
-ISPC_ARGS="--target=$target --arch=$ISPC_ARCH -O2 --woff"
-
-shift $(( $OPTIND - 1 ))
-if [[ "$1" > 0 ]]; then
-    while [[ "$1" > 0 ]]; do
-        i=$1
-        shift
-        echo Running test $i
-
-        bc=${i%%ispc}bc
-        ispc $ISPC_ARGS $i -o $bc --emit-llvm
-        if [[ $? != 0 ]]; then
-            surprises=1
-            echo Test $i FAILED ispc compile
-            echo
-        else
-            ispc_test $bc
-            if [[ $? != 0 ]]; then
-                surprises=1
-                echo Test $i FAILED ispc_test
-                echo
-            fi
-        fi
-        /bin/rm -f $bc
-    done
-else
-    echo Running all correctness tests
-
-    for i in tests/*.ispc; do
-        if $verbose; then
-            echo -en "Running test $counter of $number.\r"
-        fi
-        (( counter++ ))
-        bc=${i%%ispc}bc
-        ispc $ISPC_ARGS $i -o $bc --emit-llvm 
-        if [[ $? != 0 ]]; then
-            surprises=1
-            echo Test $i FAILED ispc compile
-            echo
-        else
-            ispc_test $bc
-            if [[ $? != 0 ]]; then
-                surprises=1
-                echo Test $i FAILED ispc_test
-                echo
-            fi
-        fi
-        /bin/rm -f $bc
-    done
-
-    echo -e "\nRunning failing tests"
-    for i in failing_tests/*.ispc; do
-        (ispc -O2 $i -woff -o - --emit-llvm | ispc_test -) 2>/dev/null 1>/dev/null
-        if [[ $? == 0 ]]; then
-            surprises=1
-            echo Test $i UNEXPECTEDLY PASSED
-            echo
-        fi
-    done
-fi
-
-if [[ $surprises == 0 ]]; then
-    echo No surprises.
-fi
-
-exit $surprises
--- a/stdlib.ispc
+++ b/stdlib.ispc
--- a/stdlib2cpp.py
+++ b/stdlib2cpp.py
@@ -2,11 +2,17 @@

 import sys

-print "char stdlib_code[] = { "
+t=str(sys.argv[1])

-for line in sys.stdin:
-    for c in line:
-        print ord(c)
-        print ", "
+sys.stdout.write("char stdlib_" + t + "_code[] = {\n")

-print "0 };"
+width = 16
+data = sys.stdin.read()
+for i in range(0, len(data), 1):
+    sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))
+
+    if i%width == (width-1):
+        sys.stdout.write("\n")
+
+sys.stdout.write("0x00 };\n\n")
+                                    
--- a/Show More
+++ b/Show More