88 Commits

Author SHA1 Message Date
Matt Pharr
d65bf2eb2f Doxygen number bump and release notes for 1.1.3 2012-01-20 17:04:16 -08:00
Matt Pharr
1bba9d4307 Improve atomic_swap_global() to take advantage of associativity.
We now do a single atomic hardware swap and then effectively perform
swaps between the running program instances, such that the result
is the same as if they had each executed a hardware swap in some
particular order.

Also cleaned up __atomic_swap_uniform_* built-in implementations
to not take the mask, which they weren't using anyway.

Finishes Issue #56.
2012-01-20 10:37:33 -08:00
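
To illustrate the atomic_swap change above: if the program instances' swaps
were serialized in lane order, lane 0 would get the original memory contents,
each later lane would get the previous lane's value, and memory would end up
holding the last lane's value.  A single hardware swap plus a cross-lane
shift reproduces exactly that (a C sketch with hypothetical names; mask
handling omitted):

    #include <stdint.h>
    #define N 8   /* gang size, for example */

    /* stand-in for the one real hardware atomic swap */
    extern int64_t hw_atomic_swap(int64_t *ptr, int64_t value);

    void gang_atomic_swap(int64_t *ptr, const int64_t val[N],
                          int64_t result[N]) {
        int64_t orig = hw_atomic_swap(ptr, val[N - 1]);  /* one real swap */
        result[0] = orig;             /* lane 0 sees the original value */
        for (int i = 1; i < N; ++i)
            result[i] = val[i - 1];   /* lane i sees lane i-1's value */
    }
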
Matt Pharr
4388338dad Fix performance regression introduced in be0c77d556
Effectively, the patterns that detected cases where a gather or
scatter in base+offsets form had offsets that were actually a
multiple of 2/4/8 were no longer working.

This change not only fixes this, but also expands the set of
patterns that are matched by this.  For example, given offsets of
the form 4*v1 + 16*v2, it identifies a scale of 4 and new offsets
of v1 + 4*v2.

This fix makes the volume renderer run 1.19x faster, and noise 1.54x
faster.
2012-01-19 17:57:59 -08:00
Matt Pharr
2fb59c90cf Fix C++ backend bug introduced in d14a2de168.
(This was causing a number of tests to fail with the generic
targets.)
2012-01-19 11:35:02 -07:00
Matt Pharr
68f6ea8def For << and >> with the C++ backend, detect when all instances are shifting by the same amount.
In this case, we now emit calls to potentially-specialized functions for the
left/right shifts that take a single integer value for the shift amount.  These
in turn can be matched to the corresponding intrinsics for the SSE target.

Issue #145.
2012-01-19 10:04:32 -07:00
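
A minimal ispc-level sketch of the case this detects (function and variable
names invented):

    // 'amt' is uniform, so every program instance shifts by the same
    // count; the backend can call a shift function that takes a single
    // scalar amount, which sse4.h can map to one SSE intrinsic.
    int shift_down(int x, uniform int amt) {   // x is varying
        return x >> amt;
    }
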
Matt Pharr
3f89295d10 Update RNG code in stdlib to use -> operator where appropriate. 2012-01-19 10:02:47 -07:00
Matt Pharr
748b292e77 Improve code for uniform switches with a 'break' under varying control flow.
Previously, when we had a switch statement with a uniform switch condition
but a 'break' statement that was under varying control flow inside the
switch, we'd promote the switch condition to be varying so that the
break would work correctly.

Now, we leave the condition as uniform and are thus able to use the
more-efficient LLVM switch instruction in this case.

Issue #156.
2012-01-19 08:41:19 -07:00
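
A small ispc sketch of the case in question (names invented):

    void f(uniform int sel, int x) {   // x is varying
        switch (sel) {                 // uniform: emitted as LLVM 'switch'
        case 0:
            if (x > 0)                 // varying control flow
                break;                 // only some instances leave early
            x = -x;
            break;
        default:
            x = 0;
        }
    }
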
Matt Pharr
6451c3d99d Fix bug with code for initializers for static arrays in generated C++ code.
(This was preventing aobench from compiling successfully with the generic
target.)
2012-01-18 16:55:09 -07:00
Matt Pharr
d14a2de168 Fix generic code emission when building with LLVM3.0/2.9.
Specifically, don't use vector select for masked store blend there,
but instead emit calls to undefined __masked_store_blend_*() functions.

Added implementations of these functions to the sse4.h and generic-16.h
files in examples/intrinsics.  (Calls to these will never be generated
with LLVM 3.1.)
2012-01-17 23:42:22 -07:00
Matt Pharr
642150095d Include LLVM version used to build in version info printed out. 2012-01-17 23:42:22 -07:00
Matt Pharr
3bf3ac7922 Be more conservative about using blending in place of masked store.
More specifically, we do a proper masked store (rather than a load-
blend-store) unless we can determine that we're accessing a stack-allocated
"varying" variable.  This fixes a number of nefarious bugs where given
code like:

    uniform float a[21];
    foreach (i = 0 ... 21)
        a[i] = 0;

we'd use a blend and in turn read past the end of a[] in the last
iteration.

Also made slight changes to inlining in aobench; with these changes
in place, compiles take ~5s, versus ~45s without them.

Fixes issue #160.
2012-01-17 23:42:22 -07:00
Matt Pharr
c6d1cebad4 Update masked_load/store implementations for generic targets to take void *s
(Fixes compile errors when we try to actually use these!)
2012-01-17 23:42:22 -07:00
Matt Pharr
08189ce08c Update "inline" qualifiers in a few examples. 2012-01-17 23:42:22 -07:00
Matt Pharr
7013d7d52f Small documentation updates and cleanups 2012-01-17 23:42:21 -07:00
Matt Pharr
7045b76f84 Improvements to code generation for "foreach"
Specialize the code for the innermost loop so that, for the iterations
where we are certainly working on a full vector's worth of data, no
masking computations are done for the innermost dimension.

This fix improves performance/code quality of "foreach" such that
it's essentially the same as the equivalent "for" loop.

Fixes issue #151.
2012-01-17 11:34:00 -08:00
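
For instance, in a sketch like the following (names invented), every
iteration except the ragged final one can now run without mask computations
in the innermost dimension:

    void scale(uniform float a[], uniform int count) {
        foreach (i = 0 ... count)
            a[i] *= 2.;
    }
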
Matt Pharr
58a0b4a20d Add separate set of builtins for AVX2.
(i.e., stop just reusing the ones for AVX1).

For now the only difference is that the int/uint min/max
functions call the new intrinsic for that.  Once gather is
available from LLVM, that will go here as well.
2012-01-13 14:40:01 -08:00
Matt Pharr
0f8eee9809 Fix cases in optimization code to not inadvertently match calls to func ptrs.
If we call a function pointer, CallInst::getCalledFunction() returns NULL; we
need to be careful about this case when we're matching various function calls
in optimization passes.

(Fixes a crash.)
2012-01-12 10:33:06 -08:00
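
The defensive pattern looks roughly like this (a sketch against the LLVM
API; the helper name and the matched builtin name are illustrative):

    // LLVM headers for CallInst/Function assumed to be included.
    static bool lIsPseudoGather(llvm::CallInst *callInst) {
        // getCalledFunction() returns NULL for an indirect call through
        // a function pointer; bail out before matching by name.
        llvm::Function *callee = callInst->getCalledFunction();
        if (callee == NULL)
            return false;
        return callee->getName() == "__pseudo_gather_32";
    }
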
Matt Pharr
0740299860 Fix switch test 2012-01-12 09:45:31 -08:00
Matt Pharr
652215861e Update dynamic target dispatch code to support AVX2. 2012-01-12 08:37:18 -08:00
Matt Pharr
602209e5a8 Tiny updates to documentation, comment for switch stuff. 2012-01-12 05:55:42 -08:00
Matt Pharr
b60f8b4f70 Fix merge conflicts 2012-01-11 17:13:51 -08:00
Matt Pharr
b67446d998 Add support for "switch" statements.
Switches with both uniform and varying "switch" expressions are
supported.  Switch statements with varying expressions and very
large numbers of labels may not perform well; some issues to be
filed shortly will track opportunities for improving these.
2012-01-11 09:16:31 -08:00
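
For example, in ispc (a sketch; 'x' is varying by default):

    int classify(int x) {
        switch (x & 3) {       // varying switch expression
        case 0:  return 0;
        case 1:
        case 2:  return 1;
        default: return 2;
        }
    }
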
Matt Pharr
9670ab0887 Add missing cases to watch out for in lCheckAllOffSafety()
Previously, we weren't checking for member expressions that dereferenced
a pointer or pointer dereference expressions--only array indexing!
2012-01-11 09:16:31 -08:00
Matt Pharr
0223bb85ee Fix bug in StmtList::EmitCode()
Previously, we would return immediately if the current basic block
was NULL; however, this is the wrong thing to do in that goto labels
and case/default labels in switch statements will establish a new
current basic block even if the current one is NULL.
2012-01-11 09:14:39 -08:00
Jean-Luc Duprat
fd81255db1 Removed mutex support for OSX 10.5
Allow running from the build directory even if it is not on the PATH
Properly decode subprocess stdout/stderr as UTF-8
Added newlines that were mistakenly left out of the print->sys.stdout.write() conversion in the previous CL
Python 3:
 - fixed error message comparison
 - explicit list creation
Windows:
 - forward/back slash annoyances
 - added stdint.h with definitions for int32_t, int64_t
 - compile_error_files and run_error_files were being appended to improperly
2012-01-10 16:55:00 -08:00
Matt Pharr
8a8e1a7f73 Fix bug with multiple EmitCode() calls due to missing braces.
In short, we were inadvertently trying to emit each function's
code a second time if the function had a mask check at the start
of it.  StmtList::EmitCode() was covering this error up by
not emitting code if the current basic block is NULL.
2012-01-10 16:50:13 -08:00
Jean-Luc Duprat
ef05fbf424 run_tests.py more compatible with python 3.x
except for the mutex class...
2012-01-10 13:12:38 -08:00
Jean-Luc Duprat
fa01b63fa5 Remove assumption that . is in the PATH in run_tests.py 2012-01-10 11:41:08 -08:00
Jean-Luc Duprat
63d3d25030 Fixed off-by-one error in array size generated by bitcode2cpp.py 2012-01-10 11:22:13 -08:00
Jean-Luc Duprat
a8db866228 Python build compatible on both python 2 and 3 2012-01-10 10:42:15 -08:00
Jean-Luc Duprat
0519eea951 Makefile does not hardcode link paths on Linux
Link statically for both x86 and x86-64
2012-01-10 10:34:57 -08:00
Matt Pharr
f4653ecd11 Release notes for 1.1.2 and doxygen version number bump 2012-01-09 16:05:40 -08:00
Jean-Luc Duprat
5d67252ed0 Python scripts now compatible with both 2.x and 3.x releases of python 2012-01-09 13:56:05 -08:00
Matt Pharr
5134de71c0 Fix Windows build (inttypes.h not available) 2012-01-09 09:05:20 -08:00
Matt Pharr
2be1251c70 Fix Makefile on OSX (uname -o not supported) 2012-01-09 07:40:47 -08:00
Matt Pharr
c0161aa17f Merge pull request #154 from palacaze/mingw
Mingw support
2012-01-09 07:37:02 -08:00
Pierre-Antoine Lacaze
b683aa11b1 Fix linking under mingw; libdl is Linux-only. 2012-01-09 10:52:46 +01:00
Pierre-Antoine Lacaze
2654bb0112 Handle python installations in non-standard locations. 2012-01-09 10:29:54 +01:00
Pierre-Antoine Lacaze
d8728104b4 Handle the case whereby BUILD_DATE is already defined. 2012-01-09 10:29:16 +01:00
Pierre-Antoine Lacaze
0be1b70fba Mingw has strtoull, make use of it. 2012-01-09 10:28:52 +01:00
Pierre-Antoine Lacaze
a0e9793de3 Shut up warning wrt CONSOLE_SCREEN_BUFFER_INFO initialization 2012-01-09 10:19:46 +01:00
Pierre-Antoine Lacaze
da9200fcee Fix alloca use on mingw. 2012-01-09 10:19:09 +01:00
Pierre-Antoine Lacaze
54e8e8022b suppress warnings about long long arguments 2012-01-09 10:18:39 +01:00
Pierre-Antoine Lacaze
d84cf781da Mingw does not have sysconf; use the MSC way of finding processors. 2012-01-09 09:45:40 +01:00
Pierre-Antoine Lacaze
002f27a30f Implement vasprintf and asprintf for platforms lacking them. 2012-01-09 09:44:58 +01:00
Matt Pharr
86d88e9773 run_tests.py: fix to use multiple cores on windows, ignore non-ispc inputs 2012-01-08 15:29:20 -08:00
Matt Pharr
fda00afe6e Rename .txt files in docs to .rst (which is what they actually are). 2012-01-08 14:11:04 -08:00
Matt Pharr
be0c77d556 Detect more gather/scatter cases that are actually base+offsets.
We now recognize patterns like (ptr + offset1 + offset2) as being
cases we can handle with the base_offsets variants of the gather/scatter
functions.  (This can come up with multidimensional array indexing,
for example.)

Issue #150.
2012-01-08 14:06:44 -08:00
Matt Pharr
0ed11a7832 Compute SizeOf() and OffsetOf() at compile time in more cases with the generic target.
Really, we only have to be careful about the case where there is a vector of bools 
(i.e. a mask) involved, since the size of that isn't known at compile-time.
(Currently, at least.)
2012-01-08 14:06:44 -08:00
Matt Pharr
ff6971fb15 Use Assert() rather than assert() 2012-01-08 14:06:44 -08:00
Matt Pharr
5b4dbc8167 Fix build of aobench_instrumented example on OSX/Linux 2012-01-08 10:02:43 -08:00
Jean-Luc Duprat
59f4c9985e Python files compatible with python 3 2012-01-06 16:56:09 -08:00
Matt Pharr
8da9be1a09 Add support for 'k', 'M', and 'G' suffixes to integer constants.
(Denoting units of 1024, 1024*1024, and 1024*1024*1024, respectively.)

Issue #128.
2012-01-06 14:47:47 -08:00
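
For example (variable names invented):

    uniform int bufsize = 64k;   // 64 * 1024
    uniform int cache = 16M;     // 16 * 1024 * 1024
    uniform int limit = 1G;      // 1024 * 1024 * 1024
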
Matt Pharr
11033e108e Fix bug that prohibited assignments with pointer expressions on the LHS
Previously, given code like "*(ptr+1) = foo", we would claim that the LHS
was invalid for an assignment expression.

Issue #138.
2012-01-06 14:21:03 -08:00
Matt Pharr
4f97262cf2 Support function declarations in the definitions of other functions.
As part of this, function declarations are no longer scoped (this is permitted
by the C standard, as it turns out).  So code like:

   void foo() { void bar(); }
   void bat() { bar(); }

compiles correctly; the declaration of bar() in foo() is still available in the
definition of bat().

Fixes issue #129.
2012-01-06 13:50:10 -08:00
Matt Pharr
9b68b9087a Fix crash with anonymous function parameters in function definitions.
Issue #135.
2012-01-06 13:28:06 -08:00
Matt Pharr
15cc812e37 Add notion of "unbound" variability to the type system.
Now, when a type is declared without an explicit "uniform" or "varying"
qualifier, its variability is unbound; depending on the context of the
declaration, the variability is later finalized.

Currently, in almost all cases, types with unbound variability are
resolved to varying types; the one exception is typecasts like:
"(int)1"; in this case, the fact that (int) has unbound variability
carries through to the TypeCastExpr, which in turn notices that the
expression being type cast has uniform type and in turn will resolve
(int) to (uniform int).

Fixes issue #127.
2012-01-06 11:52:58 -08:00
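
A short sketch of the two resolutions (variable names invented):

    uniform int a = (int)1;   // '(int)' has unbound variability; the
                              // operand is uniform, so the cast resolves
                              // to '(uniform int)'
    int b = 0;                // no qualifier and no such context:
                              // resolves to 'varying int'
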
Matt Pharr
71317e6aa6 Fix bug in gather/scatter optimization passes.
When flattening chains of insertelement instructions, we didn't
handle the case where the initial insertelement was to a constant
vector (with one value set and the other values undef).

Also generalized the "do all of the instances access the same location"
check to handle the case where some of them are accessing undef
locations; these are ignored in this check, as they should correspond to the
mask being off for that lane anyway.

Fixes issue #149.
2012-01-06 09:19:18 -08:00
Matt Pharr
1abaaee73e Fix bug where we'd sometimes inadvertently lose cv-qualifiers on pointers.
Fixes issue #142.
2012-01-06 08:41:01 -08:00
Matt Pharr
78c6d3c02f Add initial support for 'goto' statements.
ispc now supports goto, but only under uniform control flow--i.e.
it must be possible for the compiler to statically determine that
all program instances will follow the goto.  An error is issued at
compile time if a goto is used when this is not the case.
2012-01-05 12:22:36 -08:00
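
For example, this sketch is accepted, since the branch condition is uniform
(names invented):

    void count_up(uniform int n) {
        uniform int i = 0;
      again:
        ++i;
        if (i < n)        // uniform test: all instances branch together
            goto again;   // ok; with a varying test this would be an error
    }
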
Matt Pharr
48e9d4af39 Emit code for #includes in emitted C++ code all at the start of the file. 2012-01-05 12:22:35 -08:00
Matt Pharr
cb7ad371c6 Run tests using -O2.
Disconcertingly, this seems to fix some gcc-only crashes with the
generic-16 target (specifically, for half.ispc and for goto-[23].ispc;
those tests run fine with other compilers with generic-16).
2012-01-05 12:22:35 -08:00
Matt Pharr
2951589825 Redo readme in better-looking rst form 2012-01-04 15:32:29 -08:00
Matt Pharr
f23dc5366a Updates to run_tests.py script.
Add support for the generic targets (using the headers in examples/intrinsics
if none is provided).

Provide option to run valgrind on the compiled code.

Print a list of all failing tests at the end.
2012-01-04 12:59:03 -08:00
Matt Pharr
e3341176c5 Redo makefiles for the examples.
They're all based off a common examples/common.mk file, so that individual
makefiles are quite simple now.

The common.mk file also provides targets to build the examples using C++
output with the generic-16.h or sse4.h files.  These targets don't run by
default, but do run if 'make all' is run.
2012-01-04 12:59:03 -08:00
Matt Pharr
8938e14442 Add support for emitting ~generic vectorized C++ code.
The compiler now supports an --emit-c++ option, which generates generic
vector C++ code.  To actually compile this code, the user must provide
C++ code that implements a variety of types and operations (e.g. adding
two floating-point vector values together, comparing them, etc).

There are two examples of this required code in examples/intrinsics:
generic-16.h is a "generic" 16-wide implementation that does everything
required with scalar math; it's useful for demonstrating the requirements
of the implementation.  Then, sse4.h shows a simple implementation of an SSE4
target that maps the emitted function calls to SSE intrinsics.

When using these example implementations with the ispc test suite,
all but one or two tests pass with gcc and clang on Linux and OSX.
There are currently ~10 failures with icc on Linux, and ~50 failures with
MSVC 2010.  (To be fixed in coming days.)

Performance varies: when running the examples through the sse4.h
target, some (e.g. options) have the same performance as when compiled
with --target=sse4 from ispc directly, while noise is 12% slower, rt is
26% slower, and aobench is 2.2x slower.  The details of this haven't yet
been carefully investigated, but will be in coming days as well.

Issue #92.
2012-01-04 12:59:03 -08:00
Matt Pharr
4151778f5e Modify SizeOf() and StructOffset() to not compute value based on target for generic targets.
Specifically, we want to be able to late-bind on whether the mask is i32s or i1s, so if there's
any chance of ambiguity, we emit code that does the "GEP from a NULL base pointer" trick to
compute the value later in compilation.
2012-01-04 12:59:03 -08:00
Matt Pharr
23b85cd88d Remove broken debugging code. 2012-01-04 12:59:03 -08:00
Matt Pharr
234e5cd3e1 Use vector select for masked store blend if building with LLVM3.1 2012-01-04 12:59:03 -08:00
Matt Pharr
f75c94a8f1 Have aos/soa and broadcast/shuffle/rotate functions provided by the target.
The SSE/AVX targets use the old versions from util.m4, but these functions are
now passed through to the generic targets.
2012-01-04 12:59:03 -08:00
Matt Pharr
848a432640 Fix various small things that were broken with single-bit-per-lane masks.
Also small cleanups to declarations, "no captures" added, etc.
2012-01-04 12:59:03 -08:00
Matt Pharr
dea13979e0 Fix bug in lIs248Splat() in opt.cpp 2012-01-04 11:55:02 -08:00
Matt Pharr
052d34bf5b Various cleanups to optimization code.
Stop using the PassManagerBuilder and instead add all of the passes directly
in code here.  This currently leads to no change in behavior, but was useful
when experimenting with disabling the SROA pass when compiling to generic
targets.
2012-01-04 11:54:44 -08:00
Matt Pharr
d4c5e82896 Add VSelMovMsk optimization pass.
Various peephole improvements to vector select instructions.
2012-01-04 11:52:27 -08:00
Matt Pharr
562d61caff Added masked load optimization pass.
This pass handles the "all on" and "all off" mask cases appropriately.

Also renamed load_masked stuff in built-ins to masked_load for consistency with
masked_store.
2012-01-04 11:51:26 -08:00
Matt Pharr
75f18c7c66 Add buildispc.bat script for just building the compiler on windows. 2012-01-04 11:44:19 -08:00
Matt Pharr
5d35349dc9 We were (unintentionally) only using structural equivalence to compare struct types.
Now we require that the struct name match for two struct types to be the same.
Added a test to check this.
(Also removed a stale test, movmsk-opt.ispc)
2012-01-04 11:44:00 -08:00
Matt Pharr
1a81173c93 Fix examples/options Makefile to use -O3 for serial builds.
Amazingly, it has been using just -g since the initial commit. :-(
2012-01-03 19:53:45 -08:00
Matt Pharr
1d9201fe3d Add "generic" 4, 8, and 16-wide targets.
When used, these targets end up with calls to undefined functions for all
of the various special vector stuff ispc needs to compile ispc programs
(masked store, gather, min/max, sqrt, etc.).

These targets are not yet useful for anything, but are a step toward
having an option to emit C++ code with calls out to intrinsics.

Reorganized the directory structure a bit and put the LLVM bitcode used
to define target-specific stuff (as well as some generic built-ins stuff)
into a builtins/ directory.

Note that for building on Windows, it's now necessary to set a LLVM_VERSION
environment variable (with values like LLVM_2_9, LLVM_3_0, LLVM_3_1svn, etc.)
2011-12-19 13:46:50 -08:00
Matt Pharr
6dbb15027a Take advantage of x86's free "scale by 2, 4, or 8" in addressing calculations
When loading from an address that's computed by adding two registers
together, x86 can scale one of them by 2, 4, or 8, for free as part
of the addressing calculation.  This change makes the code generated
for gather and scatter use this.

For the cases where gather/scatter is based on a base pointer and
an integer offset vector, the GSImprovementsPass looks to see if the
integer offsets are being computed as 2/4/8 times some other value.
If so, it extracts the 2x/4x/8x part and leaves the rest there as
the offsets.  The {gather,scatter}_base_offsets_* functions now take
an i32 scale factor, and they carefully generate IR so that it hits
LLVM's pattern matching for these scales.

This is a particular win on AVX, since it saves us two 4-wide integer
multiplies.

Noise runs 14% faster with this.
Issue #132.
2011-12-16 15:55:44 -08:00
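
The scalar analogue in C (a sketch; the assembly shown is typical x86-64
compiler output):

    /* The 4-byte scale in base[idx] folds into the addressing mode,
       e.g.  movss (%rdi,%rsi,4), %xmm0  -- no separate multiply. */
    float load_elem(const float *base, long idx) {
        return base[idx];   /* address = base + 4*idx */
    }
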
Matt Pharr
f23d030e43 Transition EstimateCost() AST traversal to WalkAST() as well. 2011-12-16 12:24:51 -08:00
Matt Pharr
701334ccf2 Transition type checking to use WalkAST() infrastructure. 2011-12-16 12:24:51 -08:00
Matt Pharr
f48a662ed3 Rewrite AST optimization infrastructure to be built on top of WalkAST().
Specifically, stmts and exprs are no longer responsible for first recursively
optimizing their children before doing their own optimization (this turned
out to be error-prone, with children sometimes being forgotten).  They are
now just responsible for their own optimization, when appropriate.
2011-12-16 12:24:51 -08:00
Matt Pharr
ced3f1f5fc Have WalkAST postorder callback function return an ASTNode *
In general, it should just return the original node pointer, but for type checking
and optimization passes, it can return a new value for the node (that will be
assigned where the old one was in the tree).

Along the way, fixed some bugs in WalkAST() where the postorder callback wouldn't
end up being called for a few expr types (sizeof, dereference, address of, 
reference).
2011-12-16 12:24:51 -08:00
Matt Pharr
018aa96c8b Remove old code for checking for break/continue under varying control flow. 2011-12-16 12:24:51 -08:00
Matt Pharr
34eda04d9b Rewrite check for loops for break/continue under varying CF to use WalkAST() 2011-12-16 12:24:51 -08:00
Matt Pharr
45767ad197 Remove no longer needed lSafeToRunWithAllLanesOff utility functions. 2011-12-16 12:24:51 -08:00
Matt Pharr
f9463af75b Add WalkAST() function for generic AST walking.
For starters, use it for the check to see if code is safe to run with the
mask all off.

This also fixes a bug where we would sometimes incorrectly say that
a whole block of code was unsafe to run with an all-off mask because we came
to a NULL AST node during traversal.
2011-12-16 12:24:51 -08:00
125 changed files with 17314 additions and 3469 deletions

Makefile

@@ -3,6 +3,11 @@
#
ARCH_OS = $(shell uname)
ifeq ($(ARCH_OS), Darwin)
ARCH_OS2 = "OSX"
else
ARCH_OS2 = $(shell uname -o)
endif
ARCH_TYPE = $(shell arch)
ifeq ($(shell llvm-config --version), 3.1svn)
@@ -26,7 +31,15 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
-lpthread -ldl
-lpthread
ifeq ($(ARCH_OS),Linux)
ISPC_LIBS += -ldl
endif
ifeq ($(ARCH_OS2),Msys)
ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
endif
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
@@ -45,11 +58,7 @@ LDFLAGS=
ifeq ($(ARCH_OS),Linux)
# try to link everything statically under Linux (including libstdc++) so
# that the binaries we generate will be portable across distributions...
ifeq ($(ARCH_TYPE),x86_64)
LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
else
LDFLAGS=-L/usr/lib/gcc/i686-redhat-linux/4.6.0
endif
LDFLAGS=-static
endif
LEX=flex
@@ -57,19 +66,23 @@ YACC=bison -d -v -t
###########################################################################
CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
util.cpp
CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
ispc.cpp llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp \
type.cpp util.cpp
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \
builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
generic-16
BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
builtins/dispatch.ll
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
builtins-c-32.cpp builtins-c-64.cpp
BISON_SRC=parse.yy
FLEX_SRC=lex.ll
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
$(FLEX_SRC:.ll=.o))
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
stdlib_generic_ispc.o stdlib_x86_ispc.o \
$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
default: ispc
@@ -104,6 +117,14 @@ objs/%.o: %.cpp
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/cbackend.o: cbackend.cpp
@echo Compiling $<
@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
objs/%.o: objs/%.cpp
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/parse.cc: parse.yy
@echo Running bison on $<
@$(YACC) -o $@ $<
@@ -120,41 +141,24 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-%.cpp: builtins-%.ll
@echo Creating C++ source from builtin definitions file $<
@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
objs/builtins-%.o: objs/builtins-%.cpp
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-c-32.cpp: builtins-c.c
objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $<
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@
@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | python bitcode2cpp.py $< > $@
objs/builtins-c-32.o: objs/builtins-c-32.cpp
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-c-64.cpp: builtins-c.c
objs/builtins-c-32.cpp: builtins/builtins.c
@echo Creating C++ source from builtins definition file $<
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-32 > $@
objs/builtins-c-64.o: objs/builtins-c-64.cpp
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-c-64.cpp: builtins/builtins.c
@echo Creating C++ source from builtins definition file $<
@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-64 > $@
objs/stdlib_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $<
@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@
objs/stdlib_generic_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for generic
@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
python stdlib2cpp.py generic > $@
objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll
objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll
objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll
objs/builtins-sse4-x2.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4-x2.ll
objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll
objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll
objs/stdlib_x86_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for x86
@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
python stdlib2cpp.py x86 > $@

README.rst (new file, 90 lines)

@@ -0,0 +1,90 @@
==============================
Intel(r) SPMD Program Compiler
==============================
``ispc`` is a compiler for a variant of the C programming language, with
extensions for `single program, multiple data
<http://en.wikipedia.org/wiki/SPMD>`_ programming. Under the SPMD model,
the programmer writes a program that generally appears to be a regular
serial program, though the execution model is actually that a number of
*program instances* execute in parallel on the hardware.
Overview
--------
``ispc`` compiles a C-based SPMD programming language to run on the SIMD
units of CPUs; it frequently provides a 3x or more speedup on CPUs with
4-wide vector SSE units and 5x-6x on CPUs with 8-wide AVX vector units,
without any of the difficulty of writing intrinsics code. Parallelization
across multiple cores is also supported by ``ispc``, making it
possible to write programs that achieve performance improvement that scales
by both number of cores and vector unit size.
There are a few key principles in the design of ``ispc``:
* To build a small set of extensions to the C language that
would deliver excellent performance to performance-oriented
programmers who want to run SPMD programs on the CPU.
* To provide a thin abstraction layer between the programmer
and the hardware--in particular, to have an execution and
data model where the programmer can cleanly reason about the
mapping of their source program to compiled assembly language
and the underlying hardware.
* To make it possible to harness the computational power of SIMD
vector units without the extremely low-programmer-productivity
activity of directly writing intrinsics.
* To explore opportunities from close coupling between C/C++
application code and SPMD ``ispc`` code running on the
same processor--to have lightweight function calls between
the two languages and to share data directly via pointers without
copying or reformatting.
``ispc`` is an open source compiler with the BSD license. It uses the
remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
code generation and optimization and is `hosted on
github <http://github.com/ispc/ispc/>`_. It supports Windows, Mac, and
Linux, with both x86 and x86-64 targets. It currently supports the SSE2,
SSE4, and AVX instruction sets.
Features
--------
``ispc`` provides a number of key features to developers:
* Familiarity as an extension of the C programming
language: ``ispc`` supports familiar C syntax and
programming idioms, while adding the ability to write SPMD
programs.
* High-quality SIMD code generation: the performance
of code generated by ``ispc`` is often close to that of
hand-written intrinsics code.
* Ease of adoption with existing software
systems: functions written in ``ispc`` directly
interoperate with application functions written in C/C++ and
with application data structures.
* Portability across over a decade of CPU
generations: ``ispc`` has targets for SSE2, SSE4, AVX
(and soon, AVX2).
* Portability across operating systems: Microsoft
Windows, Mac OS X, and Linux are all supported
by ``ispc``.
* Debugging with standard tools: ``ispc``
programs can be debugged with standard debuggers (OS X and
Linux only).
Additional Resources
--------------------
Prebuilt ``ispc`` binaries for Windows, OS X and Linux can be downloaded
from the `ispc downloads page <http://ispc.github.com/downloads.html>`_.
See also additional
`documentation <http://ispc.github.com/documentation.html>`_ and additional
`performance information <http://ispc.github.com/perf.html>`_.


@@ -1,22 +0,0 @@
==============================
Intel(r) SPMD Program Compiler
==============================
Welcome to the Intel(r) SPMD Program Compiler (ispc)!
ispc is a new compiler for "single program, multiple data" (SPMD)
programs. Under the SPMD model, the programmer writes a program that mostly
appears to be a regular serial program, though the execution model is
actually that a number of program instances execute in parallel on the
hardware. ispc compiles a C-based SPMD programming language to run on the
SIMD units of CPUs; it frequently provides a 3x or more speedup on CPUs
with 4-wide SSE units, without any of the difficulty of writing intrinsics
code.
ispc is an open source compiler under the BSD license; see the file
LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and
x86-64 targets. It currently supports the SSE2, SSE4, and AVX instruction
sets.
For more information and examples, as well as a wiki and the bug database,
see the ispc distribution site, http://ispc.github.com.

ast.cpp (242 changed lines)

@@ -36,8 +36,11 @@
*/
#include "ast.h"
#include "expr.h"
#include "func.h"
#include "stmt.h"
#include "sym.h"
#include "util.h"
///////////////////////////////////////////////////////////////////////////
// ASTNode
@@ -63,3 +66,242 @@ AST::GenerateIR() {
functions[i]->GenerateIR();
}
///////////////////////////////////////////////////////////////////////////
ASTNode *
WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
void *data) {
if (node == NULL)
return node;
// Call the callback function
if (preFunc != NULL) {
if (preFunc(node, data) == false)
// The function asked us to not continue recursively, so stop.
return node;
}
////////////////////////////////////////////////////////////////////////////
// Handle Statements
if (dynamic_cast<Stmt *>(node) != NULL) {
ExprStmt *es;
DeclStmt *ds;
IfStmt *is;
DoStmt *dos;
ForStmt *fs;
ForeachStmt *fes;
CaseStmt *cs;
DefaultStmt *defs;
SwitchStmt *ss;
ReturnStmt *rs;
LabeledStmt *ls;
StmtList *sl;
PrintStmt *ps;
AssertStmt *as;
if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
else if ((ds = dynamic_cast<DeclStmt *>(node)) != NULL) {
for (unsigned int i = 0; i < ds->vars.size(); ++i)
ds->vars[i].init = (Expr *)WalkAST(ds->vars[i].init, preFunc,
postFunc, data);
}
else if ((is = dynamic_cast<IfStmt *>(node)) != NULL) {
is->test = (Expr *)WalkAST(is->test, preFunc, postFunc, data);
is->trueStmts = (Stmt *)WalkAST(is->trueStmts, preFunc,
postFunc, data);
is->falseStmts = (Stmt *)WalkAST(is->falseStmts, preFunc,
postFunc, data);
}
else if ((dos = dynamic_cast<DoStmt *>(node)) != NULL) {
dos->testExpr = (Expr *)WalkAST(dos->testExpr, preFunc,
postFunc, data);
dos->bodyStmts = (Stmt *)WalkAST(dos->bodyStmts, preFunc,
postFunc, data);
}
else if ((fs = dynamic_cast<ForStmt *>(node)) != NULL) {
fs->init = (Stmt *)WalkAST(fs->init, preFunc, postFunc, data);
fs->test = (Expr *)WalkAST(fs->test, preFunc, postFunc, data);
fs->step = (Stmt *)WalkAST(fs->step, preFunc, postFunc, data);
fs->stmts = (Stmt *)WalkAST(fs->stmts, preFunc, postFunc, data);
}
else if ((fes = dynamic_cast<ForeachStmt *>(node)) != NULL) {
for (unsigned int i = 0; i < fes->startExprs.size(); ++i)
fes->startExprs[i] = (Expr *)WalkAST(fes->startExprs[i], preFunc,
postFunc, data);
for (unsigned int i = 0; i < fes->endExprs.size(); ++i)
fes->endExprs[i] = (Expr *)WalkAST(fes->endExprs[i], preFunc,
postFunc, data);
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
}
else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
else if ((ss = dynamic_cast<SwitchStmt *>(node)) != NULL) {
ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
}
else if (dynamic_cast<BreakStmt *>(node) != NULL ||
dynamic_cast<ContinueStmt *>(node) != NULL ||
dynamic_cast<GotoStmt *>(node) != NULL) {
// nothing
}
else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data);
else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
std::vector<Stmt *> &sls = sl->stmts;
for (unsigned int i = 0; i < sls.size(); ++i)
sls[i] = (Stmt *)WalkAST(sls[i], preFunc, postFunc, data);
}
else if ((ps = dynamic_cast<PrintStmt *>(node)) != NULL)
ps->values = (Expr *)WalkAST(ps->values, preFunc, postFunc, data);
else if ((as = dynamic_cast<AssertStmt *>(node)) != NULL)
as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
else
FATAL("Unhandled statement type in WalkAST()");
}
else {
///////////////////////////////////////////////////////////////////////////
// Handle expressions
Assert(dynamic_cast<Expr *>(node) != NULL);
UnaryExpr *ue;
BinaryExpr *be;
AssignExpr *ae;
SelectExpr *se;
ExprList *el;
FunctionCallExpr *fce;
IndexExpr *ie;
MemberExpr *me;
TypeCastExpr *tce;
ReferenceExpr *re;
DereferenceExpr *dre;
SizeOfExpr *soe;
AddressOfExpr *aoe;
if ((ue = dynamic_cast<UnaryExpr *>(node)) != NULL)
ue->expr = (Expr *)WalkAST(ue->expr, preFunc, postFunc, data);
else if ((be = dynamic_cast<BinaryExpr *>(node)) != NULL) {
be->arg0 = (Expr *)WalkAST(be->arg0, preFunc, postFunc, data);
be->arg1 = (Expr *)WalkAST(be->arg1, preFunc, postFunc, data);
}
else if ((ae = dynamic_cast<AssignExpr *>(node)) != NULL) {
ae->lvalue = (Expr *)WalkAST(ae->lvalue, preFunc, postFunc, data);
ae->rvalue = (Expr *)WalkAST(ae->rvalue, preFunc, postFunc, data);
}
else if ((se = dynamic_cast<SelectExpr *>(node)) != NULL) {
se->test = (Expr *)WalkAST(se->test, preFunc, postFunc, data);
se->expr1 = (Expr *)WalkAST(se->expr1, preFunc, postFunc, data);
se->expr2 = (Expr *)WalkAST(se->expr2, preFunc, postFunc, data);
}
else if ((el = dynamic_cast<ExprList *>(node)) != NULL) {
for (unsigned int i = 0; i < el->exprs.size(); ++i)
el->exprs[i] = (Expr *)WalkAST(el->exprs[i], preFunc,
postFunc, data);
}
else if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
fce->func = (Expr *)WalkAST(fce->func, preFunc, postFunc, data);
fce->args = (ExprList *)WalkAST(fce->args, preFunc, postFunc, data);
fce->launchCountExpr = (Expr *)WalkAST(fce->launchCountExpr, preFunc,
postFunc, data);
}
else if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL) {
ie->baseExpr = (Expr *)WalkAST(ie->baseExpr, preFunc, postFunc, data);
ie->index = (Expr *)WalkAST(ie->index, preFunc, postFunc, data);
}
else if ((me = dynamic_cast<MemberExpr *>(node)) != NULL)
me->expr = (Expr *)WalkAST(me->expr, preFunc, postFunc, data);
else if ((tce = dynamic_cast<TypeCastExpr *>(node)) != NULL)
tce->expr = (Expr *)WalkAST(tce->expr, preFunc, postFunc, data);
else if ((re = dynamic_cast<ReferenceExpr *>(node)) != NULL)
re->expr = (Expr *)WalkAST(re->expr, preFunc, postFunc, data);
else if ((dre = dynamic_cast<DereferenceExpr *>(node)) != NULL)
dre->expr = (Expr *)WalkAST(dre->expr, preFunc, postFunc, data);
else if ((soe = dynamic_cast<SizeOfExpr *>(node)) != NULL)
soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
aoe->expr = (Expr *)WalkAST(aoe->expr, preFunc, postFunc, data);
else if (dynamic_cast<SymbolExpr *>(node) != NULL ||
dynamic_cast<ConstExpr *>(node) != NULL ||
dynamic_cast<FunctionSymbolExpr *>(node) != NULL ||
dynamic_cast<SyncExpr *>(node) != NULL ||
dynamic_cast<NullPointerExpr *>(node) != NULL) {
// nothing to do
}
else
FATAL("Unhandled expression type in WalkAST().");
}
// Call the callback function
if (postFunc != NULL)
return postFunc(node, data);
else
return node;
}
static ASTNode *
lOptimizeNode(ASTNode *node, void *) {
return node->Optimize();
}
ASTNode *
Optimize(ASTNode *root) {
return WalkAST(root, NULL, lOptimizeNode, NULL);
}
Expr *
Optimize(Expr *expr) {
return (Expr *)Optimize((ASTNode *)expr);
}
Stmt *
Optimize(Stmt *stmt) {
return (Stmt *)Optimize((ASTNode *)stmt);
}
static ASTNode *
lTypeCheckNode(ASTNode *node, void *) {
return node->TypeCheck();
}
ASTNode *
TypeCheck(ASTNode *root) {
return WalkAST(root, NULL, lTypeCheckNode, NULL);
}
Expr *
TypeCheck(Expr *expr) {
return (Expr *)TypeCheck((ASTNode *)expr);
}
Stmt *
TypeCheck(Stmt *stmt) {
return (Stmt *)TypeCheck((ASTNode *)stmt);
}
static bool
lCostCallback(ASTNode *node, void *c) {
int *cost = (int *)c;
*cost += node->EstimateCost();
return true;
}
int
EstimateCost(ASTNode *root) {
int cost = 0;
WalkAST(root, lCostCallback, NULL, &cost);
return cost;
}

ast.h (61 changed lines)

@@ -53,10 +53,11 @@ public:
virtual ~ASTNode();
/** The Optimize() method should perform any appropriate early-stage
optimizations on the node (e.g. constant folding). The caller
should use the returned ASTNode * in place of the original node.
This method may return NULL if an error is encountered during
optimization. */
optimizations on the node (e.g. constant folding). This method
will be called after the node's children have already been
optimized, and the caller will store the returned ASTNode * in
place of the original node. This method should return NULL if an
error is encountered during optimization. */
virtual ASTNode *Optimize() = 0;
/** Type checking should be performed by the node when this method is
@@ -65,6 +66,9 @@ public:
pointer in place of the original ASTNode *. */
virtual ASTNode *TypeCheck() = 0;
/** Estimate the execution cost of the node (not including the cost of
the children. The value returned should be based on the COST_*
enumerant values defined in ispc.h. */
virtual int EstimateCost() const = 0;
/** All AST nodes must track the file position where they are
@@ -91,4 +95,53 @@ private:
std::vector<Function *> functions;
};
/** Callback function type for preorder traversal visiting function for
the AST walk.
*/
typedef bool (* ASTPreCallBackFunc)(ASTNode *node, void *data);
/** Callback function type for postorder traversal visiting function for
the AST walk.
*/
typedef ASTNode * (* ASTPostCallBackFunc)(ASTNode *node, void *data);
/** Walk (some portion of) an AST, starting from the given root node. At
each node, if preFunc is non-NULL, call it, passing the given void
*data pointer; if the call to preFunc function returns false, then the
children of the node aren't visited. This function then makes
recursive calls to WalkAST() to process the node's children; after
doing so, it calls postFunc at the node.  The value returned by the
postFunc call is returned from WalkAST(). */
extern ASTNode *WalkAST(ASTNode *root, ASTPreCallBackFunc preFunc,
ASTPostCallBackFunc postFunc, void *data);
/** Perform simple optimizations on the AST or portion thereof passed to
this function, returning the resulting AST. */
extern ASTNode *Optimize(ASTNode *root);
/** Convenience version of Optimize() for Expr *s that returns an Expr *
(rather than an ASTNode *, which would require the caller to cast back
to an Expr *). */
extern Expr *Optimize(Expr *);
/** Convenience version of Optimize() for Stmt *s that returns a Stmt *
(rather than an ASTNode *, which would require the caller to cast back
to a Stmt *). */
extern Stmt *Optimize(Stmt *);
/** Perform type-checking on the given AST (or portion of one), returning a
pointer to the root of the resulting AST. */
extern ASTNode *TypeCheck(ASTNode *root);
/** Convenience version of TypeCheck() for Expr *s that returns an Expr *. */
extern Expr *TypeCheck(Expr *);
/** Convenience version of TypeCheck() for Stmt *s that returns a Stmt *. */
extern Stmt *TypeCheck(Stmt *);
/** Returns an estimate of the execution cost of the tree starting at
the given root. */
extern int EstimateCost(ASTNode *root);
#endif // ISPC_AST_H

bitcode2cpp.py

@@ -11,7 +11,10 @@ length=0
src=str(sys.argv[1])
target = re.sub(".*builtins-", "", src)
target = re.sub("builtins/target-", "", src)
target = re.sub(r"builtins\\target-", "", target)
target = re.sub("builtins/", "", target)
target = re.sub(r"builtins\\", "", target)
target = re.sub("\.ll$", "", target)
target = re.sub("\.c$", "", target)
target = re.sub("-", "_", target)
@@ -23,17 +26,21 @@ if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT")
try:
as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
except IOError:
print >> sys.stderr, "Couldn't open " + src
sys.stderr.write("Couldn't open " + src)
sys.exit(1)
print "unsigned char builtins_bitcode_" + target + "[] = {"
for line in as_out.stdout.readlines():
length = length + len(line)
for c in line:
print ord(c)
print ", "
print " 0 };\n\n"
print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"
width = 16;
sys.stdout.write("unsigned char builtins_bitcode_" + target + "[] = {\n")
data = as_out.stdout.read()
for i in range(0, len(data), 1):
sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))
if i%width == (width-1):
sys.stdout.write("\n")
sys.stdout.write("0x00 };\n\n")
sys.stdout.write("int builtins_bitcode_" + target + "_length = " + str(i+1) + ";\n")
as_out.wait()

buildispc.bat (new file, 11 lines)

@@ -0,0 +1,11 @@
@echo off
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
REM it can be set here.
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
set LLVM_VERSION=3.1svn
REM Both the LLVM binaries and python need to be in the path
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release

builtins.cpp

@@ -99,6 +99,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
// varying
if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType &&
t == LLVMTypes::MaskType)
return AtomicType::VaryingBool;
else if (t == LLVMTypes::Int8VectorType)
return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
else if (t == LLVMTypes::Int16VectorType)
@@ -194,7 +197,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
// symbol creation code below assumes that any LLVM vector of i32s is a
// varying int32. Here, we need that to be interpreted as a varying
// bool, so just have a one-off override for that one...
if (name == "__sext_varying_bool") {
if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") {
const Type *returnType = AtomicType::VaryingInt32;
std::vector<const Type *> argTypes;
argTypes.push_back(AtomicType::VaryingBool);
@@ -375,10 +378,10 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_xor_uniform_int64_global",
"__broadcast_double",
"__broadcast_float",
"__broadcast_int16",
"__broadcast_int32",
"__broadcast_int64",
"__broadcast_int8",
"__broadcast_i16",
"__broadcast_i32",
"__broadcast_i64",
"__broadcast_i8",
"__ceil_uniform_double",
"__ceil_uniform_float",
"__ceil_varying_double",
@@ -480,10 +483,10 @@ lSetInternalFunctions(llvm::Module *module) {
"__reduce_min_uint64",
"__rotate_double",
"__rotate_float",
"__rotate_int16",
"__rotate_int32",
"__rotate_int64",
"__rotate_int8",
"__rotate_i16",
"__rotate_i32",
"__rotate_i64",
"__rotate_i8",
"__round_uniform_double",
"__round_uniform_float",
"__round_varying_double",
@@ -494,16 +497,16 @@ lSetInternalFunctions(llvm::Module *module) {
"__sext_varying_bool",
"__shuffle2_double",
"__shuffle2_float",
"__shuffle2_int16",
"__shuffle2_int32",
"__shuffle2_int64",
"__shuffle2_int8",
"__shuffle2_i16",
"__shuffle2_i32",
"__shuffle2_i64",
"__shuffle2_i8",
"__shuffle_double",
"__shuffle_float",
"__shuffle_int16",
"__shuffle_int32",
"__shuffle_int64",
"__shuffle_int8",
"__shuffle_i16",
"__shuffle_i32",
"__shuffle_i64",
"__shuffle_i8",
"__soa_to_aos3_float",
"__soa_to_aos3_float16",
"__soa_to_aos3_float4",
@@ -556,7 +559,7 @@ lSetInternalFunctions(llvm::Module *module) {
int count = sizeof(names) / sizeof(names[0]);
for (int i = 0; i < count; ++i) {
llvm::Function *f = module->getFunction(names[i]);
if (f != NULL)
if (f != NULL && f->empty() == false)
f->setLinkage(llvm::GlobalValue::InternalLinkage);
}
}
@@ -714,11 +717,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
extern int builtins_bitcode_sse4_x2_length;
switch (g->target.vectorWidth) {
case 4:
AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
AddBitcodeToModule(builtins_bitcode_sse4,
builtins_bitcode_sse4_length,
module, symbolTable);
break;
case 8:
AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length,
AddBitcodeToModule(builtins_bitcode_sse4_x2,
builtins_bitcode_sse4_x2_length,
module, symbolTable);
break;
default:
@@ -726,24 +731,72 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
}
break;
case Target::AVX:
case Target::AVX2:
switch (g->target.vectorWidth) {
case 8:
extern unsigned char builtins_bitcode_avx[];
extern int builtins_bitcode_avx_length;
AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length,
extern unsigned char builtins_bitcode_avx1[];
extern int builtins_bitcode_avx1_length;
AddBitcodeToModule(builtins_bitcode_avx1,
builtins_bitcode_avx1_length,
module, symbolTable);
break;
case 16:
extern unsigned char builtins_bitcode_avx_x2[];
extern int builtins_bitcode_avx_x2_length;
AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
extern unsigned char builtins_bitcode_avx1_x2[];
extern int builtins_bitcode_avx1_x2_length;
AddBitcodeToModule(builtins_bitcode_avx1_x2,
builtins_bitcode_avx1_x2_length,
module, symbolTable);
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
case Target::AVX2:
switch (g->target.vectorWidth) {
case 8:
extern unsigned char builtins_bitcode_avx2[];
extern int builtins_bitcode_avx2_length;
AddBitcodeToModule(builtins_bitcode_avx2,
builtins_bitcode_avx2_length,
module, symbolTable);
break;
case 16:
extern unsigned char builtins_bitcode_avx2_x2[];
extern int builtins_bitcode_avx2_x2_length;
AddBitcodeToModule(builtins_bitcode_avx2_x2,
builtins_bitcode_avx2_x2_length,
module, symbolTable);
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
case Target::GENERIC:
switch (g->target.vectorWidth) {
case 4:
extern unsigned char builtins_bitcode_generic_4[];
extern int builtins_bitcode_generic_4_length;
AddBitcodeToModule(builtins_bitcode_generic_4,
builtins_bitcode_generic_4_length,
module, symbolTable);
break;
case 8:
extern unsigned char builtins_bitcode_generic_8[];
extern int builtins_bitcode_generic_8_length;
AddBitcodeToModule(builtins_bitcode_generic_8,
builtins_bitcode_generic_8_length,
module, symbolTable);
break;
case 16:
extern unsigned char builtins_bitcode_generic_16[];
extern int builtins_bitcode_generic_16_length;
AddBitcodeToModule(builtins_bitcode_generic_16,
builtins_bitcode_generic_16_length,
module, symbolTable);
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
default:
FATAL("logic error");
}
@@ -771,11 +824,16 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
if (includeStdlibISPC) {
// If the user wants the standard library to be included, parse the
// serialized version of the stdlib.ispc file to get its
// definitions added. Disable emission of performance warnings for
// now, since the user doesn't care about any of that in the stdlib
// implementation...
extern char stdlib_code[];
yy_scan_string(stdlib_code);
yyparse();
// definitions added.
if (g->target.isa == Target::GENERIC) {
extern char stdlib_generic_code[];
yy_scan_string(stdlib_generic_code);
yyparse();
}
else {
extern char stdlib_x86_code[];
yy_scan_string(stdlib_x86_code);
yyparse();
}
}
}

builtins/builtins.c

@@ -149,7 +149,7 @@ void __do_print(const char *format, const char *types, int width, int mask,
int __num_cores() {
#ifdef _MSC_VER
#if defined(_MSC_VER) || defined(__MINGW32__)
// This is quite a hack. Including all of windows.h to get this definition
// pulls in a bunch of stuff that leads to undefined symbols at link time.
// So we don't #include <windows.h> but instead have the equivalent declarations

builtins/dispatch.ll

@@ -48,23 +48,42 @@ declare void @abort() noreturn
;; corresponding to one of the Target::ISA enumerant values that gives the
;; most capable ISA that the current system can run.
;;
;; #ifdef _MSC_VER
;; extern void __stdcall __cpuid(int info[4], int infoType);
;; #else
;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum
;; backwards compatibility for anyone building ispc with LLVM 2.9.
;;
;; #include <stdint.h>
;; #include <stdlib.h>
;;
;; static void __cpuid(int info[4], int infoType) {
;; __asm__ __volatile__ ("cpuid"
;; : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
;; : "0" (infoType));
;; }
;; #endif
;;
;; /* Save %ebx in case it's the PIC register */
;; static void __cpuid_count(int info[4], int level, int count) {
;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
;; "cpuid\n\t"
;; "xchg{l}\t{%%}ebx, %1\n\t"
;; : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
;; : "0" (level), "2" (count));
;; }
;;
;; int32_t __get_system_isa() {
;; int info[4];
;; __cpuid(info, 1);
;;
;; /* NOTE: the values returned below must be the same as the
;; corresponding enumerant values in Target::ISA. */
;; if ((info[2] & (1 << 28)) != 0)
;; return 2; // AVX
;; if ((info[2] & (1 << 28)) != 0) {
;; // AVX1 for sure. Do we have AVX2?
;; // Call cpuid with eax=7, ecx=0
;; __cpuid_count(info, 7, 0);
;; if ((info[1] & (1 << 5)) != 0)
;; return 3; // AVX2
;; else
;; return 2; // AVX1
;; }
;; else if ((info[2] & (1 << 19)) != 0)
;; return 1; // SSE4
;; else if ((info[3] & (1 << 26)) != 0)
@@ -76,33 +95,42 @@ declare void @abort() noreturn
%0 = type { i32, i32, i32, i32 }
define i32 @__get_system_isa() nounwind ssp {
%1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
%2 = extractvalue %0 %1, 2
%3 = extractvalue %0 %1, 3
%4 = and i32 %2, 268435456
%5 = icmp eq i32 %4, 0
br i1 %5, label %6, label %13
entry:
%0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
%asmresult9.i = extractvalue %0 %0, 2
%asmresult10.i = extractvalue %0 %0, 3
%and = and i32 %asmresult9.i, 268435456
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.else7, label %if.then
; <label>:6 ; preds = %0
%7 = and i32 %2, 524288
%8 = icmp eq i32 %7, 0
br i1 %8, label %9, label %13
if.then: ; preds = %entry
%1 = tail call %0 asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult9.i24 = extractvalue %0 %1, 1
%and4 = lshr i32 %asmresult9.i24, 5
%2 = and i32 %and4, 1
%3 = or i32 %2, 2
br label %return
; <label>:9 ; preds = %6
%10 = and i32 %3, 67108864
%11 = icmp eq i32 %10, 0
br i1 %11, label %12, label %13
if.else7: ; preds = %entry
%and10 = and i32 %asmresult9.i, 524288
%cmp11 = icmp eq i32 %and10, 0
br i1 %cmp11, label %if.else13, label %return
; <label>:12 ; preds = %9
if.else13: ; preds = %if.else7
%and16 = and i32 %asmresult10.i, 67108864
%cmp17 = icmp eq i32 %and16, 0
br i1 %cmp17, label %if.else19, label %return
if.else19: ; preds = %if.else13
tail call void @abort() noreturn nounwind
unreachable
; <label>:13 ; preds = %9, %6, %0
%.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
ret i32 %.0
return: ; preds = %if.else13, %if.else7, %if.then
%retval.0 = phi i32 [ %3, %if.then ], [ 1, %if.else7 ], [ 0, %if.else13 ]
ret i32 %retval.0
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This function is called by each of the dispatch functions we generate;
;; it sets @__system_best_isa if it is unset.


@@ -32,6 +32,11 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; AVX target implementation.
ctlztz()
define_prefetches()
define_shuffles()
aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp


@@ -32,12 +32,16 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Basic 16-wide definitions
stdlib_core(16)
packed_load_and_store(16)
scans(16)
int64minmax(16)
define(`WIDTH',`16')
define(`MASK',`i32')
include(`util.m4')
include(`builtins-avx-common.ll')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-avx-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -166,33 +170,6 @@ define <16 x float> @__min_varying_float(<16 x float>,
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
@@ -381,13 +358,13 @@ load_and_broadcast(16, i32, 32)
load_and_broadcast(16, i64, 64)
; no masked load instruction for i8 and i16 types??
load_masked(16, i8, 8, 1)
load_masked(16, i16, 16, 2)
masked_load(16, i8, 8, 1)
masked_load(16, i16, 16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <16 x i32> %mask to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -405,7 +382,7 @@ define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
}
define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
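The 16-wide variants above are assembled from two 8-wide AVX maskload
operations on the low and high halves of the mask. As a scalar model of
the semantics (a sketch, not the generated code; AVX maskload keys off
each element's sign bit and zeroes the inactive lanes):

#include <stdint.h>

// Scalar model of __masked_load_32: lanes whose mask element has the
// sign bit set are read from memory; inactive lanes come back as zero.
void masked_load_32_model(const int32_t *p, const int32_t mask[16],
                          int32_t out[16]) {
    for (int i = 0; i < 16; ++i)
        out[i] = (mask[i] < 0) ? p[i] : 0;
}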
@@ -618,12 +595,7 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)
;; scatter
gen_scatter(16, i8)
gen_scatter(16, i16)


@@ -32,12 +32,16 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Basic 8-wide definitions
stdlib_core(8)
packed_load_and_store(8)
scans(8)
int64minmax(8)
define(`WIDTH',`8')
define(`MASK',`i32')
include(`util.m4')
include(`builtins-avx-common.ll')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-avx-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -166,33 +170,6 @@ define <8 x float> @__min_varying_float(<8 x float>,
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops
@@ -234,7 +211,7 @@ reduce_equal(8)
;; horizontal int32 ops
define <8 x i32> @__add_varying_int32(<8 x i32>,
<8 x i32>) nounwind readnone alwaysinline {
<8 x i32>) nounwind readnone alwaysinline {
%s = add <8 x i32> %0, %1
ret <8 x i32> %s
}
@@ -310,7 +287,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline
;; horizontal int64 ops
define <8 x i64> @__add_varying_int64(<8 x i64>,
<8 x i64>) nounwind readnone alwaysinline {
<8 x i64>) nounwind readnone alwaysinline {
%s = add <8 x i64> %0, %1
ret <8 x i64> %s
}
@@ -362,13 +339,13 @@ load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
; no masked load instruction for i8 and i16 types??
load_masked(8, i8, 8, 1)
load_masked(8, i16, 16, 2)
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <8 x i32> %mask to <8 x float>
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
%retval = bitcast <8 x float> %floatval to <8 x i32>
@@ -376,7 +353,7 @@ define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
}
define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -399,9 +376,6 @@ define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
; FIXME: there is no AVX instruction for these, but we could be clever
; by packing the bits down and setting the last 3/4 or half, respectively,
; of the mask to zero... Not sure if this would be a win in the end
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
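In the meantime, gen_masked_store() falls back to per-lane conditional
stores; roughly this scalar model (an assumption about the macro's
expansion, and a sketch of the semantics rather than the emitted IR):

#include <stdint.h>

// Scalar model of a per-lane masked store: only lanes whose mask is on
// touch memory, so nothing is written outside the active region.
void masked_store_8_model(int8_t *p, const int8_t val[8],
                          const int32_t mask[8]) {
    for (int i = 0; i < 8; ++i)
        if (mask[i] != 0)
            p[i] = val[i];
}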
@@ -516,12 +490,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
;; scatter
gen_scatter(8, i8)
gen_scatter(8, i16)


@@ -0,0 +1,69 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx-x2.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)

builtins/target-avx1.ll (new file, 70 lines)

@@ -0,0 +1,70 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)


@@ -0,0 +1,74 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx-x2.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
ret <16 x i32> %m
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
ret <16 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
ret <16 x i32> %m
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
ret <16 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)

builtins/target-avx2.ll (new file, 75 lines)

@@ -0,0 +1,75 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
%m = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)
ret <8 x i32> %m
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)


@@ -0,0 +1,34 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`16')
include(`target-generic-common.ll')


@@ -0,0 +1,34 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`4')
include(`target-generic-common.ll')


@@ -0,0 +1,34 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`8')
include(`target-generic-common.ll')


@@ -0,0 +1,328 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`MASK',`i1')
include(`util.m4')
stdlib_core()
scans()
reduce_equal(WIDTH)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; broadcast/rotate/shuffle
declare <WIDTH x float> @__smear_float(float) nounwind readnone
declare <WIDTH x double> @__smear_double(double) nounwind readnone
declare <WIDTH x i8> @__smear_i8(i8) nounwind readnone
declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
declare <WIDTH x i16> @__broadcast_i16(<WIDTH x i16>, i32) nounwind readnone
declare <WIDTH x i32> @__broadcast_i32(<WIDTH x i32>, i32) nounwind readnone
declare <WIDTH x i64> @__broadcast_i64(<WIDTH x i64>, i32) nounwind readnone
declare <WIDTH x i8> @__rotate_i8(<WIDTH x i8>, i32) nounwind readnone
declare <WIDTH x i16> @__rotate_i16(<WIDTH x i16>, i32) nounwind readnone
declare <WIDTH x float> @__rotate_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone
declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i16> @__shuffle_i16(<WIDTH x i16>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i16> @__shuffle2_i16(<WIDTH x i16>, <WIDTH x i16>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x float> @__shuffle_float(<WIDTH x float>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x float> @__shuffle2_float(<WIDTH x float>, <WIDTH x float>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__shuffle_i32(<WIDTH x i32>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__shuffle2_i32(<WIDTH x i32>, <WIDTH x i32>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x double> @__shuffle_double(<WIDTH x double>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x double> @__shuffle2_double(<WIDTH x double>,
<WIDTH x double>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__shuffle_i64(<WIDTH x i64>,
<WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__shuffle2_i64(<WIDTH x i64>, <WIDTH x i64>,
<WIDTH x i32>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; aos/soa
declare void @__soa_to_aos3_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
<WIDTH x float> %v2, float * noalias %p) nounwind
declare void @__aos_to_soa3_float(float * noalias %p, <WIDTH x float> * %out0,
<WIDTH x float> * %out1, <WIDTH x float> * %out2) nounwind
declare void @__soa_to_aos4_float(<WIDTH x float> %v0, <WIDTH x float> %v1,
<WIDTH x float> %v2, <WIDTH x float> %v3,
float * noalias %p) nounwind
declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias %out0,
<WIDTH x float> * noalias %out1,
<WIDTH x float> * noalias %out2,
<WIDTH x float> * noalias %out3) nounwind
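These follow the usual interleaving convention; for instance,
__soa_to_aos3_float writes x0 y0 z0 x1 y1 z1 ... to the output pointer.
A scalar model for an 8-wide target (a sketch of the semantics only):

// Scalar model of __soa_to_aos3_float at width 8: three
// structure-of-arrays vectors interleaved into array-of-structures
// order at *p.
void soa_to_aos3_float_model(const float v0[8], const float v1[8],
                             const float v2[8], float *p) {
    for (int i = 0; i < 8; ++i) {
        p[3 * i + 0] = v0[i];
        p[3 * i + 1] = v1[i];
        p[3 * i + 2] = v2[i];
    }
}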
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
declare void @__fastmath() nounwind
;; round/floor/ceil
declare float @__round_uniform_float(float) nounwind readnone
declare float @__floor_uniform_float(float) nounwind readnone
declare float @__ceil_uniform_float(float) nounwind readnone
declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone
declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
;; min/max
declare float @__max_uniform_float(float, float) nounwind readnone
declare float @__min_uniform_float(float, float) nounwind readnone
declare i32 @__min_uniform_int32(i32, i32) nounwind readnone
declare i32 @__max_uniform_int32(i32, i32) nounwind readnone
declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone
declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone
declare i64 @__min_uniform_int64(i64, i64) nounwind readnone
declare i64 @__max_uniform_int64(i64, i64) nounwind readnone
declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone
declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone
declare double @__min_uniform_double(double, double) nounwind readnone
declare double @__max_uniform_double(double, double) nounwind readnone
declare <WIDTH x float> @__max_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__min_varying_float(<WIDTH x float>,
<WIDTH x float>) nounwind readnone
declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
declare <WIDTH x double> @__min_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
<WIDTH x double>) nounwind readnone
;; sqrt/rsqrt/rcp
declare float @__rsqrt_uniform_float(float) nounwind readnone
declare float @__rcp_uniform_float(float) nounwind readnone
declare float @__sqrt_uniform_float(float) nounwind readnone
declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
declare double @__sqrt_uniform_double(double) nounwind readnone
declare <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone
;; bit ops
declare i32 @__popcnt_int32(i32) nounwind readnone
declare i64 @__popcnt_int64(i64) nounwind readnone
declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone
declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
;; svml
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
; or use the macro to call the 4-wide ones twice with our 8-wide
; vectors...
declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
declare <WIDTH x float> @__svml_log(<WIDTH x float>)
declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
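The second option in the FIXME above, calling a narrower entrypoint
twice, amounts to the following sketch (the 4-wide function is
hypothetical, and the real code would express this with the m4 helper
macros rather than in C):

// Hypothetical 4-wide sin entrypoint, assumed for illustration.
void svml_sin_4_model(const float in[4], float out[4]);

// Apply the 4-wide entrypoint to an 8-wide vector, one half at a time.
void svml_sin_8_model(const float in[8], float out[8]) {
    svml_sin_4_model(in, out);          // lanes 0..3
    svml_sin_4_model(in + 4, out + 4);  // lanes 4..7
}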
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
declare i32 @__movmsk(<WIDTH x i1>) nounwind readnone
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone
declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_add_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
declare double @__reduce_add_double(<WIDTH x double>) nounwind readnone
declare double @__reduce_min_double(<WIDTH x double>) nounwind readnone
declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_add_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(WIDTH, i8, 8)
load_and_broadcast(WIDTH, i16, 16)
load_and_broadcast(WIDTH, i32, 32)
load_and_broadcast(WIDTH, i64, 64)
declare <WIDTH x i8> @__masked_load_8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
ifelse(LLVM_VERSION, `LLVM_3_1svn',`
define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i8> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
store <WIDTH x i8> %v1, <WIDTH x i8> * %0
ret void
}
define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i16> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
ret void
}
define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i32> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
ret void
}
define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i64> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
ret void
}
',`
declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
')
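The select-based definitions in the LLVM_3_1svn branch above amount to a
full load, a per-lane select, and a full store; in scalar form (a sketch
of the semantics: note that every lane is read and written regardless of
the mask):

#include <stdbool.h>
#include <stdint.h>

// Scalar model of __masked_store_blend_32 for an 8-wide instance: the
// whole vector is loaded, blended per lane, and stored back in full.
void masked_store_blend_32_model(int32_t *p, const int32_t val[8],
                                 const bool mask[8]) {
    for (int i = 0; i < 8; ++i) {
        int32_t old = p[i];              // load every lane
        p[i] = mask[i] ? val[i] : old;   // select, store every lane
    }
}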
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
')
gather_scatter(i8)
gather_scatter(i16)
gather_scatter(i32)
gather_scatter(i64)
declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
<WIDTH x i1>) nounwind
declare i32 @__packed_store_active(i32 * nocapture, <WIDTH x i32> %vals,
<WIDTH x i1>) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetch
declare void @__prefetch_read_uniform_1(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind


@@ -29,6 +29,11 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ctlztz()
define_prefetches()
define_shuffles()
aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp


@@ -36,12 +36,16 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; standard 8-wide definitions from m4 macros
stdlib_core(8)
packed_load_and_store(8)
scans(8)
int64minmax(8)
define(`WIDTH',`8')
define(`MASK',`i32')
include(`util.m4')
include(`builtins-sse2-common.ll')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -425,10 +429,10 @@ load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
load_masked(8, i8, 8, 1)
load_masked(8, i16, 16, 2)
load_masked(8, i32, 32, 4)
load_masked(8, i64, 64, 8)
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(8, i32, 32, 4)
masked_load(8, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter


@@ -33,12 +33,16 @@
;; Define the standard library builtins for the SSE2 target
; Define some basics for a 4-wide target
stdlib_core(4)
packed_load_and_store(4)
scans(4)
int64minmax(4)
define(`WIDTH',`4')
define(`MASK',`i32')
include(`util.m4')
include(`builtins-sse2-common.ll')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
@@ -552,10 +556,10 @@ load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
load_masked(4, i8, 8, 1)
load_masked(4, i16, 16, 2)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)
masked_load(4, i8, 8, 1)
masked_load(4, i16, 16, 2)
masked_load(4, i32, 32, 4)
masked_load(4, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter


@@ -29,6 +29,11 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ctlztz()
define_prefetches()
define_shuffles()
aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats


@@ -36,12 +36,16 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; standard 8-wide definitions from m4 macros
stdlib_core(8)
packed_load_and_store(8)
scans(8)
int64minmax(8)
define(`WIDTH',`8')
define(`MASK',`i32')
include(`util.m4')
include(`builtins-sse4-common.ll')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -352,10 +356,10 @@ load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
load_masked(8, i8, 8, 1)
load_masked(8, i16, 16, 2)
load_masked(8, i32, 32, 4)
load_masked(8, i64, 64, 8)
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(8, i32, 32, 4)
masked_load(8, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter


@@ -33,12 +33,16 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Define common 4-wide stuff
stdlib_core(4)
packed_load_and_store(4)
scans(4)
int64minmax(4)
define(`WIDTH',`4')
define(`MASK',`i32')
include(`util.m4')
include(`builtins-sse4-common.ll')
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -451,10 +455,10 @@ load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
load_masked(4, i8, 8, 1)
load_masked(4, i16, 16, 2)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)
masked_load(4, i8, 8, 1)
masked_load(4, i16, 16, 2)
masked_load(4, i32, 32, 4)
masked_load(4, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

File diff suppressed because it is too large

cbackend.cpp (new file, 4378 lines)

File diff suppressed because it is too large

ctx.cpp (631 lines)

@@ -74,18 +74,35 @@ struct CFInfo {
llvm::Value *savedContinueLanesPtr,
llvm::Value *savedMask, llvm::Value *savedLoopMask);
static CFInfo *GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
llvm::BasicBlock *continueTarget,
llvm::Value *savedBreakLanesPtr,
llvm::Value *savedContinueLanesPtr,
llvm::Value *savedMask, llvm::Value *savedLoopMask,
llvm::Value *switchExpr,
llvm::BasicBlock *bbDefault,
const std::vector<std::pair<int, llvm::BasicBlock *> > *bbCases,
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbNext,
bool scUniform);
bool IsIf() { return type == If; }
bool IsLoop() { return type == Loop; }
bool IsForeach() { return type == Foreach; }
bool IsVaryingType() { return !isUniform; }
bool IsSwitch() { return type == Switch; }
bool IsVarying() { return !isUniform; }
bool IsUniform() { return isUniform; }
enum CFType { If, Loop, Foreach };
enum CFType { If, Loop, Foreach, Switch };
CFType type;
bool isUniform;
llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
llvm::Value *savedBreakLanesPtr, *savedContinueLanesPtr;
llvm::Value *savedMask, *savedLoopMask;
llvm::Value *savedSwitchExpr;
llvm::BasicBlock *savedDefaultBlock;
const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCaseBlocks;
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNextBlocks;
bool savedSwitchConditionWasUniform;
private:
CFInfo(CFType t, bool uniformIf, llvm::Value *sm) {
@@ -95,11 +112,18 @@ private:
savedBreakTarget = savedContinueTarget = NULL;
savedBreakLanesPtr = savedContinueLanesPtr = NULL;
savedMask = savedLoopMask = sm;
savedSwitchExpr = NULL;
savedDefaultBlock = NULL;
savedCaseBlocks = NULL;
savedNextBlocks = NULL;
}
CFInfo(CFType t, bool iu, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
llvm::Value *lm) {
Assert(t == Loop);
llvm::Value *lm, llvm::Value *sse = NULL, llvm::BasicBlock *bbd = NULL,
const std::vector<std::pair<int, llvm::BasicBlock *> > *bbc = NULL,
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbn = NULL,
bool scu = false) {
Assert(t == Loop || t == Switch);
type = t;
isUniform = iu;
savedBreakTarget = bt;
@@ -108,6 +132,11 @@ private:
savedContinueLanesPtr = sc;
savedMask = sm;
savedLoopMask = lm;
savedSwitchExpr = sse;
savedDefaultBlock = bbd;
savedCaseBlocks = bbc;
savedNextBlocks = bbn;
savedSwitchConditionWasUniform = scu;
}
CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
@@ -121,6 +150,10 @@ private:
savedContinueLanesPtr = sc;
savedMask = sm;
savedLoopMask = lm;
savedSwitchExpr = NULL;
savedDefaultBlock = NULL;
savedCaseBlocks = NULL;
savedNextBlocks = NULL;
}
};
@@ -154,12 +187,30 @@ CFInfo::GetForeach(llvm::BasicBlock *breakTarget,
savedMask, savedForeachMask);
}
CFInfo *
CFInfo::GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
llvm::BasicBlock *continueTarget,
llvm::Value *savedBreakLanesPtr,
llvm::Value *savedContinueLanesPtr, llvm::Value *savedMask,
llvm::Value *savedLoopMask, llvm::Value *savedSwitchExpr,
llvm::BasicBlock *savedDefaultBlock,
const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCases,
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNext,
bool savedSwitchConditionUniform) {
return new CFInfo(Switch, isUniform, breakTarget, continueTarget,
savedBreakLanesPtr, savedContinueLanesPtr,
savedMask, savedLoopMask, savedSwitchExpr, savedDefaultBlock,
savedCases, savedNext, savedSwitchConditionUniform);
}
///////////////////////////////////////////////////////////////////////////
FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
llvm::Function *llvmFunction,
llvm::Function *lf,
SourcePos firstStmtPos) {
function = func;
llvmFunction = lf;
/* Create a new basic block to store all of the allocas */
allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", llvmFunction, 0);
@@ -181,6 +232,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
breakLanesPtr = continueLanesPtr = NULL;
breakTarget = continueTarget = NULL;
switchExpr = NULL;
caseBlocks = NULL;
defaultBlock = NULL;
nextBlocks = NULL;
returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
StoreInst(LLVMMaskAllOff, returnedLanesPtr);
@@ -421,51 +477,61 @@ FunctionEmitContext::StartVaryingIf(llvm::Value *oldMask) {
void
FunctionEmitContext::EndIf() {
CFInfo *ci = popCFState();
// Make sure we match up with a Start{Uniform,Varying}If().
Assert(controlFlowInfo.size() > 0 && controlFlowInfo.back()->IsIf());
CFInfo *ci = controlFlowInfo.back();
controlFlowInfo.pop_back();
Assert(ci->IsIf());
// 'uniform' ifs don't change the mask so we only need to restore the
// mask going into the if for 'varying' if statements
if (!ci->IsUniform() && bblock != NULL) {
// We can't just restore the mask as it was going into the 'if'
// statement. First we have to take into account any program
// instances that have executed 'return' statements; the restored
// mask must be off for those lanes.
restoreMaskGivenReturns(ci->savedMask);
if (ci->IsUniform() || bblock == NULL)
return;
// If the 'if' statement is inside a loop with a 'varying'
// condition, we also need to account for any break or continue
// statements that executed inside the 'if' statement; we also must
// leave the lane masks for the program instances that ran those
// off after we restore the mask after the 'if'. The code below
// ends up being optimized out in the case that there were no break
// or continue statements (and breakLanesPtr and continueLanesPtr
// have their initial 'all off' values), so we don't need to check
// for that here.
if (continueLanesPtr != NULL) {
// We want to compute:
// newMask = (oldMask & ~(breakLanes | continueLanes))
llvm::Value *oldMask = GetInternalMask();
llvm::Value *continueLanes = LoadInst(continueLanesPtr,
"continue_lanes");
llvm::Value *bcLanes = continueLanes;
// We can't just restore the mask as it was going into the 'if'
// statement. First we have to take into account any program
// instances that have executed 'return' statements; the restored
// mask must be off for those lanes.
restoreMaskGivenReturns(ci->savedMask);
if (breakLanesPtr != NULL) {
// breakLanesPtr will be NULL if we're inside a 'foreach' loop
llvm::Value *breakLanes = LoadInst(breakLanesPtr, "break_lanes");
bcLanes = BinaryOperator(llvm::Instruction::Or, breakLanes,
continueLanes, "break|continue_lanes");
}
// If the 'if' statement is inside a loop with a 'varying'
// condition, we also need to account for any break or continue
// statements that executed inside the 'if' statement; we also must
// leave the lane masks for the program instances that ran those
// off after we restore the mask after the 'if'. The code below
// ends up being optimized out in the case that there were no break
// or continue statements (and breakLanesPtr and continueLanesPtr
// have their initial 'all off' values), so we don't need to check
// for that here.
//
// There are three general cases to deal with here:
// - Loops: both break and continue are allowed, and thus the corresponding
// lane mask pointers are non-NULL
// - Foreach: only continueLanesPtr may be non-NULL
// - Switch: only breakLanesPtr may be non-NULL
if (continueLanesPtr != NULL || breakLanesPtr != NULL) {
// We want to compute:
// newMask = (oldMask & ~(breakLanes | continueLanes)),
// treating breakLanes or continueLanes as "all off" if the
// corresponding pointer is NULL.
llvm::Value *bcLanes = NULL;
llvm::Value *notBreakOrContinue =
NotOperator(bcLanes, "!(break|continue)_lanes");
llvm::Value *newMask =
BinaryOperator(llvm::Instruction::And, oldMask,
notBreakOrContinue, "new_mask");
SetInternalMask(newMask);
if (continueLanesPtr != NULL)
bcLanes = LoadInst(continueLanesPtr, "continue_lanes");
else
bcLanes = LLVMMaskAllOff;
if (breakLanesPtr != NULL) {
llvm::Value *breakLanes = LoadInst(breakLanesPtr, "break_lanes");
bcLanes = BinaryOperator(llvm::Instruction::Or, bcLanes,
breakLanes, "|break_lanes");
}
llvm::Value *notBreakOrContinue =
NotOperator(bcLanes, "!(break|continue)_lanes");
llvm::Value *oldMask = GetInternalMask();
llvm::Value *newMask =
BinaryOperator(llvm::Instruction::And, oldMask,
notBreakOrContinue, "new_mask");
SetInternalMask(newMask);
}
}
@@ -501,17 +567,8 @@ FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct,
void
FunctionEmitContext::EndLoop() {
Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsLoop());
CFInfo *ci = controlFlowInfo.back();
controlFlowInfo.pop_back();
// Restore the break/continue state information to what it was before
// we went into this loop.
breakTarget = ci->savedBreakTarget;
continueTarget = ci->savedContinueTarget;
breakLanesPtr = ci->savedBreakLanesPtr;
continueLanesPtr = ci->savedContinueLanesPtr;
loopMask = ci->savedLoopMask;
CFInfo *ci = popCFState();
Assert(ci->IsLoop());
if (!ci->IsUniform())
// If the loop had a 'uniform' test, then it didn't make any
@@ -524,7 +581,7 @@ FunctionEmitContext::EndLoop() {
void
FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
FunctionEmitContext::StartForeach() {
// Store the current values of various loop-related state so that we
// can restore it when we exit this loop.
llvm::Value *oldMask = GetInternalMask();
@@ -536,7 +593,7 @@ FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
continueLanesPtr = AllocaInst(LLVMTypes::MaskType, "foreach_continue_lanes");
StoreInst(LLVMMaskAllOff, continueLanesPtr);
continueTarget = ct;
continueTarget = NULL; // should be set by SetContinueTarget()
loopMask = NULL;
}
@@ -544,17 +601,8 @@ FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
void
FunctionEmitContext::EndForeach() {
Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsForeach());
CFInfo *ci = controlFlowInfo.back();
controlFlowInfo.pop_back();
// Restore the break/continue state information to what it was before
// we went into this loop.
breakTarget = ci->savedBreakTarget;
continueTarget = ci->savedContinueTarget;
breakLanesPtr = ci->savedBreakLanesPtr;
continueLanesPtr = ci->savedContinueLanesPtr;
loopMask = ci->savedLoopMask;
CFInfo *ci = popCFState();
Assert(ci->IsForeach());
}
@@ -575,28 +623,64 @@ FunctionEmitContext::restoreMaskGivenReturns(llvm::Value *oldMask) {
}
/** Returns "true" if the first enclosing non-if control flow expression is
a "switch" statement.
*/
bool
FunctionEmitContext::inSwitchStatement() const {
// Go backwards through controlFlowInfo, since we add new nested scopes
// to the back.
int i = controlFlowInfo.size() - 1;
while (i >= 0 && controlFlowInfo[i]->IsIf())
--i;
// Got to the first non-if (or end of CF info)
if (i == -1)
return false;
return controlFlowInfo[i]->IsSwitch();
}
void
FunctionEmitContext::Break(bool doCoherenceCheck) {
Assert(controlFlowInfo.size() > 0);
if (breakTarget == NULL) {
Error(currentPos, "\"break\" statement is illegal outside of "
"for/while/do loops.");
"for/while/do loops and \"switch\" statements.");
return;
}
if (bblock == NULL)
return;
if (inSwitchStatement() == true &&
switchConditionWasUniform == true &&
ifsInCFAllUniform(CFInfo::Switch)) {
// We know that all program instances are executing the break, so
// just jump to the block immediately after the switch.
Assert(breakTarget != NULL);
BranchInst(breakTarget);
bblock = NULL;
return;
}
// If all of the enclosing 'if' tests in the loop have uniform control
// flow or if we can tell that the mask is all on, then we can just
// jump to the break location.
if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
if (inSwitchStatement() == false &&
(ifsInCFAllUniform(CFInfo::Loop) ||
GetInternalMask() == LLVMMaskAllOn)) {
BranchInst(breakTarget);
if (ifsInLoopAllUniform() && doCoherenceCheck)
Warning(currentPos, "Coherent break statement not necessary in fully uniform "
"control flow.");
if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
Warning(currentPos, "Coherent break statement not necessary in "
"fully uniform control flow.");
// Set bblock to NULL since the jump has terminated the basic block
bblock = NULL;
}
else {
// Otherwise we need to update the mask of the lanes that have
// executed a 'break' statement:
// Varying switch, uniform switch where the 'break' is under
// varying control flow, or a loop with varying 'if's above the
// break. In these cases, we need to update the mask of the lanes
// that have executed a 'break' statement:
// breakLanes = breakLanes | mask
Assert(breakLanesPtr != NULL);
llvm::Value *mask = GetInternalMask();
@@ -612,16 +696,20 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
// an 'if' statement and restore the mask then.
SetInternalMask(LLVMMaskAllOff);
if (doCoherenceCheck)
// If the user has indicated that this is a 'coherent' break
// statement, then check to see if the mask is all off. If so,
// we have to conservatively jump to the continueTarget, not
// the breakTarget, since part of the reason the mask is all
// off may be due to 'continue' statements that executed in the
// current loop iteration.
// FIXME: if the loop only has break statements and no
// continues, we can jump to breakTarget in that case.
jumpIfAllLoopLanesAreDone(continueTarget);
if (doCoherenceCheck) {
if (continueTarget != NULL)
// If the user has indicated that this is a 'coherent'
// break statement, then check to see if the mask is all
// off. If so, we have to conservatively jump to the
// continueTarget, not the breakTarget, since part of the
// reason the mask is all off may be due to 'continue'
// statements that executed in the current loop iteration.
jumpIfAllLoopLanesAreDone(continueTarget);
else if (breakTarget != NULL)
// Similarly handle these for switch statements, where we
// only have a break target.
jumpIfAllLoopLanesAreDone(breakTarget);
}
}
}
@@ -634,12 +722,12 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
return;
}
if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
if (ifsInCFAllUniform(CFInfo::Loop) || GetInternalMask() == LLVMMaskAllOn) {
// Similarly to 'break' statements, we can immediately jump to the
// continue target if we're only in 'uniform' control flow within
// loop or if we can tell that the mask is all on.
AddInstrumentationPoint("continue: uniform CF, jumped");
if (ifsInLoopAllUniform() && doCoherenceCheck)
if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
Warning(currentPos, "Coherent continue statement not necessary in "
"fully uniform control flow.");
BranchInst(continueTarget);
@@ -652,8 +740,9 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
llvm::Value *mask = GetInternalMask();
llvm::Value *continueMask =
LoadInst(continueLanesPtr, "continue_mask");
llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or,
mask, continueMask, "mask|continueMask");
llvm::Value *newMask =
BinaryOperator(llvm::Instruction::Or, mask, continueMask,
"mask|continueMask");
StoreInst(newMask, continueLanesPtr);
// And set the current mask to be all off in case there are any
@@ -670,22 +759,23 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
/** This function checks to see if all of the 'if' statements (if any)
between the current scope and the first enclosing loop have 'uniform'
tests.
between the current scope and the first enclosing loop/switch of given
control flow type have 'uniform' tests.
*/
bool
FunctionEmitContext::ifsInLoopAllUniform() const {
FunctionEmitContext::ifsInCFAllUniform(int type) const {
Assert(controlFlowInfo.size() > 0);
// Go backwards through controlFlowInfo, since we add new nested scopes
// to the back. Stop once we come to the first enclosing loop.
// to the back. Stop once we come to the first enclosing control flow
// structure of the desired type.
int i = controlFlowInfo.size() - 1;
while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Loop) {
while (i >= 0 && controlFlowInfo[i]->type != type) {
if (controlFlowInfo[i]->isUniform == false)
// Found a scope due to an 'if' statement with a varying test
return false;
--i;
}
Assert(i >= 0); // else we didn't find a loop!
Assert(i >= 0); // else we didn't find the expected control flow type!
return true;
}
@@ -758,11 +848,249 @@ FunctionEmitContext::RestoreContinuedLanes() {
}
void
FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) {
llvm::Value *oldMask = GetInternalMask();
controlFlowInfo.push_back(CFInfo::GetSwitch(cfIsUniform, breakTarget,
continueTarget, breakLanesPtr,
continueLanesPtr, oldMask,
loopMask, switchExpr, defaultBlock,
caseBlocks, nextBlocks,
switchConditionWasUniform));
breakLanesPtr = AllocaInst(LLVMTypes::MaskType, "break_lanes_memory");
StoreInst(LLVMMaskAllOff, breakLanesPtr);
breakTarget = bbBreak;
continueLanesPtr = NULL;
continueTarget = NULL;
loopMask = NULL;
// These will be set by the SwitchInst() method
switchExpr = NULL;
defaultBlock = NULL;
caseBlocks = NULL;
nextBlocks = NULL;
}
void
FunctionEmitContext::EndSwitch() {
Assert(bblock != NULL);
CFInfo *ci = popCFState();
if (ci->IsVarying() && bblock != NULL)
restoreMaskGivenReturns(ci->savedMask);
}
/** Emit code to check for an "all off" mask before the code for a
case or default label in a "switch" statement.
*/
void
FunctionEmitContext::addSwitchMaskCheck(llvm::Value *mask) {
llvm::Value *allOff = None(mask);
llvm::BasicBlock *bbSome = CreateBasicBlock("case_default_on");
// Find the basic block for the case or default label immediately after
// the current one in the switch statement--that's where we want to
// jump if the mask is all off at this label.
Assert(nextBlocks->find(bblock) != nextBlocks->end());
llvm::BasicBlock *bbNext = nextBlocks->find(bblock)->second;
// Jump to the next one if the mask is all off; otherwise jump to the
// newly created block that will hold the actual code for this label.
BranchInst(bbNext, bbSome, allOff);
SetCurrentBasicBlock(bbSome);
}
/** Returns the execution mask at entry to the first enclosing "switch"
statement. */
llvm::Value *
FunctionEmitContext::getMaskAtSwitchEntry() {
Assert(controlFlowInfo.size() > 0);
int i = controlFlowInfo.size() - 1;
while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Switch)
--i;
Assert(i != -1);
return controlFlowInfo[i]->savedMask;
}
void
FunctionEmitContext::EmitDefaultLabel(bool checkMask, SourcePos pos) {
if (inSwitchStatement() == false) {
Error(pos, "\"default\" label illegal outside of \"switch\" "
"statement.");
return;
}
// If there's a default label in the switch, a basic block for it
// should have been provided in the previous call to SwitchInst().
Assert(defaultBlock != NULL);
if (bblock != NULL)
// The previous case in the switch fell through, or we're in a
// varying switch; terminate the current block with a jump to the
// block for the code for the default label.
BranchInst(defaultBlock);
SetCurrentBasicBlock(defaultBlock);
if (switchConditionWasUniform)
// Nothing more to do for this case; return to the caller,
// which will then emit the code for the default case.
return;
// For a varying switch, we need to update the execution mask.
//
// First, compute the mask that corresponds to which program instances
// should execute the "default" code; this corresponds to the set of
// program instances that don't match any of the case statements.
// Therefore, we generate code that compares the value of the switch
// expression to the value associated with each of the "case"
// statements such that the surviving lanes didn't match any of them.
llvm::Value *matchesDefault = getMaskAtSwitchEntry();
for (int i = 0; i < (int)caseBlocks->size(); ++i) {
int value = (*caseBlocks)[i].first;
llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
LLVMInt32Vector(value) : LLVMInt64Vector(value);
// TODO: for AVX2 at least, the following generates better code
// than doing ICMP_NE and skipping the NotOperator() below; file an
// LLVM bug?
llvm::Value *matchesCaseValue =
CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
valueVec, "cmp_case_value");
matchesCaseValue = I1VecToBoolVec(matchesCaseValue);
llvm::Value *notMatchesCaseValue = NotOperator(matchesCaseValue);
matchesDefault = BinaryOperator(llvm::Instruction::And, matchesDefault,
notMatchesCaseValue, "default&~case_match");
}
// The mask may have some lanes on, which corresponds to the previous
// label falling through; compute the updated mask by ORing the
// default-matching lanes into the current mask.
llvm::Value *oldMask = GetInternalMask();
llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask,
matchesDefault, "old_mask|matches_default");
SetInternalMask(newMask);
if (checkMask)
addSwitchMaskCheck(newMask);
}
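To make the computed default mask concrete, consider a hedged ispc sketch: the "default" body runs for the lanes whose switch value matched no "case", plus any lanes that fell through from the previous label (a(), b(), c() are hypothetical functions):
::
    int x = ...;           // varying switch value
    switch (x) {
    case 0:  a(); break;
    case 1:  b();          // no break: x == 1 lanes fall through
    default: c();          // runs for x == 1 lanes and all unmatched lanes
    }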
void
FunctionEmitContext::EmitCaseLabel(int value, bool checkMask, SourcePos pos) {
if (inSwitchStatement() == false) {
Error(pos, "\"case\" label illegal outside of \"switch\" statement.");
return;
}
// Find the basic block for this case statement.
llvm::BasicBlock *bbCase = NULL;
Assert(caseBlocks != NULL);
for (int i = 0; i < (int)caseBlocks->size(); ++i)
if ((*caseBlocks)[i].first == value) {
bbCase = (*caseBlocks)[i].second;
break;
}
Assert(bbCase != NULL);
if (bblock != NULL)
// fall through from the previous case
BranchInst(bbCase);
SetCurrentBasicBlock(bbCase);
if (switchConditionWasUniform)
return;
// update the mask: first, get a mask that indicates which program
// instances have a value for the switch expression that matches this
// case statement.
llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
LLVMInt32Vector(value) : LLVMInt64Vector(value);
llvm::Value *matchesCaseValue =
CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
valueVec, "cmp_case_value");
matchesCaseValue = I1VecToBoolVec(matchesCaseValue);
// If a lane was off going into the switch, we don't care if it has a
// value in the switch expression that happens to match this case.
llvm::Value *entryMask = getMaskAtSwitchEntry();
matchesCaseValue = BinaryOperator(llvm::Instruction::And, entryMask,
matchesCaseValue, "entry_mask&case_match");
// Take the surviving lanes and turn on the mask for them.
llvm::Value *oldMask = GetInternalMask();
llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask,
matchesCaseValue, "mask|case_match");
SetInternalMask(newMask);
if (checkMask)
addSwitchMaskCheck(newMask);
}
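One subtlety worth spelling out: a program instance that was already disabled at switch entry stays disabled even if its value matches a case, which is exactly why the code above ANDs with getMaskAtSwitchEntry(). A hedged sketch (active, x, and f() are hypothetical):
::
    if (active[programIndex]) {  // varying condition
        switch (x) {
        case 2:
            f();   // runs only for lanes enabled in the 'if' that
                   // also have x == 2
            break;
        }
    }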
void
FunctionEmitContext::SwitchInst(llvm::Value *expr, llvm::BasicBlock *bbDefault,
const std::vector<std::pair<int, llvm::BasicBlock *> > &bbCases,
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &bbNext) {
// The calling code should have called StartSwitch() before calling
// SwitchInst().
Assert(controlFlowInfo.size() &&
controlFlowInfo.back()->IsSwitch());
switchExpr = expr;
defaultBlock = bbDefault;
caseBlocks = new std::vector<std::pair<int, llvm::BasicBlock *> >(bbCases);
nextBlocks = new std::map<llvm::BasicBlock *, llvm::BasicBlock *>(bbNext);
switchConditionWasUniform =
(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(expr->getType()) == false);
if (switchConditionWasUniform == true) {
// For a uniform switch condition, just wire things up to the LLVM
// switch instruction.
llvm::SwitchInst *s = llvm::SwitchInst::Create(expr, bbDefault,
bbCases.size(), bblock);
for (int i = 0; i < (int)bbCases.size(); ++i) {
if (expr->getType() == LLVMTypes::Int32Type)
s->addCase(LLVMInt32(bbCases[i].first), bbCases[i].second);
else {
Assert(expr->getType() == LLVMTypes::Int64Type);
s->addCase(LLVMInt64(bbCases[i].first), bbCases[i].second);
}
}
AddDebugPos(s);
// switch is a terminator
bblock = NULL;
}
else {
// For a varying switch, we first turn off all lanes of the mask
SetInternalMask(LLVMMaskAllOff);
if (nextBlocks->size() > 0) {
// If there are any labels inside the switch, jump to the first
// one; any code before the first label won't be executed by
// anyone.
std::map<llvm::BasicBlock *, llvm::BasicBlock *>::const_iterator iter;
iter = nextBlocks->find(NULL);
Assert(iter != nextBlocks->end());
llvm::BasicBlock *bbFirst = iter->second;
BranchInst(bbFirst);
bblock = NULL;
}
}
}
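For the uniform path this maps directly onto a single LLVM 'switch' terminator; a hedged ispc-level sketch of code that takes that path (initFast() and initSlow() are hypothetical):
::
    uniform int mode = ...;       // uniform condition
    switch (mode) {               // compiles to one llvm::SwitchInst
    case 0:  initFast(); break;
    default: initSlow(); break;
    }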
int
FunctionEmitContext::VaryingCFDepth() const {
int sum = 0;
for (unsigned int i = 0; i < controlFlowInfo.size(); ++i)
if (controlFlowInfo[i]->IsVaryingType())
if (controlFlowInfo[i]->IsVarying())
++sum;
return sum;
}
@@ -777,6 +1105,41 @@ FunctionEmitContext::InForeachLoop() const {
}
bool
FunctionEmitContext::initLabelBBlocks(ASTNode *node, void *data) {
LabeledStmt *ls = dynamic_cast<LabeledStmt *>(node);
if (ls == NULL)
return true;
FunctionEmitContext *ctx = (FunctionEmitContext *)data;
if (ctx->labelMap.find(ls->name) != ctx->labelMap.end())
Error(ls->pos, "Multiple labels named \"%s\" in function.",
ls->name.c_str());
else {
llvm::BasicBlock *bb = ctx->CreateBasicBlock(ls->name.c_str());
ctx->labelMap[ls->name] = bb;
}
return true;
}
void
FunctionEmitContext::InitializeLabelMap(Stmt *code) {
labelMap.erase(labelMap.begin(), labelMap.end());
WalkAST(code, initLabelBBlocks, NULL, this);
}
llvm::BasicBlock *
FunctionEmitContext::GetLabeledBasicBlock(const std::string &label) {
if (labelMap.find(label) != labelMap.end())
return labelMap[label];
else
return NULL;
}
void
FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
const Type *returnType = function->GetReturnType();
@@ -869,14 +1232,25 @@ FunctionEmitContext::All(llvm::Value *mask) {
}
llvm::Value *
FunctionEmitContext::None(llvm::Value *mask) {
llvm::Value *mmval = LaneMask(mask);
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
LLVMInt32(0), "none_mm_cmp");
}
llvm::Value *
FunctionEmitContext::LaneMask(llvm::Value *v) {
// Call the target-dependent movmsk function to turn the vector mask
// into an i32 value
std::vector<Symbol *> mm;
m->symbolTable->LookupFunction("__movmsk", &mm);
// There should be one with signed int signature, one unsigned int.
Assert(mm.size() == 2);
if (g->target.maskBitCount == 1)
Assert(mm.size() == 1);
else
// There should be one with signed int signature, one unsigned int.
Assert(mm.size() == 2);
// We can actually call either one, since both are i32s as far as
// LLVM's type system is concerned...
llvm::Function *fmm = mm[0]->function;
@@ -917,8 +1291,7 @@ FunctionEmitContext::GetStringPtr(const std::string &str) {
llvm::BasicBlock *
FunctionEmitContext::CreateBasicBlock(const char *name) {
llvm::Function *function = bblock->getParent();
return llvm::BasicBlock::Create(*g->ctx, name, function);
return llvm::BasicBlock::Create(*g->ctx, name, llvmFunction);
}
@@ -929,6 +1302,9 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
return NULL;
}
if (g->target.maskBitCount == 1)
return b;
LLVM_TYPE_CONST llvm::ArrayType *at =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(b->getType());
if (at) {
@@ -1462,7 +1838,7 @@ FunctionEmitContext::applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
// Find the scale factor for the index (i.e., the size of the object
// that the pointer(s) point(s) to).
const Type *scaleType = ptrType->GetBaseType();
llvm::Value *scale = g->target.SizeOf(scaleType->LLVMType(g->ctx));
llvm::Value *scale = g->target.SizeOf(scaleType->LLVMType(g->ctx), bblock);
bool indexIsVarying =
llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index->getType());
@@ -1645,7 +2021,8 @@ FunctionEmitContext::AddElementOffset(llvm::Value *basePtr, int elementNum,
if (st != NULL)
// If the pointer is to a structure, Target::StructOffset() gives
// us the offset in bytes to the given element of the structure
offset = g->target.StructOffset(st->LLVMType(g->ctx), elementNum);
offset = g->target.StructOffset(st->LLVMType(g->ctx), elementNum,
bblock);
else {
// Otherwise we should have a vector or array here and the offset
// is given by the element number times the size of the element
@@ -1654,7 +2031,7 @@ FunctionEmitContext::AddElementOffset(llvm::Value *basePtr, int elementNum,
dynamic_cast<const SequentialType *>(ptrType->GetBaseType());
Assert(st != NULL);
llvm::Value *size =
g->target.SizeOf(st->GetElementType()->LLVMType(g->ctx));
g->target.SizeOf(st->GetElementType()->LLVMType(g->ctx), bblock);
llvm::Value *scale = (g->target.is32Bit || g->opt.force32BitAddressing) ?
LLVMInt32(elementNum) : LLVMInt64(elementNum);
offset = BinaryOperator(llvm::Instruction::Mul, size, scale);
@@ -1939,6 +2316,20 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
else
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64");
}
else if (valueType == AtomicType::VaryingBool &&
g->target.maskBitCount == 1) {
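// With a 1-bit mask there is no masked-store path to call here, so
// emit the classic blend identity over the whole vector:
//     result = (old & ~mask) | (value & mask)
// i.e., keep the old value in disabled lanes and take the new value
// in enabled lanes; the instructions below build exactly this.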
llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask,
LLVMMaskAllOn, "~mask");
llvm::Value *old = LoadInst(ptr);
llvm::Value *maskedOld = BinaryOperator(llvm::Instruction::And, old,
notMask, "old&~mask");
llvm::Value *maskedNew = BinaryOperator(llvm::Instruction::And, value,
mask, "new&mask");
llvm::Value *final = BinaryOperator(llvm::Instruction::Or, maskedOld,
maskedNew, "old_new_result");
StoreInst(final, ptr);
return;
}
else if (valueType == AtomicType::VaryingDouble ||
valueType == AtomicType::VaryingInt64 ||
valueType == AtomicType::VaryingUInt64) {
@@ -2459,7 +2850,7 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
Assert(falloc != NULL);
llvm::Value *structSize = g->target.SizeOf(argStructType);
llvm::Value *structSize = g->target.SizeOf(argStructType, bblock);
if (structSize->getType() != LLVMTypes::Int64Type)
// ISPCAlloc expects the size as an uint64_t, but on 32-bit
// targets, SizeOf returns a 32-bit value
@@ -2555,7 +2946,7 @@ FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,
// Find the size of a uniform element of the varying type
LLVM_TYPE_CONST llvm::Type *llvmBaseUniformType =
baseType->GetAsUniformType()->LLVMType(g->ctx);
llvm::Value *unifSize = g->target.SizeOf(llvmBaseUniformType);
llvm::Value *unifSize = g->target.SizeOf(llvmBaseUniformType, bblock);
unifSize = SmearUniform(unifSize);
// Compute offset = <0, 1, .. > * unifSize
@@ -2576,3 +2967,37 @@ FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,
return BinaryOperator(llvm::Instruction::Add, ptr, offset);
}
CFInfo *
FunctionEmitContext::popCFState() {
Assert(controlFlowInfo.size() > 0);
CFInfo *ci = controlFlowInfo.back();
controlFlowInfo.pop_back();
if (ci->IsSwitch()) {
breakTarget = ci->savedBreakTarget;
continueTarget = ci->savedContinueTarget;
breakLanesPtr = ci->savedBreakLanesPtr;
continueLanesPtr = ci->savedContinueLanesPtr;
loopMask = ci->savedLoopMask;
switchExpr = ci->savedSwitchExpr;
defaultBlock = ci->savedDefaultBlock;
caseBlocks = ci->savedCaseBlocks;
nextBlocks = ci->savedNextBlocks;
switchConditionWasUniform = ci->savedSwitchConditionWasUniform;
}
else if (ci->IsLoop() || ci->IsForeach()) {
breakTarget = ci->savedBreakTarget;
continueTarget = ci->savedContinueTarget;
breakLanesPtr = ci->savedBreakLanesPtr;
continueLanesPtr = ci->savedContinueLanesPtr;
loopMask = ci->savedLoopMask;
}
else {
Assert(ci->IsIf());
// nothing to do
}
return ci;
}

ctx.h

@@ -39,6 +39,7 @@
#define ISPC_CTX_H 1
#include "ispc.h"
#include <map>
#include <llvm/InstrTypes.h>
#include <llvm/Instructions.h>
#include <llvm/Analysis/DIBuilder.h>
@@ -160,10 +161,8 @@ public:
void EndLoop();
/** Indicates that code generation for a 'foreach' or 'foreach_tiled'
loop is about to start. The provided basic block pointer indicates
where control flow should go if a 'continue' statement is executed
in the loop. */
void StartForeach(llvm::BasicBlock *continueTarget);
loop is about to start. */
void StartForeach();
void EndForeach();
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
@@ -186,12 +185,62 @@ public:
previous iteration. */
void RestoreContinuedLanes();
/** Indicates that code generation for a "switch" statement is about to
start. isUniform indicates whether the "switch" value is uniform,
and bbAfterSwitch gives the basic block immediately following the
"switch" statement. (For example, if the switch condition is
uniform, we jump here upon executing a "break" statement.) */
void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
/** Indicates the end of code generation for a "switch" statement. */
void EndSwitch();
/** Emits code for a "switch" statement in the program.
@param expr Gives the value of the expression after the "switch"
@param defaultBlock Basic block to execute for the "default" case. This
should be NULL if there is no "default" label inside
the switch.
@param caseBlocks vector that stores the mapping from label values
after "case" statements to basic blocks corresponding
to the "case" labels.
@param nextBlocks For each basic block for a "case" or "default"
label, this gives the basic block for the
immediately-following "case" or "default" label (or
the basic block after the "switch" statement for the
last label).
*/
void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);
/** Generates code for a "default" label after a "switch" statement.
The checkMask parameter indicates whether additional code should be
generated to check whether the execution mask is all off after
the default label (in which case a jump to the following label will
be issued). */
void EmitDefaultLabel(bool checkMask, SourcePos pos);
/** Generates code for a "case" label after a "switch" statement. See
the documentation for EmitDefaultLabel() for discussion of the
checkMask parameter. */
void EmitCaseLabel(int value, bool checkMask, SourcePos pos);
/** Returns the current number of nested levels of 'varying' control
flow */
int VaryingCFDepth() const;
bool InForeachLoop() const;
void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
/** Step through the code and find label statements; create a basic
block for each one, so that subsequent calls to
GetLabeledBasicBlock() return the corresponding basic block. */
void InitializeLabelMap(Stmt *code);
/** If there is a label in the function with the given name, return the
new basic block that it starts. */
llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
/** Called to generate code for 'return' statement; value is the
expression in the return statement (if non-NULL), and
doCoherenceCheck indicates whether instructions should be generated
@@ -211,6 +260,10 @@ public:
i1 value that indicates if all of the mask lanes are on. */
llvm::Value *All(llvm::Value *mask);
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i1 value that indicates if all of the mask lanes are off. */
llvm::Value *None(llvm::Value *mask);
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i32 value wherein the i'th bit is on if and only if the i'th lane
of the mask is on. */
@@ -446,6 +499,9 @@ private:
/** Pointer to the Function for which we're currently generating code. */
Function *function;
/** LLVM function representation for the current function. */
llvm::Function *llvmFunction;
/** The basic block into which we add any alloca instructions that need
to go at the very start of the function. */
llvm::BasicBlock *allocaBlock;
@@ -479,10 +535,10 @@ private:
the loop. */
llvm::Value *loopMask;
/** If currently in a loop body, this is a pointer to memory to store a
mask value that represents which of the lanes have executed a
'break' statement. If we're not in a loop body, this should be
NULL. */
/** If currently in a loop body or switch statement, this is a pointer
to memory to store a mask value that represents which of the lanes
have executed a 'break' statement. If we're not in a loop body or
switch, this should be NULL. */
llvm::Value *breakLanesPtr;
/** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
@@ -490,16 +546,49 @@ private:
'continue' statement. */
llvm::Value *continueLanesPtr;
/** If we're inside a loop, this gives the basic block immediately
after the current loop, which we will jump to if all of the lanes
have executed a break statement or are otherwise done with the
loop. */
/** If we're inside a loop or switch statement, this gives the basic
block immediately after the current loop or switch, which we will
jump to if all of the lanes have executed a break statement or are
otherwise done with it. */
llvm::BasicBlock *breakTarget;
/** If we're inside a loop, this gives the block to jump to if all of
the running lanes have executed a 'continue' statement. */
llvm::BasicBlock *continueTarget;
/** @name Switch statement state
These variables store various state that's active when we're
generating code for a switch statement. They should all be NULL
outside of a switch.
@{
*/
/** The value of the expression used to determine which of the "case"
statements after the "switch" to execute. */
llvm::Value *switchExpr;
/** Map from case label numbers to the basic block that will hold code
for that case. */
const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;
/** The basic block of code to run for the "default" label in the
switch statement. */
llvm::BasicBlock *defaultBlock;
/** For each basic block for the code for cases (and the default label,
if present), this map gives the basic block for the immediately
following case/default label. */
const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;
/** Records whether the switch condition was uniform; this is a
distinct notion from whether the switch represents uniform or
varying control flow; we may have varying control flow from a
uniform switch condition if there is a 'break' inside the switch
that's under varying control flow. */
bool switchConditionWasUniform;
/** @} */
/** A pointer to memory that records which of the program instances
have executed a 'return' statement (and are thus truly done running
any more instructions in this function). */
@@ -537,9 +626,13 @@ private:
tasks launched from the current function. */
llvm::Value *launchGroupHandlePtr;
std::map<std::string, llvm::BasicBlock *> labelMap;
static bool initLabelBBlocks(ASTNode *node, void *data);
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
static void addGSMetadata(llvm::Value *inst, SourcePos pos);
bool ifsInLoopAllUniform() const;
bool ifsInCFAllUniform(int cfType) const;
void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
@@ -547,6 +640,11 @@ private:
const Type *ptrType);
void restoreMaskGivenReturns(llvm::Value *oldMask);
void addSwitchMaskCheck(llvm::Value *mask);
bool inSwitchStatement() const;
llvm::Value *getMaskAtSwitchEntry();
CFInfo *popCFState();
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
llvm::Value *mask);

decl.cpp

@@ -46,6 +46,18 @@
#include <stdio.h>
#include <set>
static void
lPrintTypeQualifiers(int typeQualifiers) {
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
}
/** Given a Type and a set of type qualifiers, apply the type qualifiers to
the type, returning the type that is the result.
*/
@@ -54,6 +66,16 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
if (type == NULL)
return NULL;
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
type = type->GetAsConstType();
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
type = type->GetAsUniformType();
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
type = type->GetAsVaryingType();
else
type = type->GetAsUnboundVariabilityType();
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
@@ -64,29 +86,13 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
type = unsignedType;
else
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
type->GetString().c_str());
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
}
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
"\"%s\".", type->GetString().c_str());
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
type = type->GetAsConstType();
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
type = type->GetAsUniformType();
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
type = type->GetAsVaryingType();
else {
// otherwise, structs are uniform by default and everything
// else is varying by default
if (dynamic_cast<const StructType *>(type->GetBaseType()) != NULL)
type = type->GetAsUniformType();
else
type = type->GetAsVaryingType();
}
"\"%s\".",
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
return type;
}
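The net effect on declarations can be seen in a hedged ispc sketch; the final case is the new "unbound" variability that ResolveUnboundVariability() later pins down:
::
    const uniform int a = 0;  // 'const' applied, then 'uniform'
    varying float b;          // explicitly varying
    float c;                  // variability left unbound here and
                              // typically resolved to varying later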
@@ -138,21 +144,14 @@ lGetStorageClassName(StorageClass storageClass) {
void
DeclSpecs::Print() const {
printf("%s ", lGetStorageClassName(storageClass));
printf("Declspecs: [%s ", lGetStorageClassName(storageClass));
if (soaWidth > 0) printf("soa<%d> ", soaWidth);
if (typeQualifiers & TYPEQUAL_INLINE) printf("inline ");
if (typeQualifiers & TYPEQUAL_CONST) printf("const ");
if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
printf("%s", baseType->GetString().c_str());
lPrintTypeQualifiers(typeQualifiers);
printf("base type: %s", baseType->GetString().c_str());
if (vectorSize > 0) printf("<%d>", vectorSize);
printf("]");
}
@@ -192,19 +191,46 @@ Declarator::GetSymbol() const {
void
Declarator::Print() const {
Declarator::Print(int indent) const {
printf("%*cdeclarator: [", indent, ' ');
pos.Print();
lPrintTypeQualifiers(typeQualifiers);
Symbol *sym = GetSymbol();
if (sym != NULL)
printf("%s", sym->name.c_str());
else
printf("(null symbol)");
printf(", array size = %d", arraySize);
printf(", kind = ");
switch (kind) {
case DK_BASE: printf("base"); break;
case DK_POINTER: printf("pointer"); break;
case DK_REFERENCE: printf("reference"); break;
case DK_ARRAY: printf("array"); break;
case DK_FUNCTION: printf("function"); break;
default: FATAL("Unhandled declarator kind");
}
if (initExpr != NULL) {
printf(" = (");
initExpr->Print();
printf(")");
}
pos.Print();
if (functionParams.size() > 0) {
for (unsigned int i = 0; i < functionParams.size(); ++i) {
printf("\n%*cfunc param %d:\n", indent, ' ', i);
functionParams[i]->Print(indent+4);
}
}
if (child != NULL)
child->Print(indent + 4);
printf("]\n");
}
@@ -235,11 +261,13 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
Assert(d != NULL);
for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
Declaration *pdecl = d->functionParams[i];
Assert(pdecl->declarators.size() == 1);
funArgs->push_back(pdecl->declarators[0]->GetSymbol());
Symbol *sym = d->GetSymbolForFunctionParameter(i);
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
funArgs->push_back(sym);
}
funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
return funSym;
}
@@ -258,6 +286,12 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
if (kind != DK_FUNCTION && isTask)
Error(pos, "\"task\" qualifier illegal in variable declaration.");
Type::Variability variability = Type::Unbound;
if (hasUniformQual)
variability = Type::Uniform;
else if (hasVaryingQual)
variability = Type::Varying;
const Type *type = base;
switch (kind) {
case DK_BASE:
@@ -268,7 +302,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
return type;
case DK_POINTER:
type = new PointerType(type, hasUniformQual, isConst);
type = new PointerType(type, variability, isConst);
if (child != NULL)
return child->GetType(type, ds);
else
@@ -316,25 +350,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
for (unsigned int i = 0; i < functionParams.size(); ++i) {
Declaration *d = functionParams[i];
char buf[32];
Symbol *sym;
if (d->declarators.size() == 0) {
// function declaration like foo(float), w/o a name for
// the parameter
sprintf(buf, "__anon_parameter_%d", i);
sym = new Symbol(buf, pos);
sym->type = d->declSpecs->GetBaseType(pos);
}
else {
sym = d->declarators[0]->GetSymbol();
if (sym == NULL) {
// Handle more complex anonymous declarations like
// float (float **).
sprintf(buf, "__anon_parameter_%d", i);
sym = new Symbol(buf, d->declarators[0]->pos);
sym->type = d->declarators[0]->GetType(d->declSpecs);
}
}
Symbol *sym = GetSymbolForFunctionParameter(i);
if (d->declSpecs->storageClass != SC_NONE)
Error(sym->pos, "Storage class \"%s\" is illegal in "
@@ -381,8 +397,8 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
}
if (decl->initExpr != NULL &&
(decl->initExpr = decl->initExpr->TypeCheck()) != NULL &&
(decl->initExpr = decl->initExpr->Optimize()) != NULL &&
(decl->initExpr = TypeCheck(decl->initExpr)) != NULL &&
(decl->initExpr = Optimize(decl->initExpr)) != NULL &&
(init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
Error(decl->initExpr->pos, "Default value for parameter "
"\"%s\" must be a compile-time constant.",
@@ -397,7 +413,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
Error(pos, "No return type provided in function declaration.");
return NULL;
}
bool isExported = ds && (ds->storageClass == SC_EXPORT);
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
@@ -418,9 +434,10 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
return NULL;
}
Type *functionType =
new FunctionType(returnType, args, pos, argNames, argDefaults,
const Type *functionType =
new FunctionType(returnType, args, argNames, argDefaults,
argPos, isTask, isExported, isExternC);
functionType = functionType->ResolveUnboundVariability(Type::Varying);
return child->GetType(functionType, ds);
}
default:
@@ -461,6 +478,35 @@ Declarator::GetType(DeclSpecs *ds) const {
}
Symbol *
Declarator::GetSymbolForFunctionParameter(int paramNum) const {
Assert(paramNum < (int)functionParams.size());
Declaration *d = functionParams[paramNum];
char buf[32];
Symbol *sym;
if (d->declarators.size() == 0) {
// function declaration like foo(float), w/o a name for
// the parameter
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, pos);
sym->type = d->declSpecs->GetBaseType(pos);
}
else {
Assert(d->declarators.size() == 1);
sym = d->declarators[0]->GetSymbol();
if (sym == NULL) {
// Handle more complex anonymous declarations like
// float (float **).
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, d->declarators[0]->pos);
sym->type = d->declarators[0]->GetType(d->declSpecs);
}
}
return sym;
}
///////////////////////////////////////////////////////////////////////////
// Declaration
@@ -489,19 +535,15 @@ Declaration::GetVariableDeclarations() const {
std::vector<VariableDeclaration> vars;
for (unsigned int i = 0; i < declarators.size(); ++i) {
if (declarators[i] == NULL)
continue;
Declarator *decl = declarators[i];
if (decl == NULL)
// Ignore earlier errors
continue;
Symbol *sym = decl->GetSymbol();
if (dynamic_cast<const FunctionType *>(sym->type) != NULL) {
// function declaration
m->symbolTable->AddFunction(sym);
}
else {
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
m->symbolTable->AddVariable(sym);
vars.push_back(VariableDeclaration(sym, decl->initExpr));
}
@@ -511,16 +553,36 @@ Declaration::GetVariableDeclarations() const {
void
Declaration::Print() const {
printf("Declaration: specs [");
declSpecs->Print();
printf("], declarators [");
for (unsigned int i = 0 ; i < declarators.size(); ++i) {
declarators[i]->Print();
printf("%s", (i == declarators.size() - 1) ? "]" : ", ");
Declaration::DeclareFunctions() {
Assert(declSpecs->storageClass != SC_TYPEDEF);
for (unsigned int i = 0; i < declarators.size(); ++i) {
Declarator *decl = declarators[i];
if (decl == NULL)
// Ignore earlier errors
continue;
Symbol *sym = decl->GetSymbol();
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
continue;
bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
m->AddFunctionDeclaration(sym, isInline);
}
}
void
Declaration::Print(int indent) const {
printf("%*cDeclaration: specs [", indent, ' ');
declSpecs->Print();
printf("], declarators:\n");
for (unsigned int i = 0 ; i < declarators.size(); ++i)
declarators[i]->Print(indent+4);
}
///////////////////////////////////////////////////////////////////////////
void
@@ -539,7 +601,7 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
DeclSpecs ds(type);
if (type->IsUniformType())
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
else
else if (type->IsVaryingType())
ds.typeQualifiers |= TYPEQUAL_VARYING;
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {

decl.h

@@ -153,10 +153,12 @@ public:
declarator and symbols for its arguments in *args. */
Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
Symbol *GetSymbolForFunctionParameter(int paramNum) const;
/** Returns the symbol associated with the declarator. */
Symbol *GetSymbol() const;
void Print() const;
void Print(int indent) const;
/** Position of the declarator in the source program. */
const SourcePos pos;
@@ -199,7 +201,7 @@ public:
Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
Declaration(DeclSpecs *ds, Declarator *d);
void Print() const;
void Print(int indent) const;
/** This method walks through all of the Declarators in a declaration
and returns a fully-initialized Symbol and (possibly) and
@@ -208,6 +210,10 @@ public:
Declarator representation.) */
std::vector<VariableDeclaration> GetVariableDeclarations() const;
/** For any function declarations in the Declaration, add the
declaration to the module. */
void DeclareFunctions();
DeclSpecs *declSpecs;
std::vector<Declarator *> declarators;
};


@@ -1,3 +1,43 @@
=== v1.1.3 === (20 January 2012)
With this release, the language now supports "switch" statements, with the
same semantics and syntax as in C.
This release includes fixes for two important performance related issues:
the quality of code generated for "foreach" statements has been
substantially improved (https://github.com/ispc/ispc/issues/151), and a
performance regression with code for "gathers" that was introduced in
v1.1.2 has been fixed in this release.
A number of other small bugs were fixed in this release as well, including
one where invalid memory would sometimes be incorrectly accessed
(https://github.com/ispc/ispc/issues/160).
Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.
=== v1.1.2 === (9 January 2012)
The major new feature in this release is support for "generic" C++
vectorized output; in other words, ispc can emit C++ code that corresponds
to the vectorized computation that the ispc program represents. See the
examples/intrinsics directory in the ispc distribution for two example
implementations of the set of functions that must be provided to map the
vector calls generated by ispc to target-specific functions.
ispc now has partial support for 'goto' statements; specifically, goto is
allowed when all enclosing control flow statements (if/for/while/do) have
'uniform' test expressions, but not when any of them have 'varying' tests.
A number of improvements have been made to the code generated for gathers
and scatters--one of them (better matching x86's "free" scale by 2/4/8 for
addressing calculations) improved the performance of the noise example by
14%.
Many small bugs have been fixed in this release as well, including issue
numbers 138, 129, 135, 127, 149, and 142.
=== v1.1.1 === (15 December 2011)
This release doesn't include any significant new functionality, but does


@@ -2,11 +2,11 @@
for i in ispc perfguide faq; do
rst2html.py --template=template.txt --link-stylesheet \
--stylesheet-path=css/style.css $i.txt > $i.html
--stylesheet-path=css/style.css $i.rst > $i.html
done
rst2html.py --template=template-perf.txt --link-stylesheet \
--stylesheet-path=css/style.css perf.txt > perf.html
--stylesheet-path=css/style.css perf.rst > perf.html
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
#pdflatex ispc.tex


@@ -1,10 +1,10 @@
=============================================================
Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
=============================================================
=====================================
Frequently Asked Questions About ispc
=====================================
This document includes a number of frequently (and not frequently) asked
questions about ispc, the Intel® SPMD Program Compiler. The source to this
document is in the file ``docs/faq.txt`` in the ``ispc`` source
document is in the file ``docs/faq.rst`` in the ``ispc`` source
distribution.
* Understanding ispc's Output


@@ -56,6 +56,7 @@ Contents:
+ `Basic Command-line Options`_
+ `Selecting The Compilation Target`_
+ `Generating Generic C++ Output`_
+ `Selecting 32 or 64 Bit Addressing`_
+ `The Preprocessor`_
+ `Debugging`_
@@ -98,7 +99,9 @@ Contents:
+ `Control Flow`_
* `Conditional Statements: "if"`_
* `Conditional Statements: "switch"`_
* `Basic Iteration Statements: "for", "while", and "do"`_
* `Unstructured Control Flow: "goto"`_
* `"Coherent" Control Flow Statements: "cif" and Friends`_
* `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_
* `Parallel Iteration with "programIndex" and "programCount"`_
@@ -432,6 +435,65 @@ Intel® SSE2, use ``--target=sse2``. (As with the other options in this
section, see the output of ``ispc --help`` for a full list of supported
targets.)
Generating Generic C++ Output
-----------------------------
In addition to generating object files or assembly output for specific
targets like SSE2, SSE4, and AVX, ``ispc`` provides an option to generate
"generic" C++ output. This
As an example, consider the following simple ``ispc`` program:
::
int foo(int i, int j) {
return (i < 0) ? 0 : i + j;
}
If this program is compiled with the following command:
::
ispc foo.ispc --emit-c++ --target=generic-4 -o foo.cpp
Then ``foo()`` is compiled to the following C++ code (after various
automatically-generated boilerplate code):
::
__vec4_i32 foo(__vec4_i32 i_llvm_cbe, __vec4_i32 j_llvm_cbe,
__vec4_i1 __mask_llvm_cbe) {
return (__select((__signed_less_than(i_llvm_cbe,
__vec4_i32 (0u, 0u, 0u, 0u))),
__vec4_i32 (0u, 0u, 0u, 0u),
(__add(i_llvm_cbe, j_llvm_cbe))));
}
Note that the original computation has been expressed in terms of a number
of vector types (e.g. ``__vec4_i32`` for a 4-wide vector of 32-bit integers
and ``__vec4_i1`` for a 4-wide vector of boolean values) and in terms of
vector operations on these types like ``__add()`` and ``__select()``.
You are then free to provide your own implementations of these types and
functions. For example, you might want to target a specific vector ISA, or
you might want to instrument these functions for performance measurements.
There is an example implementation of 4-wide variants of the required
functions, suitable for use with the ``generic-4`` target in the file
``examples/intrinsics/sse4.h``, and there is an example straightforward C
implementation of the 16-wide variants for the ``generic-16`` target in the
file ``examples/intrinsics/generic-16.h``. There is not yet comprehensive
documentation of these types and the functions that must be provided for
them when the C++ target is used, but a review of those two files should
provide the basic context.
If you are using C++ source emission, you may also find the
``--c++-include-file=<filename>`` command line argument useful; it adds an
``#include`` statement with the given filename at the top of the emitted
C++ file; this can be used to easily include specific implementations of
the vector types and functions.
Selecting 32 or 64 Bit Addressing
---------------------------------
@@ -1080,7 +1142,7 @@ in C:
* Expression syntax and basic types
* Syntax for variable declarations
* Control flow structures: if, for, while, do
* Control flow structures: ``if``, ``for``, ``while``, ``do``, and ``switch``.
* Pointers, including function pointers, ``void *``, and C's array/pointer
duality (arrays are converted to pointers when passed to functions, etc.)
* Structs and arrays
@@ -1124,7 +1186,7 @@ but are likely to be supported in future releases:
``int64`` types
* Character constants
* String constants and arrays of characters as strings
* ``switch`` and ``goto`` statements
* ``goto`` statements are partially supported (see `Unstructured Control Flow: "goto"`_)
* ``union`` types
* Bitfield members of ``struct`` types
* Variable numbers of arguments to functions
@@ -1185,6 +1247,18 @@ Here are three ways of specifying the integer value "15":
int fifteen_hex = 0xf;
int fifteen_binary = 0b1111;
A number of suffixes can be provided with integer numeric constants.
First, "u" denotes that the constant is unsigned, and "ll" denotes a 64-bit
integer constant (while "l" denotes a 32-bit integer constant). It is also
possible to denote units of 1024, 1024*1024, or 1024*1024*1024 with the
SI-inspired suffixes "k", "M", and "G" respectively:
::
int two_kb = 2k; // 2048
int two_megs = 2M; // 2 * 1024 * 1024
int one_gig = 1G; // 1024 * 1024 * 1024
Floating-point constants can be specified in one of three ways. First,
they may be a sequence of zero or more digits from 0 to 9, followed by a
period, followed by zero or more digits from 0 to 9. (There must be at
@@ -1920,6 +1994,31 @@ executes if the condition is false.
else
x *= 2.;
Conditional Statements: "switch"
--------------------------------
The ``switch`` conditional statement is also available, again with the same
behavior as in C; the expression used in the ``switch`` must be of integer
type (but it can be uniform or varying). As in C, if there is no ``break``
statement at the end of the code for a given case, execution "falls
through" to the following case. These features are demonstrated in the
code below.
::
int x = ...;
switch (x) {
case 0:
case 1:
foo(x);
/* fall through */
case 5:
x = 0;
break;
default:
x *= x;
}
Basic Iteration Statements: "for", "while", and "do"
----------------------------------------------------
@@ -1945,6 +2044,37 @@ one of them executes a ``continue`` statement, other program instances
executing code in the loop body that didn't execute the ``continue`` will
be unaffected by it.
Unstructured Control Flow: "goto"
---------------------------------
``goto`` statements are allowed in ``ispc`` programs under limited
circumstances; specifically, only when the compiler can determine that if
any program instance executes a ``goto`` statement, then all of the program
instances will be running at that statement, such that all will follow the
``goto``.
Put another way: it's illegal for there to be "varying" control flow
statements in scopes that enclose a ``goto`` statement. An error is issued
if a ``goto`` is used in this situation.
The syntax for adding labels to ``ispc`` programs and jumping to them with
``goto`` is the same as in C. The following code shows a ``goto`` based
equivalent of a ``for`` loop where the induction variable ``i`` goes from
zero to ten.
::
uniform int i = 0;
check:
if (i > 10)
goto done;
// loop body
++i;
goto check;
done:
// ...
"Coherent" Control Flow Statements: "cif" and Friends
-----------------------------------------------------


@@ -45,8 +45,7 @@
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen documentation of
<tt>ispc</tt> source code</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>


@@ -45,8 +45,7 @@
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen documentation of
<tt>ispc</tt> source code</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>


@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER = 1.1.1
PROJECT_NUMBER = 1.1.3
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.


@@ -1,39 +1,7 @@
ARCH = $(shell uname)
EXAMPLE=ao
CPP_SRC=ao.cpp ao_serial.cpp
ISPC_SRC=ao.ispc
ISPC_TARGETS=sse2,sse4,avx
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4,avx --arch=x86-64
ISPC_OBJS=objs/ao_ispc.o objs/ao_ispc_sse2.o objs/ao_ispc_sse4.o \
objs/ao_ispc_avx.o
OBJS=objs/ao.o objs/ao_serial.o $(ISPC_OBJS) $(TASK_OBJ)
default: ao
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ ao
ao: dirs $(OBJS) $(TASK_OBJ)
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/ao.o: objs/ao_ispc.h
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
include ../common.mk


@@ -82,7 +82,7 @@ static inline void vnormalize(vec &v) {
}
static inline void
static void
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n);
@@ -124,7 +124,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
}
static inline void
static void
orthoBasis(vec basis[3], vec n) {
basis[2] = n;
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
@@ -147,7 +147,7 @@ orthoBasis(vec basis[3], vec n) {
}
static inline float
static float
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
RNGState &rngstate) {
float eps = 0.0001f;


@@ -14,13 +14,13 @@ dirs:
clean:
/bin/rm -rf objs *~ ao
ao: dirs objs/ao.o objs/instrument.o objs/ao_ispc.o
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread
ao: objs/ao.o objs/instrument.o objs/ao_ispc.o ../tasksys.cpp
$(CXX) $(CXXFLAGS) -o $@ $^ -lm -lpthread
objs/%.o: %.cpp
objs/%.o: %.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/ao.o: objs/ao_ispc.h
objs/%_ispc.h objs/%_ispc.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
objs/%_ispc.h objs/%_ispc.o: %.ispc dirs
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_instrumented_ispc.h

examples/common.mk (new file)

@@ -0,0 +1,59 @@
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=tasksys.o
CXX=g++
CXXFLAGS=-Iobjs/ -O2 -m64
LIBS=-lm $(TASK_LIB) -lstdc++
ISPC=ispc -O2 --arch=x86-64 $(ISPC_FLAGS)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ))
default: $(EXAMPLE)
all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
objs/%.cpp objs/%.o objs/%.h: dirs
clean:
/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16
$(EXAMPLE): $(CPP_OBJS) $(ISPC_OBJS)
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/%.o: %.cpp dirs $(ISPC_HEADER)
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)
$(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h
objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp
$(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@
$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC)
$(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h
objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
$(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@
$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)


@@ -1,38 +1,8 @@
ARCH = $(shell uname)
EXAMPLE=deferred_shading
CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
ISPC_SRC=kernels.ispc
ISPC_TARGETS=sse2,sse4-x2,avx-x2
ISPC_FLAGS=--opt=fast-math
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64 --math-lib=fast
OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/kernels_ispc_sse2.o \
objs/kernels_ispc_sse4.o objs/kernels_ispc_avx.o \
objs/dynamic_c.o objs/dynamic_cilk.o
default: deferred_shading
.PHONY: dirs clean
.PRECIOUS: objs/kernels_ispc.h
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ deferred_shading
deferred_shading: dirs $(OBJS) $(TASK_OBJ)
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
include ../common.mk

(File diff suppressed because it is too large.)

examples/intrinsics/sse4.h (new file)

(File diff suppressed because it is too large.)


@@ -1,30 +1,7 @@
CXX=g++ -m64
CXXFLAGS=-Iobjs/ -O3 -Wall
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
EXAMPLE=mandelbrot
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
ISPC_SRC=mandelbrot.ispc
ISPC_TARGETS=sse2,sse4-x2,avx-x2
default: mandelbrot
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ mandelbrot
OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc_sse2.o \
objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o \
objs/mandelbrot_ispc.o
mandelbrot: dirs $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/mandelbrot.o: objs/mandelbrot_ispc.h
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
include ../common.mk


@@ -1,39 +1,7 @@
ARCH = $(shell uname)
EXAMPLE=mandelbrot
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
ISPC_SRC=mandelbrot.ispc
ISPC_TARGETS=sse2,sse4-x2,avx-x2
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o $(TASK_OBJ) \
objs/mandelbrot_ispc.o objs/mandelbrot_ispc_sse2.o \
objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o
default: mandelbrot
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ mandelbrot
mandelbrot: dirs $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/mandelbrot.o: objs/mandelbrot_ispc.h
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
include ../common.mk


@@ -1,29 +1,7 @@
CXX=g++ -m64
CXXFLAGS=-Iobjs/ -O3 -Wall
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4,avx-x2 --arch=x86-64
EXAMPLE=noise
CPP_SRC=$(EXAMPLE).cpp $(EXAMPLE)_serial.cpp
ISPC_SRC=noise.ispc
ISPC_TARGETS=sse2,sse4,avx-x2
OBJS=objs/noise.o objs/noise_serial.o objs/noise_ispc.o objs/noise_ispc_sse2.o \
objs/noise_ispc_sse4.o objs/noise_ispc_avx.o
default: noise
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ noise
noise: dirs $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/noise.o: objs/noise_ispc.h
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
include ../common.mk


@@ -1,38 +1,7 @@
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
EXAMPLE=options
CPP_SRC=options.cpp options_serial.cpp
ISPC_SRC=options.ispc
ISPC_TARGETS=sse2,sse4-x2,avx-x2
CXX=g++ -m64
CXXFLAGS=-Iobjs/ -g -Wall
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
OBJS=objs/options.o objs/options_serial.o objs/options_ispc.o \
objs/options_ispc_sse2.o objs/options_ispc_sse4.o \
objs/options_ispc_avx.o $(TASK_OBJ)
default: options
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ options
options: dirs $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/options.o: objs/options_ispc.h options_defs.h
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc options_defs.h
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
include ../common.mk


@@ -1,38 +1,7 @@
ARCH = $(shell uname)
EXAMPLE=rt
CPP_SRC=rt.cpp rt_serial.cpp
ISPC_SRC=rt.ispc
ISPC_TARGETS=sse2,sse4-x2,avx
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
OBJS=objs/rt.o objs/rt_serial.o $(TASK_OBJ) objs/rt_ispc.o objs/rt_ispc_sse2.o \
objs/rt_ispc_sse4.o objs/rt_ispc_avx.o
default: rt
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ rt
rt: dirs $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/rt.o: objs/rt_ispc.h
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
include ../common.mk

View File

@@ -104,8 +104,8 @@ static void generateRay(uniform const float raster2camera[4][4],
}
static inline bool BBoxIntersect(const uniform float bounds[2][3],
const Ray &ray) {
static bool BBoxIntersect(const uniform float bounds[2][3],
const Ray &ray) {
uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
float t0 = ray.mint, t1 = ray.maxt;
@@ -143,7 +143,7 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3],
static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
static bool TriIntersect(const Triangle &tri, Ray &ray) {
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };


@@ -1,39 +1,7 @@
ARCH = $(shell uname)
EXAMPLE=stencil
CPP_SRC=stencil.cpp stencil_serial.cpp
ISPC_SRC=stencil.ispc
ISPC_TARGETS=sse2,sse4-x2,avx-x2
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
OBJS=objs/stencil.o objs/stencil_serial.o $(TASK_OBJ) objs/stencil_ispc.o \
objs/stencil_ispc_sse2.o objs/stencil_ispc_sse4.o \
objs/stencil_ispc_avx.o
default: stencil
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ stencil
stencil: dirs $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/stencil.o: objs/stencil_ispc.h
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
include ../common.mk

View File

@@ -1,38 +1,7 @@
ARCH = $(shell uname)
EXAMPLE=volume
CPP_SRC=volume.cpp volume_serial.cpp
ISPC_SRC=volume.ispc
ISPC_TARGETS=sse2,sse4-x2,avx
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \
objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o objs/volume_ispc_avx.o
default: volume
.PHONY: dirs clean
dirs:
/bin/mkdir -p objs/
clean:
/bin/rm -rf objs *~ volume
volume: dirs $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/volume.o: objs/volume_ispc.h
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
include ../common.mk
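
Each of the example Makefiles above collapses from an explicit set of rules down to a few variable definitions plus an include of ../common.mk. The shared common.mk itself is not part of this diff; the following is a hypothetical sketch of the rules it presumably centralizes, reconstructed from the variables the examples set and the rules being deleted (the real file may differ, e.g. in how it names the per-target _ispc_sse2/_sse4/_avx objects):

# Hypothetical sketch of the shared examples/common.mk -- not the actual
# file from the repository.  Assumed to consume the variables each example
# sets (EXAMPLE, CPP_SRC, ISPC_SRC, ISPC_TARGETS, TASK_CXX, TASK_LIB).
CXX = g++
CXXFLAGS = -Iobjs/ -O3 -Wall -m64
ISPC = ispc
ISPCFLAGS = -O2 --target=$(ISPC_TARGETS) --arch=x86-64

TASK_OBJ = $(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
OBJS = $(addprefix objs/, $(CPP_SRC:.cpp=.o)) \
       $(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o)) $(TASK_OBJ)

default: $(EXAMPLE)

.PHONY: dirs clean
dirs:
	/bin/mkdir -p objs/
clean:
	/bin/rm -rf objs *~ $(EXAMPLE)

$(EXAMPLE): dirs $(OBJS)
	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)

objs/%.o: %.cpp
	$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
	$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%_ispc.h objs/%_ispc.o: %.ispc
	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h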

examples/volume/volume.ispc

@@ -129,8 +129,8 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
}
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
uniform float density[], uniform int nVoxels[3]) {
static float Density(float3 Pobj, float3 pMin, float3 pMax,
uniform float density[], uniform int nVoxels[3]) {
if (!Inside(Pobj, pMin, pMax))
return 0;
// Compute voxel coordinates and offsets for _Pobj_

expr.cpp

@@ -36,12 +36,22 @@
*/
#include "expr.h"
#include "ast.h"
#include "type.h"
#include "sym.h"
#include "ctx.h"
#include "module.h"
#include "util.h"
#include "llvmutil.h"
#ifndef _MSC_VER
#include <inttypes.h>
#endif
#ifndef PRId64
#define PRId64 "lld"
#endif
#ifndef PRIu64
#define PRIu64 "llu"
#endif
#include <list>
#include <set>
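
The block above replaces platform #ifdefs with the C99 <inttypes.h> format macros, defining fallbacks for compilers (notably MSVC) that lack the header; the ConstExpr::Print() hunk further down is the consumer. A self-contained illustration of the idiom (not ispc code):

// Standalone example of the PRId64/PRIu64 idiom; not part of ispc.
#define __STDC_FORMAT_MACROS 1  // required by some pre-C++11 toolchains
#include <cinttypes>
#include <cstdio>

int main() {
    int64_t  i = -(1ll << 40);
    uint64_t u = 1ull << 40;
    // The macros expand to the platform's correct length modifier
    // ("ld" on LP64 Linux, "lld" on Win64), so no #ifdef is needed.
    std::printf("%" PRId64 " %" PRIu64 "\n", i, u);
    return 0;
}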
@@ -142,9 +152,9 @@ lArrayToPointer(Expr *expr) {
Expr *zero = new ConstExpr(AtomicType::UniformInt32, 0, expr->pos);
Expr *index = new IndexExpr(expr, zero, expr->pos);
Expr *addr = new AddressOfExpr(index, expr->pos);
addr = addr->TypeCheck();
addr = TypeCheck(addr);
Assert(addr != NULL);
addr = addr->Optimize();
addr = Optimize(addr);
Assert(addr != NULL);
return addr;
}
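
A pattern that recurs through the rest of this file: calls like addr->TypeCheck() become calls to free functions (::TypeCheck() and ::Optimize(), presumably declared via the new ast.h include above), and the member implementations below stop recursing into, and NULL-checking, their children. That division of labor only works if the free function owns the traversal. A minimal, self-contained sketch of the idea, using a toy node type rather than ispc's real AST classes:

#include <cstddef>
#include <vector>

// Toy stand-in for an AST node; not ispc's actual class hierarchy.
struct Node {
    std::vector<Node *> children;
    virtual Node *TypeCheck() { return this; }  // node-local checking only
    virtual ~Node() { }
};

Node *TypeCheck(Node *node) {
    if (node == NULL)
        return NULL;                   // tolerates NULL, unlike a method call
    for (Node *&child : node->children)
        child = TypeCheck(child);      // children handled first (postorder)
    return node->TypeCheck();          // then the node's own checks
}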
@@ -224,7 +234,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
eltType = eltType->GetAsConstType();
if (Type::Equal(toPointerType,
new PointerType(eltType,
toPointerType->IsUniformType(),
toPointerType->GetVariability(),
toPointerType->IsConstType())))
goto typecast_ok;
else {
@@ -466,7 +476,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
typecast_ok:
if (expr != NULL)
*expr = new TypeCastExpr(toType, *expr, false, pos);
*expr = new TypeCastExpr(toType, *expr, pos);
return true;
}
@@ -638,6 +648,9 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {
static llvm::Value *
lMaskForSymbol(Symbol *baseSym, FunctionEmitContext *ctx) {
if (baseSym == NULL)
return ctx->GetFullMask();
if (dynamic_cast<const PointerType *>(baseSym->type) != NULL ||
dynamic_cast<const ReferenceType *>(baseSym->type) != NULL)
// FIXME: for pointers, we really only want to do this for
@@ -658,10 +671,11 @@ lMaskForSymbol(Symbol *baseSym, FunctionEmitContext *ctx) {
static void
lStoreAssignResult(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
FunctionEmitContext *ctx, Symbol *baseSym) {
Assert(baseSym != NULL &&
Assert(baseSym == NULL ||
baseSym->varyingCFDepth <= ctx->VaryingCFDepth());
if (!g->opt.disableMaskedStoreToStore &&
!g->opt.disableMaskAllOnOptimizations &&
baseSym != NULL &&
baseSym->varyingCFDepth == ctx->VaryingCFDepth() &&
baseSym->storageClass != SC_STATIC &&
dynamic_cast<const ReferenceType *>(baseSym->type) == NULL &&
@@ -843,11 +857,6 @@ UnaryExpr::GetType() const {
Expr *
UnaryExpr::Optimize() {
if (!expr)
return NULL;
expr = expr->Optimize();
ConstExpr *constExpr = dynamic_cast<ConstExpr *>(expr);
// If the operand isn't a constant, then we can't do any optimization
// here...
@@ -932,16 +941,11 @@ UnaryExpr::Optimize() {
Expr *
UnaryExpr::TypeCheck() {
if (expr != NULL)
expr = expr->TypeCheck();
if (expr == NULL)
const Type *type;
if (expr == NULL || (type = expr->GetType()) == NULL)
// something went wrong in type checking...
return NULL;
const Type *type = expr->GetType();
if (type == NULL)
return NULL;
if (op == PreInc || op == PreDec || op == PostInc || op == PostDec) {
if (type->IsConstType()) {
Error(pos, "Can't assign to type \"%s\" on left-hand side of "
@@ -999,7 +1003,7 @@ UnaryExpr::TypeCheck() {
int
UnaryExpr::EstimateCost() const {
return (expr ? expr->EstimateCost() : 0) + COST_SIMPLE_ARITH_LOGIC_OP;
return COST_SIMPLE_ARITH_LOGIC_OP;
}
@@ -1116,7 +1120,8 @@ lEmitBinaryArith(BinaryExpr::Op op, llvm::Value *value0, llvm::Value *value1,
// points to in order to return the difference in elements.
LLVM_TYPE_CONST llvm::Type *llvmElementType =
ptrType->GetBaseType()->LLVMType(g->ctx);
llvm::Value *size = g->target.SizeOf(llvmElementType);
llvm::Value *size = g->target.SizeOf(llvmElementType,
ctx->GetCurrentBasicBlock());
if (ptrType->IsVaryingType())
size = ctx->SmearUniform(size);
@@ -1489,12 +1494,7 @@ lConstFoldBoolBinOp(BinaryExpr::Op op, const bool *v0, const bool *v1,
Expr *
BinaryExpr::Optimize() {
if (arg0 != NULL)
arg0 = arg0->Optimize();
if (arg1 != NULL)
arg1 = arg1->Optimize();
if (!arg0 || !arg1)
if (arg0 == NULL || arg1 == NULL)
return NULL;
ConstExpr *constArg0 = dynamic_cast<ConstExpr *>(arg0);
@@ -1516,10 +1516,10 @@ BinaryExpr::Optimize() {
inv[i] = 1.f / inv[i];
Expr *einv = new ConstExpr(type1, inv, constArg1->pos);
Expr *e = new BinaryExpr(Mul, arg0, einv, pos);
e = e->TypeCheck();
e = ::TypeCheck(e);
if (e == NULL)
return NULL;
return e->Optimize();
return ::Optimize(e);
}
}
@@ -1539,18 +1539,18 @@ BinaryExpr::Optimize() {
ExprList *args = new ExprList(arg1, arg1->pos);
Expr *rcpCall = new FunctionCallExpr(rcpSymExpr, args,
arg1->pos);
rcpCall = rcpCall->TypeCheck();
rcpCall = ::TypeCheck(rcpCall);
if (rcpCall == NULL)
return NULL;
rcpCall = rcpCall->Optimize();
rcpCall = ::Optimize(rcpCall);
if (rcpCall == NULL)
return NULL;
Expr *ret = new BinaryExpr(Mul, arg0, rcpCall, pos);
ret = ret->TypeCheck();
ret = ::TypeCheck(ret);
if (ret == NULL)
return NULL;
return ret->Optimize();
return ::Optimize(ret);
}
else
Warning(pos, "rcp() not found from stdlib. Can't apply "
@@ -1638,11 +1638,6 @@ BinaryExpr::Optimize() {
Expr *
BinaryExpr::TypeCheck() {
if (arg0 != NULL)
arg0 = arg0->TypeCheck();
if (arg1 != NULL)
arg1 = arg1->TypeCheck();
if (arg0 == NULL || arg1 == NULL)
return NULL;
@@ -1905,10 +1900,8 @@ BinaryExpr::TypeCheck() {
int
BinaryExpr::EstimateCost() const {
return ((arg0 ? arg0->EstimateCost() : 0) +
(arg1 ? arg1->EstimateCost() : 0) +
((op == Div || op == Mod) ? COST_COMPLEX_ARITH_OP :
COST_SIMPLE_ARITH_LOGIC_OP));
return (op == Div || op == Mod) ? COST_COMPLEX_ARITH_OP :
COST_SIMPLE_ARITH_LOGIC_OP;
}
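
The EstimateCost() methods in this file get the same treatment: each now reports only its own node's cost. Child costs are presumably accumulated by a free EstimateCost() traversal, matching the switch from code->EstimateCost() to EstimateCost(code) in the func.cpp hunk below. A toy sketch of that assumed walk:

#include <cstddef>
#include <vector>

// Toy stand-in for an AST node; not ispc's actual class hierarchy.
struct CostNode {
    std::vector<CostNode *> children;
    virtual int EstimateCost() const { return 0; }  // this node's cost only
    virtual ~CostNode() { }
};

int EstimateCost(const CostNode *root) {
    if (root == NULL)
        return 0;
    int cost = root->EstimateCost();    // node-local cost
    for (const CostNode *child : root->children)
        cost += EstimateCost(child);    // plus the subtree's
    return cost;
}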
@@ -2037,14 +2030,13 @@ AssignExpr::GetValue(FunctionEmitContext *ctx) const {
ctx->SetDebugPos(pos);
Symbol *baseSym = lvalue->GetBaseSymbol();
// Should be caught during type-checking...
assert(baseSym != NULL);
switch (op) {
case Assign: {
llvm::Value *lv = lvalue->GetLValue(ctx);
if (lv == NULL) {
Assert(m->errorCount > 0);
Error(lvalue->pos, "Left hand side of assignment expression can't "
"be assigned to.");
return NULL;
}
const Type *lvalueType = lvalue->GetLValueType();
@@ -2089,13 +2081,8 @@ AssignExpr::GetValue(FunctionEmitContext *ctx) const {
Expr *
AssignExpr::Optimize() {
if (lvalue)
lvalue = lvalue->Optimize();
if (rvalue)
rvalue = rvalue->Optimize();
if (lvalue == NULL || rvalue == NULL)
return NULL;
return this;
}
@@ -2139,10 +2126,6 @@ lCheckForConstStructMember(SourcePos pos, const StructType *structType,
Expr *
AssignExpr::TypeCheck() {
if (lvalue != NULL)
lvalue = lvalue->TypeCheck();
if (rvalue != NULL)
rvalue = rvalue->TypeCheck();
if (lvalue == NULL || rvalue == NULL)
return NULL;
@@ -2176,13 +2159,13 @@ AssignExpr::TypeCheck() {
}
}
if (lvalue->GetBaseSymbol() == NULL) {
Error(lvalue->pos, "Left hand side of assignment statement can't be "
"assigned to.");
const Type *lhsType = lvalue->GetType();
if (lhsType->IsConstType()) {
Error(lvalue->pos, "Can't assign to type \"%s\" on left-hand side of "
"expression.", lhsType->GetString().c_str());
return NULL;
}
const Type *lhsType = lvalue->GetType();
if (dynamic_cast<const PointerType *>(lhsType) != NULL) {
if (op == AddAssign || op == SubAssign) {
if (PointerType::IsVoidPointer(lhsType)) {
@@ -2216,12 +2199,6 @@ AssignExpr::TypeCheck() {
if (rvalue == NULL)
return NULL;
if (lhsType->IsConstType()) {
Error(pos, "Can't assign to type \"%s\" on left-hand side of "
"expression.", lhsType->GetString().c_str());
return NULL;
}
// Make sure we're not assigning to a struct that has a constant member
const StructType *st = dynamic_cast<const StructType *>(lhsType);
if (st != NULL && lCheckForConstStructMember(pos, st, st))
@@ -2233,15 +2210,12 @@ AssignExpr::TypeCheck() {
int
AssignExpr::EstimateCost() const {
int cost = ((lvalue ? lvalue->EstimateCost() : 0) +
(rvalue ? rvalue->EstimateCost() : 0));
cost += COST_ASSIGN;
if (op == Assign)
return cost;
return COST_ASSIGN;
if (op == DivAssign || op == ModAssign)
return cost + COST_COMPLEX_ARITH_OP;
return COST_ASSIGN + COST_COMPLEX_ARITH_OP;
else
return cost + COST_SIMPLE_ARITH_LOGIC_OP;
return COST_ASSIGN + COST_SIMPLE_ARITH_LOGIC_OP;
}
@@ -2412,28 +2386,14 @@ SelectExpr::GetType() const {
Expr *
SelectExpr::Optimize() {
if (test)
test = test->Optimize();
if (expr1)
expr1 = expr1->Optimize();
if (expr2)
expr2 = expr2->Optimize();
if (test == NULL || expr1 == NULL || expr2 == NULL)
return NULL;
return this;
}
Expr *
SelectExpr::TypeCheck() {
if (test)
test = test->TypeCheck();
if (expr1)
expr1 = expr1->TypeCheck();
if (expr2)
expr2 = expr2->TypeCheck();
if (test == NULL || expr1 == NULL || expr2 == NULL)
return NULL;
@@ -2650,144 +2610,130 @@ FunctionCallExpr::GetType() const {
Expr *
FunctionCallExpr::Optimize() {
if (func)
func = func->Optimize();
if (args)
args = args->Optimize();
if (launchCountExpr != NULL)
launchCountExpr = launchCountExpr->Optimize();
if (!func || !args)
if (func == NULL || args == NULL)
return NULL;
return this;
}
Expr *
FunctionCallExpr::TypeCheck() {
if (func != NULL)
func = func->TypeCheck();
if (args != NULL)
args = args->TypeCheck();
if (launchCountExpr != NULL)
launchCountExpr = launchCountExpr->TypeCheck();
if (func == NULL || args == NULL)
return NULL;
if (args != NULL && func != NULL) {
std::vector<const Type *> argTypes;
std::vector<bool> argCouldBeNULL;
for (unsigned int i = 0; i < args->exprs.size(); ++i) {
if (args->exprs[i] == NULL)
return NULL;
const Type *t = args->exprs[i]->GetType();
if (t == NULL)
return NULL;
argTypes.push_back(t);
argCouldBeNULL.push_back(lIsAllIntZeros(args->exprs[i]));
std::vector<const Type *> argTypes;
std::vector<bool> argCouldBeNULL;
for (unsigned int i = 0; i < args->exprs.size(); ++i) {
if (args->exprs[i] == NULL)
return NULL;
const Type *t = args->exprs[i]->GetType();
if (t == NULL)
return NULL;
argTypes.push_back(t);
argCouldBeNULL.push_back(lIsAllIntZeros(args->exprs[i]));
}
FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
if (fse != NULL) {
// Regular function call
if (fse->ResolveOverloads(args->pos, argTypes, &argCouldBeNULL) == false)
return NULL;
func = ::TypeCheck(fse);
if (func == NULL)
return NULL;
const PointerType *pt =
dynamic_cast<const PointerType *>(func->GetType());
const FunctionType *ft = (pt == NULL) ? NULL :
dynamic_cast<const FunctionType *>(pt->GetBaseType());
if (ft == NULL) {
Error(pos, "Valid function name must be used for function call.");
return NULL;
}
FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
if (fse != NULL) {
// Regular function call
if (fse->ResolveOverloads(args->pos, argTypes, &argCouldBeNULL) == false)
if (ft->isTask) {
if (!isLaunch)
Error(pos, "\"launch\" expression needed to call function "
"with \"task\" qualifier.");
if (!launchCountExpr)
return NULL;
func = fse->TypeCheck();
if (func == NULL)
launchCountExpr =
TypeConvertExpr(launchCountExpr, AtomicType::UniformInt32,
"task launch count");
if (launchCountExpr == NULL)
return NULL;
const PointerType *pt =
dynamic_cast<const PointerType *>(func->GetType());
const FunctionType *ft = (pt == NULL) ? NULL :
dynamic_cast<const FunctionType *>(pt->GetBaseType());
if (ft == NULL) {
Error(pos, "Valid function name must be used for function call.");
return NULL;
}
if (ft->isTask) {
if (!isLaunch)
Error(pos, "\"launch\" expression needed to call function "
"with \"task\" qualifier.");
if (!launchCountExpr)
return NULL;
launchCountExpr =
TypeConvertExpr(launchCountExpr, AtomicType::UniformInt32,
"task launch count");
if (launchCountExpr == NULL)
return NULL;
}
else {
if (isLaunch)
Error(pos, "\"launch\" expression illegal with non-\"task\"-"
"qualified function.");
Assert(launchCountExpr == NULL);
}
}
else {
// Call through a function pointer
const Type *fptrType = func->GetType();
if (fptrType == NULL)
return NULL;
if (isLaunch)
Error(pos, "\"launch\" expression illegal with non-\"task\"-"
"qualified function.");
Assert(launchCountExpr == NULL);
}
}
else {
// Call through a function pointer
const Type *fptrType = func->GetType();
if (fptrType == NULL)
return NULL;
Assert(dynamic_cast<const PointerType *>(fptrType) != NULL);
const FunctionType *funcType =
dynamic_cast<const FunctionType *>(fptrType->GetBaseType());
if (funcType == NULL) {
Error(pos, "Must provide function name or function pointer for "
"function call expression.");
return NULL;
}
Assert(dynamic_cast<const PointerType *>(fptrType) != NULL);
const FunctionType *funcType =
dynamic_cast<const FunctionType *>(fptrType->GetBaseType());
if (funcType == NULL) {
Error(pos, "Must provide function name or function pointer for "
"function call expression.");
return NULL;
}
// Make sure we don't have too many arguments for the function
if ((int)argTypes.size() > funcType->GetNumParameters()) {
Error(args->pos, "Too many parameter values provided in "
"function call (%d provided, %d expected).",
(int)argTypes.size(), funcType->GetNumParameters());
return NULL;
}
// It's ok to have too few arguments, as long as the function's
// default parameter values have started by the time we run out
// of arguments
if ((int)argTypes.size() < funcType->GetNumParameters() &&
funcType->GetParameterDefault(argTypes.size()) == NULL) {
Error(args->pos, "Too few parameter values provided in "
"function call (%d provided, %d expected).",
(int)argTypes.size(), funcType->GetNumParameters());
return NULL;
}
// Make sure we don't have too many arguments for the function
if ((int)argTypes.size() > funcType->GetNumParameters()) {
Error(args->pos, "Too many parameter values provided in "
"function call (%d provided, %d expected).",
(int)argTypes.size(), funcType->GetNumParameters());
return NULL;
}
// It's ok to have too few arguments, as long as the function's
// default parameter values have started by the time we run out
// of arguments
if ((int)argTypes.size() < funcType->GetNumParameters() &&
funcType->GetParameterDefault(argTypes.size()) == NULL) {
Error(args->pos, "Too few parameter values provided in "
"function call (%d provided, %d expected).",
(int)argTypes.size(), funcType->GetNumParameters());
return NULL;
}
// Now make sure they can all type convert to the corresponding
// parameter types..
for (int i = 0; i < (int)argTypes.size(); ++i) {
if (i < funcType->GetNumParameters()) {
// make sure it can type convert
const Type *paramType = funcType->GetParameterType(i);
if (CanConvertTypes(argTypes[i], paramType) == false &&
!(argCouldBeNULL[i] == true &&
dynamic_cast<const PointerType *>(paramType) != NULL)) {
Error(args->exprs[i]->pos, "Can't convert argument of "
"type \"%s\" to type \"%s\" for funcion call "
"argument.", argTypes[i]->GetString().c_str(),
paramType->GetString().c_str());
return NULL;
}
// Now make sure they can all type convert to the corresponding
// parameter types..
for (int i = 0; i < (int)argTypes.size(); ++i) {
if (i < funcType->GetNumParameters()) {
// make sure it can type convert
const Type *paramType = funcType->GetParameterType(i);
if (CanConvertTypes(argTypes[i], paramType) == false &&
!(argCouldBeNULL[i] == true &&
dynamic_cast<const PointerType *>(paramType) != NULL)) {
Error(args->exprs[i]->pos, "Can't convert argument of "
"type \"%s\" to type \"%s\" for function call "
"argument.", argTypes[i]->GetString().c_str(),
paramType->GetString().c_str());
return NULL;
}
else
// Otherwise the parameter default saves us. It should
// be there for sure, given the check right above the
// for loop.
Assert(funcType->GetParameterDefault(i) != NULL);
}
else
// Otherwise the parameter default saves us. It should
// be there for sure, given the check right above the
// for loop.
Assert(funcType->GetParameterDefault(i) != NULL);
}
if (fptrType->IsVaryingType() &&
funcType->GetReturnType()->IsUniformType()) {
Error(pos, "Illegal to call a varying function pointer that "
"points to a function with a uniform return type.");
return NULL;
}
if (fptrType->IsVaryingType() &&
funcType->GetReturnType()->IsUniformType()) {
Error(pos, "Illegal to call a varying function pointer that "
"points to a function with a uniform return type.");
return NULL;
}
}
@@ -2799,28 +2745,20 @@ FunctionCallExpr::TypeCheck() {
int
FunctionCallExpr::EstimateCost() const {
int callCost = 0;
if (isLaunch) {
callCost = COST_TASK_LAUNCH;
if (launchCountExpr != NULL)
callCost += launchCountExpr->EstimateCost();
}
if (isLaunch)
return COST_TASK_LAUNCH;
else if (dynamic_cast<FunctionSymbolExpr *>(func) == NULL) {
// it's going through a function pointer
const Type *fpType = func->GetType();
if (fpType != NULL) {
Assert(dynamic_cast<const PointerType *>(fpType) != NULL);
if (fpType->IsUniformType())
callCost = COST_FUNPTR_UNIFORM;
return COST_FUNPTR_UNIFORM;
else
callCost = COST_FUNPTR_VARYING;
return COST_FUNPTR_VARYING;
}
}
else
// regular function call
callCost = COST_FUNCALL;
return (args ? args->EstimateCost() : 0) + callCost;
return COST_FUNCALL;
}
@@ -2858,18 +2796,12 @@ ExprList::GetType() const {
ExprList *
ExprList::Optimize() {
for (unsigned int i = 0; i < exprs.size(); ++i)
if (exprs[i])
exprs[i] = exprs[i]->Optimize();
return this;
}
ExprList *
ExprList::TypeCheck() {
for (unsigned int i = 0; i < exprs.size(); ++i)
if (exprs[i])
exprs[i] = exprs[i]->TypeCheck();
return this;
}
@@ -2943,12 +2875,7 @@ ExprList::GetConstant(const Type *type) const {
int
ExprList::EstimateCost() const {
int cost = 0;
for (unsigned int i = 0; i < exprs.size(); ++i) {
if (exprs[i] != NULL)
cost += exprs[i]->EstimateCost();
}
return cost;
return 0;
}
@@ -3224,29 +3151,19 @@ IndexExpr::GetLValueType() const {
Expr *
IndexExpr::Optimize() {
if (baseExpr)
baseExpr = baseExpr->Optimize();
if (index)
index = index->Optimize();
if (baseExpr == NULL || index == NULL)
return NULL;
return this;
}
Expr *
IndexExpr::TypeCheck() {
if (baseExpr)
baseExpr = baseExpr->TypeCheck();
if (index)
index = index->TypeCheck();
if (!baseExpr || !index || !index->GetType())
if (baseExpr == NULL || index == NULL || index->GetType() == NULL)
return NULL;
const Type *baseExprType = baseExpr->GetType();
if (!baseExprType)
if (baseExprType == NULL)
return NULL;
if (!dynamic_cast<const SequentialType *>(baseExprType->GetReferenceTarget()) &&
@@ -3615,6 +3532,12 @@ VectorMemberExpr::getElementType() const {
MemberExpr *
MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos,
bool derefLValue) {
// FIXME: we need to call TypeCheck() here so that we can call
// e->GetType() in the following. But really we just shouldn't try to
// resolve this now but just have a generic MemberExpr type that
// handles all cases so that this is unnecessary.
e = ::TypeCheck(e);
const Type *exprType;
if (e == NULL || (exprType = e->GetType()) == NULL)
return NULL;
@@ -3779,16 +3702,12 @@ MemberExpr::GetLValueType() const {
Expr *
MemberExpr::TypeCheck() {
if (expr)
expr = expr->TypeCheck();
return expr ? this : NULL;
}
Expr *
MemberExpr::Optimize() {
if (expr)
expr = expr->Optimize();
return expr ? this : NULL;
}
@@ -4630,18 +4549,10 @@ ConstExpr::Print() const {
printf("%f", floatVal[i]);
break;
case AtomicType::TYPE_INT64:
#ifdef ISPC_IS_LINUX
printf("%ld", int64Val[i]);
#else
printf("%lld", int64Val[i]);
#endif
printf("%"PRId64, int64Val[i]);
break;
case AtomicType::TYPE_UINT64:
#ifdef ISPC_IS_LINUX
printf("%lu", uint64Val[i]);
#else
printf("%llu", uint64Val[i]);
#endif
printf("%"PRIu64, uint64Val[i]);
break;
case AtomicType::TYPE_DOUBLE:
printf("%f", doubleVal[i]);
@@ -4660,11 +4571,10 @@ ConstExpr::Print() const {
///////////////////////////////////////////////////////////////////////////
// TypeCastExpr
TypeCastExpr::TypeCastExpr(const Type *t, Expr *e, bool pu, SourcePos p)
TypeCastExpr::TypeCastExpr(const Type *t, Expr *e, SourcePos p)
: Expr(p) {
type = t;
expr = e;
preserveUniformity = pu;
}
@@ -5307,10 +5217,10 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
if (Type::EqualIgnoringConst(arrayAsPtr->GetType(), toPointerType) == false) {
Assert(Type::EqualIgnoringConst(arrayAsPtr->GetType()->GetAsVaryingType(),
toPointerType) == true);
arrayAsPtr = new TypeCastExpr(toPointerType, arrayAsPtr, false, pos);
arrayAsPtr = arrayAsPtr->TypeCheck();
arrayAsPtr = new TypeCastExpr(toPointerType, arrayAsPtr, pos);
arrayAsPtr = ::TypeCheck(arrayAsPtr);
Assert(arrayAsPtr != NULL);
arrayAsPtr = arrayAsPtr->Optimize();
arrayAsPtr = ::Optimize(arrayAsPtr);
Assert(arrayAsPtr != NULL);
}
Assert(Type::EqualIgnoringConst(arrayAsPtr->GetType(), toPointerType));
@@ -5458,6 +5368,7 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
const Type *
TypeCastExpr::GetType() const {
Assert(type->HasUnboundVariability() == false);
return type;
}
@@ -5467,7 +5378,7 @@ lDeconstifyType(const Type *t) {
const PointerType *pt = dynamic_cast<const PointerType *>(t);
if (pt != NULL)
return new PointerType(lDeconstifyType(pt->GetBaseType()),
pt->IsUniformType(), false);
pt->GetVariability(), false);
else
return t->GetAsNonConstType();
}
@@ -5475,21 +5386,19 @@ lDeconstifyType(const Type *t) {
Expr *
TypeCastExpr::TypeCheck() {
if (expr != NULL)
expr = expr->TypeCheck();
if (expr == NULL)
return NULL;
const Type *toType = GetType(), *fromType = expr->GetType();
const Type *toType = type, *fromType = expr->GetType();
if (toType == NULL || fromType == NULL)
return NULL;
if (preserveUniformity == true && fromType->IsUniformType() &&
toType->IsVaryingType()) {
if (toType->HasUnboundVariability() && fromType->IsUniformType()) {
TypeCastExpr *tce = new TypeCastExpr(toType->GetAsUniformType(),
expr, false, pos);
return tce->TypeCheck();
expr, pos);
return ::TypeCheck(tce);
}
type = toType = type->ResolveUnboundVariability(Type::Varying);
fromType = lDeconstifyType(fromType);
toType = lDeconstifyType(toType);
@@ -5546,13 +5455,8 @@ TypeCastExpr::TypeCheck() {
Expr *
TypeCastExpr::Optimize() {
if (expr != NULL)
expr = expr->Optimize();
if (expr == NULL)
return NULL;
ConstExpr *constExpr = dynamic_cast<ConstExpr *>(expr);
if (!constExpr)
if (constExpr == NULL)
// We can't do anything if this isn't a const expr
return this;
@@ -5736,19 +5640,14 @@ ReferenceExpr::GetLValueType() const {
Expr *
ReferenceExpr::Optimize() {
if (expr)
expr = expr->Optimize();
if (expr == NULL)
return NULL;
return this;
}
Expr *
ReferenceExpr::TypeCheck() {
if (expr != NULL)
expr = expr->TypeCheck();
if (expr == NULL)
return NULL;
return this;
@@ -5845,8 +5744,6 @@ DereferenceExpr::GetType() const {
Expr *
DereferenceExpr::TypeCheck() {
if (expr != NULL)
expr = expr->TypeCheck();
if (expr == NULL)
return NULL;
return this;
@@ -5855,8 +5752,6 @@ DereferenceExpr::TypeCheck() {
Expr *
DereferenceExpr::Optimize() {
if (expr != NULL)
expr = expr->Optimize();
if (expr == NULL)
return NULL;
return this;
@@ -5946,16 +5841,12 @@ AddressOfExpr::Print() const {
Expr *
AddressOfExpr::TypeCheck() {
if (expr != NULL)
expr = expr->TypeCheck();
return this;
}
Expr *
AddressOfExpr::Optimize() {
if (expr != NULL)
expr = expr->Optimize();
return this;
}
@@ -5976,6 +5867,8 @@ SizeOfExpr::SizeOfExpr(Expr *e, SourcePos p)
SizeOfExpr::SizeOfExpr(const Type *t, SourcePos p)
: Expr(p), expr(NULL), type(t) {
if (type->HasUnboundVariability())
type = type->ResolveUnboundVariability(Type::Varying);
}
@@ -5990,7 +5883,7 @@ SizeOfExpr::GetValue(FunctionEmitContext *ctx) const {
if (llvmType == NULL)
return NULL;
return g->target.SizeOf(llvmType);
return g->target.SizeOf(llvmType, ctx->GetCurrentBasicBlock());
}
@@ -6016,16 +5909,12 @@ SizeOfExpr::Print() const {
Expr *
SizeOfExpr::TypeCheck() {
if (expr != NULL)
expr = expr->TypeCheck();
return this;
}
Expr *
SizeOfExpr::Optimize() {
if (expr != NULL)
expr = expr->Optimize();
return this;
}
@@ -6144,7 +6033,8 @@ FunctionSymbolExpr::GetType() const {
return NULL;
}
return matchingFunc ? new PointerType(matchingFunc->type, true, true) : NULL;
return matchingFunc ?
new PointerType(matchingFunc->type, Type::Uniform, true) : NULL;
}

expr.h

@@ -314,7 +314,6 @@ public:
std::string identifier;
const SourcePos identifierPos;
protected:
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
SourcePos identifierPos, bool derefLValue);
@@ -493,8 +492,7 @@ private:
probably-different type. */
class TypeCastExpr : public Expr {
public:
TypeCastExpr(const Type *t, Expr *e, bool preserveUniformity,
SourcePos p);
TypeCastExpr(const Type *t, Expr *e, SourcePos p);
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
const Type *GetType() const;
@@ -507,7 +505,6 @@ public:
const Type *type;
Expr *expr;
bool preserveUniformity;
};

func.cpp

@@ -75,14 +75,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
Assert(maskSymbol != NULL);
if (code != NULL) {
if (g->debugPrint) {
fprintf(stderr, "Creating function \"%s\". Initial code:\n",
sym->name.c_str());
code->Print(0);
fprintf(stderr, "---------------------\n");
}
code = code->TypeCheck();
code = TypeCheck(code);
if (code != NULL && g->debugPrint) {
fprintf(stderr, "After typechecking function \"%s\":\n",
@@ -92,7 +85,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
}
if (code != NULL) {
code = code->Optimize();
code = Optimize(code);
if (g->debugPrint) {
fprintf(stderr, "After optimizing function \"%s\":\n",
sym->name.c_str());
@@ -277,7 +270,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
ctx->SetDebugPos(code->pos);
ctx->AddInstrumentationPoint("function entry");
int costEstimate = code->EstimateCost();
int costEstimate = EstimateCost(code);
Debug(code->pos, "Estimated cost for function \"%s\" = %d\n",
sym->name.c_str(), costEstimate);
@@ -288,14 +281,19 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
bool checkMask = (type->isTask == true) ||
((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
if (checkMask && g->opt.disableCoherentControlFlow == false) {
checkMask &= (g->target.maskingIsFree == false);
checkMask &= (g->opt.disableCoherentControlFlow == false);
if (checkMask) {
llvm::Value *mask = ctx->GetFunctionMask();
llvm::Value *allOn = ctx->All(mask);
llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on");
llvm::BasicBlock *bbNotAll = ctx->CreateBasicBlock("not_all_on");
ctx->BranchInst(bbAllOn, bbNotAll, allOn);
// Set up basic blocks for goto targets
ctx->InitializeLabelMap(code);
ctx->BranchInst(bbAllOn, bbNotAll, allOn);
// all on: we've determined dynamically that the mask is all
// on. Set the current mask to "all on" explicitly so that
// codegen for this path can be improved with this knowledge in
@@ -326,14 +324,22 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
// above
ctx->SetCurrentBasicBlock(bbSomeOn);
ctx->SetFunctionMask(mask);
// Set up basic blocks for goto targets again; we want to have
// one set of them for gotos in the 'all on' case, and a
// distinct set for the 'mixed mask' case.
ctx->InitializeLabelMap(code);
code->EmitCode(ctx);
if (ctx->GetCurrentBasicBlock())
ctx->ReturnInst();
}
else
else {
// Set up basic blocks for goto targets
ctx->InitializeLabelMap(code);
// No check, just emit the code
code->EmitCode(ctx);
}
}
if (ctx->GetCurrentBasicBlock()) {
@@ -387,9 +393,8 @@ Function::GenerateIR() {
SourcePos firstStmtPos = sym->pos;
if (code) {
StmtList *sl = dynamic_cast<StmtList *>(code);
if (sl && sl->GetStatements().size() > 0 &&
sl->GetStatements()[0] != NULL)
firstStmtPos = sl->GetStatements()[0]->pos;
if (sl && sl->stmts.size() > 0 && sl->stmts[0] != NULL)
firstStmtPos = sl->stmts[0]->pos;
else
firstStmtPos = code->pos;
}

ispc.cpp

@@ -50,6 +50,7 @@
#include <llvm/Analysis/DIBuilder.h>
#include <llvm/Analysis/DebugInfo.h>
#include <llvm/Support/Dwarf.h>
#include <llvm/Instructions.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/Target/TargetOptions.h>
#include <llvm/Target/TargetData.h>
@@ -129,24 +130,60 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->nativeVectorWidth = 4;
t->vectorWidth = 4;
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "sse2-x2")) {
t->isa = Target::SSE2;
t->nativeVectorWidth = 4;
t->vectorWidth = 8;
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "sse4")) {
t->isa = Target::SSE4;
t->nativeVectorWidth = 4;
t->vectorWidth = 4;
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
t->isa = Target::SSE4;
t->nativeVectorWidth = 4;
t->vectorWidth = 8;
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "generic-4")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 4;
t->vectorWidth = 4;
t->maskingIsFree = true;
t->allOffMaskIsSafe = true;
t->maskBitCount = 1;
}
else if (!strcasecmp(isa, "generic-8")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->maskingIsFree = true;
t->allOffMaskIsSafe = true;
t->maskBitCount = 1;
}
else if (!strcasecmp(isa, "generic-16")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 16;
t->vectorWidth = 16;
t->maskingIsFree = true;
t->allOffMaskIsSafe = true;
t->maskBitCount = 1;
}
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
else if (!strcasecmp(isa, "avx")) {
@@ -154,12 +191,18 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->attributes = "+avx,+popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "avx-x2")) {
t->isa = Target::AVX;
t->nativeVectorWidth = 8;
t->vectorWidth = 16;
t->attributes = "+avx,+popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
#endif // LLVM 3.0+
#if defined(LLVM_3_1svn)
@@ -168,12 +211,18 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->attributes = "+avx2,+popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "avx2-x2")) {
t->isa = Target::AVX2;
t->nativeVectorWidth = 16;
t->vectorWidth = 16;
t->attributes = "+avx2,+popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
#endif // LLVM 3.1
else {
@@ -221,7 +270,7 @@ Target::SupportedTargetISAs() {
#ifdef LLVM_3_1svn
", avx2, avx2-x2"
#endif // LLVM_3_1svn
;
", generic-4, generic-8, generic-16";
}
@@ -300,6 +349,8 @@ Target::GetISAString() const {
return "avx";
case Target::AVX2:
return "avx2";
case Target::GENERIC:
return "generic";
default:
FATAL("Unhandled target in GetISAString()");
}
@@ -307,8 +358,66 @@ Target::GetISAString() const {
}
static bool
lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
if (type->isPrimitiveType() || type->isIntegerTy())
return false;
if (type == LLVMTypes::BoolVectorType ||
type == LLVMTypes::MaskType ||
type == LLVMTypes::Int1VectorType)
return true;
LLVM_TYPE_CONST llvm::ArrayType *at =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
if (at != NULL)
return lGenericTypeLayoutIndeterminate(at->getElementType());
LLVM_TYPE_CONST llvm::PointerType *pt =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
if (pt != NULL)
return false;
LLVM_TYPE_CONST llvm::StructType *st =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
if (st != NULL) {
for (int i = 0; i < (int)st->getNumElements(); ++i)
if (lGenericTypeLayoutIndeterminate(st->getElementType(i)))
return true;
return false;
}
Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));
return true;
}
llvm::Value *
Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type) {
Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
llvm::BasicBlock *insertAtEnd) {
if (isa == Target::GENERIC &&
lGenericTypeLayoutIndeterminate(type)) {
llvm::Value *index[1] = { LLVMInt32(1) };
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "sizeof_gep",
insertAtEnd);
#else
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, &index[0], &index[1],
"sizeof_gep", insertAtEnd);
#endif
if (is32Bit || g->opt.force32BitAddressing)
return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type,
"sizeof_int", insertAtEnd);
else
return new llvm::PtrToIntInst(gep, LLVMTypes::Int64Type,
"sizeof_int", insertAtEnd);
}
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
Assert(td != NULL);
uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
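
Under the generic targets, types containing masks or vectors have no layout the frontend can rely on (see lGenericTypeLayoutIndeterminate() above), so SizeOf() can no longer fold the size to a constant from TargetData; instead it emits the classic null-pointer GEP idiom and lets whatever compiles the generated code resolve it. Roughly the following IR, shown here as an assumed illustration in LLVM 3.0-era syntax, not captured compiler output:

%T = type { <4 x i1>, float }   ; example type whose layout is left open

define i32 @sizeof_T() {
  %sizeof_gep = getelementptr %T* null, i32 1
  %sizeof_int = ptrtoint %T* %sizeof_gep to i32  ; i64 when 64-bit addressing
  ret i32 %sizeof_int                            ; is in use
}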
@@ -320,7 +429,31 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type) {
llvm::Value *
Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element) {
Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
llvm::BasicBlock *insertAtEnd) {
if (isa == Target::GENERIC &&
lGenericTypeLayoutIndeterminate(type) == true) {
llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(element) };
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "offset_gep",
insertAtEnd);
#else
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, &indices[0], &indices[2],
"offset_gep", insertAtEnd);
#endif
if (is32Bit || g->opt.force32BitAddressing)
return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type,
"offset_int", insertAtEnd);
else
return new llvm::PtrToIntInst(gep, LLVMTypes::Int64Type,
"offset_int", insertAtEnd);
}
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
Assert(td != NULL);
LLVM_TYPE_CONST llvm::StructType *structType =

ispc.h

@@ -98,6 +98,8 @@ namespace llvm {
#endif
class ArrayType;
class AST;
class ASTNode;
class AtomicType;
class FunctionEmitContext;
class Expr;
@@ -177,12 +179,14 @@ struct Target {
const char *GetISAString() const;
/** Returns the size of the given type */
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type);
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type,
llvm::BasicBlock *insertAtEnd);
/** Given a structure type and an element number in the structure,
returns a value corresponding to the number of bytes from the start
of the structure where the element is located. */
llvm::Value *StructOffset(LLVM_TYPE_CONST llvm::Type *type,
int element);
int element, llvm::BasicBlock *insertAtEnd);
/** llvm Target object representing this target. */
const llvm::Target *target;
@@ -193,7 +197,7 @@ struct Target {
flexible/performant of them will appear last in the enumerant. Note
also that __best_available_isa() needs to be updated if ISAs are
added or the enumerant values are reordered. */
enum ISA { SSE2, SSE4, AVX, AVX2, NUM_ISAS };
enum ISA { SSE2, SSE4, AVX, AVX2, GENERIC, NUM_ISAS };
/** Instruction set being compiled to. */
ISA isa;
@@ -222,6 +226,23 @@ struct Target {
/** Indicates whether position independent code should be generated. */
bool generatePIC;
/** Is there overhead associated with masking on the target
architecture; e.g. there is on SSE, due to extra blends and the
like, but there isn't with an ISA that supports masking
natively. */
bool maskingIsFree;
/** Is it safe to run code with the mask all if: e.g. on SSE, the fast
gather trick assumes that at least one program instance is running
(so that it can safely assume that the array base pointer is
valid). */
bool allOffMaskIsSafe;
/** How many bits are used to store each element of the mask: e.g. this
is 32 on SSE/AVX, since that matches the HW better, but it's 1 for
the generic target. */
int maskBitCount;
};
@@ -402,6 +423,7 @@ enum {
COST_FUNPTR_UNIFORM = 12,
COST_FUNPTR_VARYING = 24,
COST_GATHER = 8,
COST_GOTO = 4,
COST_LOAD = 2,
COST_REGULAR_BREAK_CONTINUE = 2,
COST_RETURN = 4,
@@ -415,6 +437,8 @@ enum {
COST_VARYING_IF = 3,
COST_UNIFORM_LOOP = 4,
COST_VARYING_LOOP = 6,
COST_UNIFORM_SWITCH = 4,
COST_VARYING_SWITCH = 12,
COST_ASSERT = 8,
CHECK_MASK_AT_FUNCTION_START_COST = 16,

ispc.vcxproj

@@ -13,20 +13,27 @@
<ItemGroup>
<ClCompile Include="ast.cpp" />
<ClCompile Include="builtins.cpp" />
<ClCompile Include="cbackend.cpp" />
<ClCompile Include="ctx.cpp" />
<ClCompile Include="decl.cpp" />
<ClCompile Include="expr.cpp" />
<ClCompile Include="func.cpp" />
<ClCompile Include="gen-bitcode-avx.cpp" />
<ClCompile Include="gen-bitcode-avx-x2.cpp" />
<ClCompile Include="gen-bitcode-avx1.cpp" />
<ClCompile Include="gen-bitcode-avx1-x2.cpp" />
<ClCompile Include="gen-bitcode-avx2.cpp" />
<ClCompile Include="gen-bitcode-avx2-x2.cpp" />
<ClCompile Include="gen-bitcode-c-32.cpp" />
<ClCompile Include="gen-bitcode-c-64.cpp" />
<ClCompile Include="gen-bitcode-dispatch.cpp" />
<ClCompile Include="gen-bitcode-generic-4.cpp" />
<ClCompile Include="gen-bitcode-generic-8.cpp" />
<ClCompile Include="gen-bitcode-generic-16.cpp" />
<ClCompile Include="gen-bitcode-sse2.cpp" />
<ClCompile Include="gen-bitcode-sse2-x2.cpp" />
<ClCompile Include="gen-bitcode-sse4.cpp" />
<ClCompile Include="gen-bitcode-sse4-x2.cpp" />
<ClCompile Include="gen-stdlib.cpp" />
<ClCompile Include="gen-stdlib-generic.cpp" />
<ClCompile Include="gen-stdlib-x86.cpp" />
<ClCompile Include="ispc.cpp" />
<ClCompile Include="lex.cc">
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
@@ -40,15 +47,15 @@
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
<DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
</ClCompile>
<CustomBuild Include="builtins-c.c">
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp;
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c &gt; gen-bitcode-c-64.cpp</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp;
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c &gt; gen-bitcode-c-64.cpp</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
<CustomBuild Include="builtins\builtins.c">
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 &gt; gen-bitcode-c-32.cpp;
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 &gt; gen-bitcode-c-64.cpp</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building builtins.c</Message>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 &gt; gen-bitcode-c-32.cpp;
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 &gt; gen-bitcode-c-64.cpp</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building builtins.c</Message>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp</Outputs>
</CustomBuild>
<ClCompile Include="stmt.cpp" />
<ClCompile Include="sym.cpp" />
@@ -75,103 +82,172 @@
<ItemGroup>
<CustomBuild Include="stdlib.ispc">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 &gt; gen-stdlib-x86.cpp;
%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic &gt; gen-stdlib-generic.cpp;
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib-generic.cpp;gen-stdlib-x86.cpp</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 &gt; gen-stdlib-x86.cpp;
%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic &gt; gen-stdlib-generic.cpp;
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib-generic.cpp;gen-stdlib-x86.cpp</Outputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib-{generic,x86}.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib-{generic,x86}.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins-sse4.ll">
<CustomBuild Include="builtins\dispatch.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins-dispatch.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-dispatch.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-dispatch.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4</AdditionalInputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-dispatch.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-dispatch.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins-sse4-x2.ll">
<CustomBuild Include="builtins\target-sse4.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-sse4-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-sse4-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-sse4-x2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-sse4-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-sse4-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins-sse2.ll">
<CustomBuild Include="builtins\target-sse2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-sse2-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-sse2-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins-sse2-x2.ll">
<CustomBuild Include="builtins\target-sse2-x2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-sse2-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-sse2-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins-avx.ll">
<CustomBuild Include="builtins\target-avx1.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll &gt; gen-bitcode-avx1.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll &gt; gen-bitcode-avx1.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins-avx-x2.ll">
<CustomBuild Include="builtins\target-avx1-x2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll &gt; gen-bitcode-avx1-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll &gt; gen-bitcode-avx1-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll &gt; gen-bitcode-avx2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll &gt; gen-bitcode-avx2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx2-x2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll &gt; gen-bitcode-avx2-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll &gt; gen-bitcode-avx2-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-generic-4.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll &gt; gen-bitcode-generic-4.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-4.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll &gt; gen-bitcode-generic-4.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-4.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-4.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-4.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-generic-8.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll &gt; gen-bitcode-generic-8.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-8.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll &gt; gen-bitcode-generic-8.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-8.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-8.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-8.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-generic-16.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll &gt; gen-bitcode-generic-16.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-16.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll &gt; gen-bitcode-generic-16.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-16.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-16.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-16.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>

lex.ll

@@ -42,7 +42,7 @@
#include <stdlib.h>
#include <stdint.h>
static uint64_t lParseBinary(const char *ptr, SourcePos pos);
static uint64_t lParseBinary(const char *ptr, SourcePos pos, char **endPtr);
static void lCComment(SourcePos *);
static void lCppComment(SourcePos *);
static void lHandleCppHash(SourcePos *);
@@ -67,7 +67,7 @@ inline int isatty(int) { return 0; }
%option nounistd
WHITESPACE [ \t\r]+
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[kMG]?
FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)
@@ -151,30 +151,44 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
{INT_NUMBER}+(u|U|l|L)*? {
int ls = 0, us = 0;
char *endPtr = NULL;
if (yytext[0] == '0' && yytext[1] == 'b')
yylval->intVal = lParseBinary(yytext+2, *yylloc);
yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr);
else {
char *endPtr = NULL;
#ifdef ISPC_IS_WINDOWS
#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
yylval->intVal = _strtoi64(yytext, &endPtr, 0);
#else
// FIXME: should use strtouq and then issue an error if we can't
// fit into 64 bits...
yylval->intVal = strtoull(yytext, &endPtr, 0);
#endif
for (; *endPtr; endPtr++) {
if (*endPtr == 'l' || *endPtr == 'L')
ls++;
else if (*endPtr == 'u' || *endPtr == 'U')
us++;
}
if (ls >= 2)
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
else if (ls == 1)
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
}
bool kilo = false, mega = false, giga = false;
for (; *endPtr; endPtr++) {
if (*endPtr == 'k')
kilo = true;
else if (*endPtr == 'M')
mega = true;
else if (*endPtr == 'G')
giga = true;
else if (*endPtr == 'l' || *endPtr == 'L')
ls++;
else if (*endPtr == 'u' || *endPtr == 'U')
us++;
}
if (kilo)
yylval->intVal *= 1024;
if (mega)
yylval->intVal *= 1024*1024;
if (giga)
yylval->intVal *= 1024*1024*1024;
if (ls >= 2)
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
else if (ls == 1)
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
// See if we can fit this into a 32-bit integer...
if ((yylval->intVal & 0xffffffff) == yylval->intVal)
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
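Net effect of this hunk: integer literals can now carry a binary-scale suffix, so 16k lexes as 16384, 2M as 2097152, and 1G as 2^30. A standalone C++ sketch of the assumed scaling behavior (not the lexer code itself, which also special-cases 0b... constants):

    #include <cstdint>
    #include <cstdlib>

    // Parse a literal and apply any trailing k/M/G binary-scale suffix,
    // mirroring the lexer change above: "16k" -> 16384, "2M" -> 2097152.
    static uint64_t scaleSuffixedLiteral(const char *text) {
        char *end = NULL;
        uint64_t val = strtoull(text, &end, 0);
        for (; *end; ++end) {
            if (*end == 'k')      val *= 1024ull;
            else if (*end == 'M') val *= 1024ull * 1024;
            else if (*end == 'G') val *= 1024ull * 1024 * 1024;
        }
        return val;
    }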
@@ -268,14 +282,11 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
/** Return the integer version of a binary constant from a string.
*/
static uint64_t
lParseBinary(const char *ptr, SourcePos pos) {
lParseBinary(const char *ptr, SourcePos pos, char **endPtr) {
uint64_t val = 0;
bool warned = false;
while (*ptr != '\0') {
/* if this hits, the regexp for 0b... constants is broken */
Assert(*ptr == '0' || *ptr == '1');
while (*ptr == '0' || *ptr == '1') {
if ((val & (((int64_t)1)<<63)) && warned == false) {
// We're about to shift out a set bit
Warning(pos, "Can't represent binary constant with a 64-bit integer type");
@@ -285,6 +296,7 @@ lParseBinary(const char *ptr, SourcePos pos) {
val = (val << 1) | (*ptr == '0' ? 0 : 1);
++ptr;
}
*endPtr = (char *)ptr;
return val;
}

llvmutil.cpp

@@ -36,7 +36,9 @@
*/
#include "llvmutil.h"
#include "ispc.h"
#include "type.h"
#include <llvm/Instructions.h>
LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
@@ -105,11 +107,14 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0);
// Note that both the mask and bool vectors are vector of int32s
// (not i1s). LLVM ends up generating much better SSE code with
// this representation.
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth);
if (target.maskBitCount == 1)
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth);
else {
Assert(target.maskBitCount == 32);
LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth);
}
LLVMTypes::Int1VectorType =
llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth);
@@ -141,7 +146,11 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
std::vector<llvm::Constant *> maskOnes;
llvm::Constant *onMask = NULL;
onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1,
if (target.maskBitCount == 1)
onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1,
false /*unsigned*/); // 0x1
else
onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1,
true /*signed*/); // 0xffffffff
for (int i = 0; i < target.vectorWidth; ++i)
@@ -150,8 +159,12 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
std::vector<llvm::Constant *> maskZeros;
llvm::Constant *offMask = NULL;
offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0,
true /*signed*/);
if (target.maskBitCount == 1)
offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0,
true /*signed*/);
else
offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0,
true /*signed*/);
for (int i = 0; i < target.vectorWidth; ++i)
maskZeros.push_back(offMask);
@@ -454,3 +467,239 @@ LLVMBoolVector(const bool *bvec) {
}
return llvm::ConstantVector::get(vals);
}
/** Conservative test to see if two llvm::Values are equal. There are
(potentially many) cases where the two values actually are equal but
this will return false. However, if it does return true, the two
vectors definitely are equal.
@todo This seems to catch all of the cases we currently need it for in
practice, but it'd be nice to make it a little more robust/general. In
general, though, a little something called the halting problem means we
won't get all of them.
*/
static bool
lValuesAreEqual(llvm::Value *v0, llvm::Value *v1,
std::vector<llvm::PHINode *> &seenPhi0,
std::vector<llvm::PHINode *> &seenPhi1) {
// Thanks to the fact that LLVM hashes and returns the same pointer for
// constants (of all sorts, even constant expressions), this first test
// actually catches a lot of cases. LLVM's SSA form also helps a lot
// with this..
if (v0 == v1)
return true;
Assert(seenPhi0.size() == seenPhi1.size());
for (unsigned int i = 0; i < seenPhi0.size(); ++i)
if (v0 == seenPhi0[i] && v1 == seenPhi1[i])
return true;
llvm::BinaryOperator *bo0 = llvm::dyn_cast<llvm::BinaryOperator>(v0);
llvm::BinaryOperator *bo1 = llvm::dyn_cast<llvm::BinaryOperator>(v1);
if (bo0 != NULL && bo1 != NULL) {
if (bo0->getOpcode() != bo1->getOpcode())
return false;
return (lValuesAreEqual(bo0->getOperand(0), bo1->getOperand(0),
seenPhi0, seenPhi1) &&
lValuesAreEqual(bo0->getOperand(1), bo1->getOperand(1),
seenPhi0, seenPhi1));
}
llvm::PHINode *phi0 = llvm::dyn_cast<llvm::PHINode>(v0);
llvm::PHINode *phi1 = llvm::dyn_cast<llvm::PHINode>(v1);
if (phi0 != NULL && phi1 != NULL) {
if (phi0->getNumIncomingValues() != phi1->getNumIncomingValues())
return false;
seenPhi0.push_back(phi0);
seenPhi1.push_back(phi1);
unsigned int numIncoming = phi0->getNumIncomingValues();
// Check all of the incoming values: if all of them are all equal,
// then we're good.
bool anyFailure = false;
for (unsigned int i = 0; i < numIncoming; ++i) {
Assert(phi0->getIncomingBlock(i) == phi1->getIncomingBlock(i));
if (!lValuesAreEqual(phi0->getIncomingValue(i),
phi1->getIncomingValue(i), seenPhi0, seenPhi1)) {
anyFailure = true;
break;
}
}
seenPhi0.pop_back();
seenPhi1.pop_back();
return !anyFailure;
}
return false;
}
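To make the conservativeness concrete, here is a hypothetical case the test does catch: two adds built separately over the same SSA values compare equal, because constants and SSA values are pointer-unique in LLVM and the recursion only needs to match opcodes and operands. (x and y below stand for the llvm::Value pointers of the two adds; this is a usage sketch, not code from this change.)

    //   %x = add i32 %a, %b
    //   %y = add i32 %a, %b
    std::vector<llvm::PHINode *> seenPhi0, seenPhi1;
    bool same = lValuesAreEqual(x, y, seenPhi0, seenPhi1);
    // same == true: matching opcodes, operands pointer-equal.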
/** Given an llvm::Value known to be an integer, return its value as
an int64_t.
*/
static int64_t
lGetIntValue(llvm::Value *offset) {
llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
Assert(intOffset && (intOffset->getBitWidth() == 32 ||
intOffset->getBitWidth() == 64));
return intOffset->getSExtValue();
}
/** This function takes chains of InsertElement instructions along the
lines of:
%v0 = insertelement undef, value_0, i32 index_0
%v1 = insertelement %v0, value_1, i32 index_1
...
%vn = insertelement %vn-1, value_n-1, i32 index_n-1
and initializes the provided elements array such that the i'th
llvm::Value * in the array is the element that was inserted into the
i'th element of the vector.
*/
void
LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
llvm::Value **elements) {
for (int i = 0; i < vectorWidth; ++i)
elements[i] = NULL;
while (ie != NULL) {
int64_t iOffset = lGetIntValue(ie->getOperand(2));
Assert(iOffset >= 0 && iOffset < vectorWidth);
Assert(elements[iOffset] == NULL);
elements[iOffset] = ie->getOperand(1);
llvm::Value *insertBase = ie->getOperand(0);
ie = llvm::dyn_cast<llvm::InsertElementInst>(insertBase);
if (ie == NULL) {
if (llvm::isa<llvm::UndefValue>(insertBase))
return;
llvm::ConstantVector *cv =
llvm::dyn_cast<llvm::ConstantVector>(insertBase);
Assert(cv != NULL);
Assert(iOffset < (int)cv->getNumOperands());
elements[iOffset] = cv->getOperand(iOffset);
}
}
}
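Put differently: handed the last InsertElementInst of such a chain, the helper recovers the per-lane values. A minimal usage sketch (ie and width are assumed to come from the caller):

    llvm::Value *elements[ISPC_MAX_NVEC];
    LLVMFlattenInsertChain(ie, width, elements);
    // elements[i] now holds the value written to lane i, or NULL if
    // that lane was never written (i.e. it is undef in the result).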
/** Tests to see if all of the elements of the vector in the 'v' parameter
are equal. Like lValuesAreEqual(), this is a conservative test and may
return false for arrays where the values are actually all equal. */
bool
LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
std::vector<llvm::PHINode *> &seenPhis) {
if (llvm::isa<llvm::ConstantAggregateZero>(v))
return true;
llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v);
if (cv != NULL)
return (cv->getSplatValue() != NULL);
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v);
if (bop != NULL)
return (LLVMVectorValuesAllEqual(bop->getOperand(0), vectorLength,
seenPhis) &&
LLVMVectorValuesAllEqual(bop->getOperand(1), vectorLength,
seenPhis));
llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(v);
if (cast != NULL)
return LLVMVectorValuesAllEqual(cast->getOperand(0), vectorLength,
seenPhis);
llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(v);
if (ie != NULL) {
llvm::Value *elements[ISPC_MAX_NVEC];
LLVMFlattenInsertChain(ie, vectorLength, elements);
// We will ignore any values of elements[] that are NULL, as they
// correspond to undefined values--we just want to see if all of
// the defined values have the same value.
int lastNonNull = 0;
while (lastNonNull < vectorLength && elements[lastNonNull] == NULL)
++lastNonNull;
if (lastNonNull == vectorLength)
// all of them are undef!
return true;
for (int i = lastNonNull; i < vectorLength; ++i) {
if (elements[i] == NULL)
continue;
std::vector<llvm::PHINode *> seenPhi0;
std::vector<llvm::PHINode *> seenPhi1;
if (lValuesAreEqual(elements[lastNonNull], elements[i], seenPhi0,
seenPhi1) == false)
return false;
lastNonNull = i;
}
return true;
}
llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v);
if (phi) {
for (unsigned int i = 0; i < seenPhis.size(); ++i)
if (seenPhis[i] == phi)
return true;
seenPhis.push_back(phi);
unsigned int numIncoming = phi->getNumIncomingValues();
// Check all of the incoming values: if all of them are all equal,
// then we're good.
for (unsigned int i = 0; i < numIncoming; ++i) {
if (!LLVMVectorValuesAllEqual(phi->getIncomingValue(i), vectorLength,
seenPhis)) {
seenPhis.pop_back();
return false;
}
}
seenPhis.pop_back();
return true;
}
Assert(!llvm::isa<llvm::Constant>(v));
if (llvm::isa<llvm::CallInst>(v) || llvm::isa<llvm::LoadInst>(v) ||
!llvm::isa<llvm::Instruction>(v))
return false;
llvm::ShuffleVectorInst *shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(v);
if (shuffle != NULL) {
llvm::Value *indices = shuffle->getOperand(2);
if (LLVMVectorValuesAllEqual(indices, vectorLength, seenPhis))
// The easy case--just a smear of the same element across the
// whole vector.
return true;
// TODO: handle more general cases?
return false;
}
#if 0
fprintf(stderr, "all equal: ");
v->dump();
fprintf(stderr, "\n");
llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(v);
if (inst) {
inst->getParent()->dump();
fprintf(stderr, "\n");
fprintf(stderr, "\n");
}
#endif
return false;
}
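This predicate is presumably what the (suppressed) opt.cpp changes build on: when the per-lane offsets of a gather can be conservatively proven equal, the general gather can be replaced by one scalar load plus a broadcast. A hedged usage sketch:

    std::vector<llvm::PHINode *> seenPhis;
    if (LLVMVectorValuesAllEqual(offsets, g->target.vectorWidth, seenPhis)) {
        // All lanes provably read the same address: emit a scalar
        // load and smear it across the vector instead of a gather.
    }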

llvmutil.h

@@ -38,12 +38,23 @@
#ifndef ISPC_LLVMUTIL_H
#define ISPC_LLVMUTIL_H 1
#include "ispc.h"
#include <llvm/LLVMContext.h>
#include <llvm/Type.h>
#include <llvm/DerivedTypes.h>
#include <llvm/Constants.h>
namespace llvm {
class PHINode;
class InsertElementInst;
}
// llvm::Type *s are no longer const in llvm 3.0
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
#define LLVM_TYPE_CONST
#else
#define LLVM_TYPE_CONST const
#endif
/** This structure holds pointers to a variety of LLVM types; code
elsewhere can use them from here, rather than needing to make more
@@ -99,6 +110,7 @@ extern llvm::Constant *LLVMTrue, *LLVMFalse;
of LLVMTypes and the LLVMTrue/LLVMFalse constants. However, it can't
be called until the compilation target is known.
*/
struct Target;
extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);
/** Returns an LLVM i8 constant of the given value */
@@ -205,4 +217,13 @@ extern llvm::Constant *LLVMMaskAllOn;
/** LLVM constant value representing an 'all off' SIMD lane mask */
extern llvm::Constant *LLVMMaskAllOff;
/** Tests to see if all of the elements of the vector in the 'v' parameter
are equal. Like lValuesAreEqual(), this is a conservative test and may
return false for arrays where the values are actually all equal. */
extern bool LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
std::vector<llvm::PHINode *> &seenPhis);
void LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
llvm::Value **elements);
#endif // ISPC_LLVMUTIL_H

main.cpp

@@ -38,6 +38,7 @@
#include "ispc.h"
#include "module.h"
#include "util.h"
#include "type.h"
#include <stdio.h>
#include <stdlib.h>
#include <llvm/Support/PrettyStackTrace.h>
@@ -53,24 +54,47 @@
#ifdef ISPC_IS_WINDOWS
#define strcasecmp stricmp
#ifndef BUILD_DATE
#define BUILD_DATE __DATE__
#endif
#define BUILD_VERSION ""
#endif // ISPC_IS_WINDOWS
static void usage(int ret) {
printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n",
BUILD_DATE, BUILD_VERSION);
printf("usage: ispc\n");
static void
lPrintVersion() {
printf("Intel(r) SPMD Program Compiler (ispc), build %s (%s, LLVM %s)\n",
BUILD_DATE, BUILD_VERSION,
#ifdef LLVM_2_9
"2.9"
#elif defined(LLVM_3_0) || defined(LLVM_3_0svn)
"3.0"
#elif defined(LLVM_3_1) || defined(LLVM_3_1svn)
"3.1"
#else
#error "Unhandled LLVM version"
#endif
);
}
static void
usage(int ret) {
lPrintVersion();
printf("\nusage: ispc\n");
printf(" [--addressing={32,64}]\t\tSelect 32- or 64-bit addressing. (Note that 32-bit\n");
printf(" \t\taddressing calculations are done by default, even\n");
printf(" \t\ton 64-bit target architectures.)\n");
printf(" [--arch={%s}]\t\tSelect target architecture\n",
Target::SupportedTargetArchs());
printf(" [--c++-include-file=<name>]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n");
printf(" [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
printf(" <cpu>={%s}\n", Target::SupportedTargetCPUs());
printf(" [-D<foo>]\t\t\t\t#define given value when running preprocessor\n");
printf(" [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n");
#ifndef LLVM_2_9
printf(" [--emit-c++]\t\t\tEmit a C++ source file as output\n");
#endif // !LLVM_2_9
printf(" [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
printf(" [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
printf(" [-g]\t\t\t\tGenerate debugging information\n");
@@ -184,9 +208,12 @@ int main(int Argc, char *Argv[]) {
LLVMInitializeX86TargetMC();
#endif
AtomicType::Init();
char *file = NULL;
const char *headerFileName = NULL;
const char *outFileName = NULL;
const char *includeFileName = NULL;
// Initialize globals early so that we can set various option values
// as we're parsing below
@@ -236,13 +263,20 @@ int main(int Argc, char *Argv[]) {
}
else if (!strcmp(argv[i], "--emit-asm"))
ot = Module::Asm;
#ifndef LLVM_2_9
else if (!strcmp(argv[i], "--emit-c++"))
ot = Module::CXX;
#endif // !LLVM_2_9
else if (!strcmp(argv[i], "--emit-llvm"))
ot = Module::Bitcode;
else if (!strcmp(argv[i], "--emit-obj"))
ot = Module::Object;
else if (!strcmp(argv[i], "--target")) {
// FIXME: should remove this way of specifying the target...
if (++i == argc) usage(1);
if (++i == argc) {
fprintf(stderr, "No target specified after --target option.\n");
usage(1);
}
target = argv[i];
}
else if (!strncmp(argv[i], "--target=", 9))
@@ -257,8 +291,10 @@ int main(int Argc, char *Argv[]) {
g->mathLib = Globals::Math_SVML;
else if (!strcmp(lib, "system"))
g->mathLib = Globals::Math_System;
else
else {
fprintf(stderr, "Unknown --math-lib= option \"%s\".\n", lib);
usage(1);
}
}
else if (!strncmp(argv[i], "--opt=", 6)) {
const char *opt = argv[i] + 6;
@@ -291,8 +327,10 @@ int main(int Argc, char *Argv[]) {
g->opt.disableGatherScatterFlattening = true;
else if (!strcmp(opt, "disable-uniform-memory-optimizations"))
g->opt.disableUniformMemoryOptimizations = true;
else
else {
fprintf(stderr, "Unknown --opt= option \"%s\".\n", opt);
usage(1);
}
}
else if (!strcmp(argv[i], "--woff") || !strcmp(argv[i], "-woff")) {
g->disableWarnings = true;
@@ -305,18 +343,27 @@ int main(int Argc, char *Argv[]) {
else if (!strcmp(argv[i], "--wno-perf") || !strcmp(argv[i], "-wno-perf"))
g->emitPerfWarnings = false;
else if (!strcmp(argv[i], "-o")) {
if (++i == argc) usage(1);
if (++i == argc) {
fprintf(stderr, "No output file specified after -o option.\n");
usage(1);
}
outFileName = argv[i];
}
else if (!strcmp(argv[i], "--outfile="))
outFileName = argv[i] + strlen("--outfile=");
else if (!strcmp(argv[i], "-h")) {
if (++i == argc) usage(1);
if (++i == argc) {
fprintf(stderr, "No header file name specified after -h option.\n");
usage(1);
}
headerFileName = argv[i];
}
else if (!strcmp(argv[i], "--header-outfile=")) {
else if (!strncmp(argv[i], "--header-outfile=", 17)) {
headerFileName = argv[i] + strlen("--header-outfile=");
}
else if (!strncmp(argv[i], "--c++-include-file=", 19)) {
includeFileName = argv[i] + strlen("--c++-include-file=");
}
else if (!strcmp(argv[i], "-O0")) {
g->opt.level = 0;
optSet = true;
@@ -337,15 +384,19 @@ int main(int Argc, char *Argv[]) {
generatePIC = true;
#endif // !ISPC_IS_WINDOWS
else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n",
BUILD_DATE, BUILD_VERSION);
lPrintVersion();
return 0;
}
else if (argv[i][0] == '-')
else if (argv[i][0] == '-') {
fprintf(stderr, "Unknown option \"%s\".\n", argv[i]);
usage(1);
}
else {
if (file != NULL)
if (file != NULL) {
fprintf(stderr, "Multiple input files specified on command "
"line: \"%s\" and \"%s\".\n", file, argv[i]);
usage(1);
}
else
file = argv[i];
}
@@ -363,5 +414,6 @@ int main(int Argc, char *Argv[]) {
"be issued, but no output will be generated.");
return Module::CompileAndOutput(file, arch, cpu, target, generatePIC,
ot, outFileName, headerFileName);
ot, outFileName, headerFileName,
includeFileName);
}

module.cpp

@@ -76,7 +76,6 @@
#include <llvm/Target/TargetMachine.h>
#include <llvm/Target/TargetOptions.h>
#include <llvm/Target/TargetData.h>
#include <llvm/PassManager.h>
#include <llvm/Analysis/Verifier.h>
#include <llvm/Support/CFG.h>
#include <clang/Frontend/CompilerInstance.h>
@@ -263,7 +262,7 @@ Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
"global variable \"%s\".", sym->name.c_str());
}
else if (initExpr != NULL) {
initExpr = initExpr->TypeCheck();
initExpr = TypeCheck(initExpr);
if (initExpr != NULL) {
// We need to make sure the initializer expression is
// the same type as the global. (But not if it's an
@@ -273,7 +272,7 @@ Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
initExpr = TypeConvertExpr(initExpr, sym->type, "initializer");
if (initExpr != NULL) {
initExpr = initExpr->Optimize();
initExpr = Optimize(initExpr);
// Fingers crossed, now let's see if we've got a
// constant value..
llvmInitializer = initExpr->GetConstant(sym->type);
@@ -584,7 +583,8 @@ Module::AddFunctionDefinition(Symbol *sym, const std::vector<Symbol *> &args,
bool
Module::writeOutput(OutputType outputType, const char *outFileName) {
Module::writeOutput(OutputType outputType, const char *outFileName,
const char *includeFileName) {
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
if (diBuilder != NULL && outputType != Header)
diBuilder->finalize();
@@ -610,6 +610,14 @@ Module::writeOutput(OutputType outputType, const char *outFileName) {
if (strcasecmp(suffix, "o") && strcasecmp(suffix, "obj"))
fileType = "object";
break;
#ifndef LLVM_2_9
case CXX:
if (strcasecmp(suffix, "c") && strcasecmp(suffix, "cc") &&
strcasecmp(suffix, "c++") && strcasecmp(suffix, "cxx") &&
strcasecmp(suffix, "cpp"))
fileType = "c++";
break;
#endif // !LLVM_2_9
case Header:
if (strcasecmp(suffix, "h") && strcasecmp(suffix, "hh") &&
strcasecmp(suffix, "hpp"))
@@ -623,12 +631,18 @@ Module::writeOutput(OutputType outputType, const char *outFileName) {
if (outputType == Header)
return writeHeader(outFileName);
else {
if (outputType == Bitcode)
return writeBitcode(module, outFileName);
else
return writeObjectFileOrAssembly(outputType, outFileName);
else if (outputType == Bitcode)
return writeBitcode(module, outFileName);
#ifndef LLVM_2_9
else if (outputType == CXX) {
extern bool WriteCXXFile(llvm::Module *module, const char *fn,
int vectorWidth, const char *includeName);
return WriteCXXFile(module, outFileName, g->target.vectorWidth,
includeFileName);
}
#endif // !LLVM_2_9
else
return writeObjectFileOrAssembly(outputType, outFileName);
}
@@ -1158,22 +1172,14 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
opts.addMacroDef("PI=3.1415926535");
// Add #define for current compilation target
switch (g->target.isa) {
case Target::SSE2:
opts.addMacroDef("ISPC_TARGET_SSE2");
break;
case Target::SSE4:
opts.addMacroDef("ISPC_TARGET_SSE4");
break;
case Target::AVX:
opts.addMacroDef("ISPC_TARGET_AVX");
break;
case Target::AVX2:
opts.addMacroDef("ISPC_TARGET_AVX2");
break;
default:
FATAL("Unhandled target ISA in preprocessor symbol definition");
char targetMacro[128];
sprintf(targetMacro, "ISPC_TARGET_%s", g->target.GetISAString());
char *p = targetMacro;
while (*p) {
*p = toupper(*p);
++p;
}
opts.addMacroDef(targetMacro);
if (g->target.is32Bit)
opts.addMacroDef("ISPC_POINTER_SIZE=32");
@@ -1576,7 +1582,8 @@ lCreateDispatchModule(std::map<std::string, FunctionTargetVariants> &functions)
int
Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu,
const char *target, bool generatePIC, OutputType outputType,
const char *outFileName, const char *headerFileName) {
const char *outFileName, const char *headerFileName,
const char *includeFileName) {
if (target == NULL || strchr(target, ',') == NULL) {
// We're only compiling to a single target
if (!Target::GetTarget(arch, cpu, target, generatePIC, &g->target))
@@ -1585,7 +1592,7 @@ Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu,
m = new Module(srcFile);
if (m->CompileFile() == 0) {
if (outFileName != NULL)
if (!m->writeOutput(outputType, outFileName))
if (!m->writeOutput(outputType, outFileName, includeFileName))
return 1;
if (headerFileName != NULL)
if (!m->writeOutput(Module::Header, headerFileName))
@@ -1598,6 +1605,14 @@ Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu,
return errorCount > 0;
}
else {
#ifndef LLVM_2_9
if (outputType == CXX) {
Error(SourcePos(), "Illegal to specify more then one target when "
"compiling C++ output.");
return 1;
}
#endif // !LLVM_2_9
// The user supplied multiple targets
std::vector<std::string> targets = lExtractTargets(target);
Assert(targets.size() > 1);

module.h

@@ -80,6 +80,9 @@ public:
enum OutputType { Asm, /** Generate text assembly language output */
Bitcode, /** Generate LLVM IR bitcode output */
Object, /** Generate a native object file */
#ifndef LLVM_2_9
CXX, /** Generate a C++ file */
#endif // !LLVM_2_9
Header /** Generate a C/C++ header file with
declarations of 'export'ed functions, global
variables, and the types used by them. */
@@ -108,6 +111,10 @@ public:
inclusion from C/C++ code with declarations of
types and functions exported from the given ispc
source file.
@param includeFileName If non-NULL, gives the filename for the C++
backend to emit in an #include statement to
get definitions of the builtins for the generic
target.
@return Number of errors encountered when compiling
srcFile.
*/
@@ -115,7 +122,8 @@ public:
const char *cpu, const char *targets,
bool generatePIC, OutputType outputType,
const char *outFileName,
const char *headerFileName);
const char *headerFileName,
const char *includeFileName);
/** Total number of errors encountered during compilation. */
int errorCount;
@@ -138,7 +146,8 @@ private:
true on success, false if there has been an error. The given
filename may be NULL, indicating that output should go to standard
output. */
bool writeOutput(OutputType ot, const char *filename);
bool writeOutput(OutputType ot, const char *filename,
const char *includeFileName = NULL);
bool writeHeader(const char *filename);
bool writeObjectFileOrAssembly(OutputType outputType, const char *filename);
static bool writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine,

opt.cpp

File diff suppressed because it is too large.

parse.yy

@@ -224,7 +224,7 @@ struct ForeachDimension {
%type <declSpecs> declaration_specifiers
%type <stringVal> string_constant
%type <constCharPtr> struct_or_union_name enum_identifier
%type <constCharPtr> struct_or_union_name enum_identifier goto_identifier
%type <intVal> int_constant soa_width_specifier
%type <foreachDimension> foreach_dimension_specifier
@@ -362,13 +362,7 @@ cast_expression
: unary_expression
| '(' type_name ')' cast_expression
{
// Pass true here to try to preserve uniformity
// so that things like:
// uniform int y = ...;
// uniform float x = 1. / (float)y;
// don't issue an error due to (float)y being inadvertently
// and undesirably-to-the-user "varying"...
$$ = new TypeCastExpr($2, $4, true, Union(@1,@4));
$$ = new TypeCastExpr($2, $4, Union(@1,@4));
}
;
@@ -500,6 +494,7 @@ declaration_statement
$$ = NULL;
}
else {
$1->DeclareFunctions();
std::vector<VariableDeclaration> vars = $1->GetVariableDeclarations();
$$ = new DeclStmt(vars, @1);
}
@@ -638,13 +633,13 @@ type_specifier
atomic_var_type_specifier
: TOKEN_VOID { $$ = AtomicType::Void; }
| TOKEN_BOOL { $$ = AtomicType::VaryingBool; }
| TOKEN_INT8 { $$ = AtomicType::VaryingInt8; }
| TOKEN_INT16 { $$ = AtomicType::VaryingInt16; }
| TOKEN_INT { $$ = AtomicType::VaryingInt32; }
| TOKEN_FLOAT { $$ = AtomicType::VaryingFloat; }
| TOKEN_DOUBLE { $$ = AtomicType::VaryingDouble; }
| TOKEN_INT64 { $$ = AtomicType::VaryingInt64; }
| TOKEN_BOOL { $$ = AtomicType::UnboundBool; }
| TOKEN_INT8 { $$ = AtomicType::UnboundInt8; }
| TOKEN_INT16 { $$ = AtomicType::UnboundInt16; }
| TOKEN_INT { $$ = AtomicType::UnboundInt32; }
| TOKEN_FLOAT { $$ = AtomicType::UnboundFloat; }
| TOKEN_DOUBLE { $$ = AtomicType::UnboundDouble; }
| TOKEN_INT64 { $$ = AtomicType::UnboundInt64; }
;
short_vec_specifier
@@ -670,7 +665,7 @@ struct_or_union_specifier
GetStructTypesNamesPositions(*$4, &elementTypes, &elementNames,
&elementPositions);
StructType *st = new StructType($2, elementTypes, elementNames,
elementPositions, false, true, @2);
elementPositions, false, Type::Unbound, @2);
m->symbolTable->AddType($2, st, @2);
$$ = st;
}
@@ -681,8 +676,9 @@ struct_or_union_specifier
std::vector<SourcePos> elementPositions;
GetStructTypesNamesPositions(*$3, &elementTypes, &elementNames,
&elementPositions);
// FIXME: should be unbound
$$ = new StructType("", elementTypes, elementNames, elementPositions,
false, true, @1);
false, Type::Unbound, @1);
}
| struct_or_union '{' '}'
{
@@ -748,7 +744,7 @@ specifier_qualifier_list
else if ($1 == TYPEQUAL_SIGNED) {
if ($2->IsIntType() == false) {
Error(@1, "Can't apply \"signed\" qualifier to \"%s\" type.",
$2->GetString().c_str());
$2->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
$$ = $2;
}
}
@@ -758,7 +754,7 @@ specifier_qualifier_list
$$ = t;
else {
Error(@1, "Can't apply \"unsigned\" qualifier to \"%s\" type. Ignoring.",
$2->GetString().c_str());
$2->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
$$ = $2;
}
}
@@ -775,8 +771,11 @@ specifier_qualifier_list
else
FATAL("Unhandled type qualifier in parser.");
}
else
else {
if (m->errorCount == 0)
Error(@1, "Lost type qualifier in parser.");
$$ = NULL;
}
}
;
@@ -1112,8 +1111,7 @@ type_name
abstract_declarator
: pointer
{
Declarator *d = new Declarator(DK_POINTER, @1);
$$ = d;
$$ = $1;
}
| direct_abstract_declarator
| pointer direct_abstract_declarator
@@ -1262,10 +1260,22 @@ statement
;
labeled_statement
: TOKEN_CASE constant_expression ':' statement
{ UNIMPLEMENTED; }
: goto_identifier ':' statement
{
$$ = new LabeledStmt($1, $3, @1);
}
| TOKEN_CASE constant_expression ':' statement
{
int value;
if ($2 != NULL &&
lGetConstantInt($2, &value, @2, "Case statement value")) {
$$ = new CaseStmt(value, $4, Union(@1, @2));
}
else
$$ = NULL;
}
| TOKEN_DEFAULT ':' statement
{ UNIMPLEMENTED; }
{ $$ = new DefaultStmt($3, @1); }
;
start_scope
@@ -1311,7 +1321,7 @@ selection_statement
| TOKEN_CIF '(' expression ')' statement TOKEN_ELSE statement
{ $$ = new IfStmt($3, $5, $7, true, @1); }
| TOKEN_SWITCH '(' expression ')' statement
{ UNIMPLEMENTED; }
{ $$ = new SwitchStmt($3, $5, @1); }
;
for_test
@@ -1433,9 +1443,13 @@ iteration_statement
}
;
goto_identifier
: TOKEN_IDENTIFIER { $$ = yylval.stringVal->c_str(); }
;
jump_statement
: TOKEN_GOTO TOKEN_IDENTIFIER ';'
{ UNIMPLEMENTED; }
: TOKEN_GOTO goto_identifier ';'
{ $$ = new GotoStmt($2, @1, @2); }
| TOKEN_CONTINUE ';'
{ $$ = new ContinueStmt(false, @1); }
| TOKEN_BREAK ';'
@@ -1551,19 +1565,21 @@ lAddDeclaration(DeclSpecs *ds, Declarator *decl) {
const Type *t = decl->GetType(ds);
if (t == NULL)
return;
Symbol *sym = decl->GetSymbol();
Assert(sym != NULL);
const FunctionType *ft = dynamic_cast<const FunctionType *>(t);
if (ft != NULL) {
Symbol *funSym = decl->GetSymbol();
Assert(funSym != NULL);
funSym->type = ft;
funSym->storageClass = ds->storageClass;
sym->type = ft;
sym->storageClass = ds->storageClass;
bool isInline = (ds->typeQualifiers & TYPEQUAL_INLINE);
m->AddFunctionDeclaration(funSym, isInline);
m->AddFunctionDeclaration(sym, isInline);
}
else {
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
bool isConst = (ds->typeQualifiers & TYPEQUAL_CONST) != 0;
m->AddGlobalVariable(sym, decl->initExpr, isConst);
}
else
m->AddGlobalVariable(decl->GetSymbol(), decl->initExpr,
(ds->typeQualifiers & TYPEQUAL_CONST) != 0);
}
}
@@ -1589,6 +1605,7 @@ lAddFunctionParams(Declarator *decl) {
continue;
Assert(pdecl->declarators.size() == 1);
Symbol *sym = pdecl->declarators[0]->GetSymbol();
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
#ifndef NDEBUG
bool ok = m->symbolTable->AddVariable(sym);
if (ok == false)
@@ -1605,7 +1622,8 @@ lAddFunctionParams(Declarator *decl) {
/** Add a symbol for the built-in mask variable to the symbol table */
static void lAddMaskToSymbolTable(SourcePos pos) {
const Type *t = AtomicType::VaryingConstUInt32;
const Type *t = g->target.isa == Target::GENERIC ?
AtomicType::VaryingConstBool : AtomicType::VaryingConstUInt32;
Symbol *maskSymbol = new Symbol("__mask", pos, t);
m->symbolTable->AddVariable(maskSymbol);
}
@@ -1674,10 +1692,10 @@ static bool
lGetConstantInt(Expr *expr, int *value, SourcePos pos, const char *usage) {
if (expr == NULL)
return false;
expr = expr->TypeCheck();
expr = TypeCheck(expr);
if (expr == NULL)
return false;
expr = expr->Optimize();
expr = Optimize(expr);
if (expr == NULL)
return false;
@@ -1753,8 +1771,8 @@ lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
the actual enum type here and optimize it, which will have
us end up with a ConstExpr with the desired EnumType... */
Expr *castExpr = new TypeCastExpr(enumType, enums[i]->constValue,
false, enums[i]->pos);
castExpr = castExpr->Optimize();
enums[i]->pos);
castExpr = Optimize(castExpr);
enums[i]->constValue = dynamic_cast<ConstExpr *>(castExpr);
Assert(enums[i]->constValue != NULL);
}

run_tests.py

@@ -12,32 +12,80 @@ import re
import signal
import random
import string
import mutex
import subprocess
import shlex
import platform
import tempfile
# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard
# git history has a workaround for that issue.
is_windows = (platform.system() == 'Windows' or
'CYGWIN_NT' in platform.system())
parser = OptionParser()
parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests",
default=False, action="store_true")
parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics",
default=None)
parser.add_option('-t', '--target', dest='target',
help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2)',
help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16)',
default="sse4")
parser.add_option('-a', '--arch', dest='arch',
help='Set architecture (x86, x86-64)',
default="x86-64")
parser.add_option("-c", "--compiler", dest="compiler_exe", help="Compiler binary to use to run tests",
default=None)
parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization',
default=False, action="store_true")
parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output',
default=False, action="store_true")
if not is_windows:
parser.add_option('--valgrind', dest='valgrind', help='Run tests with valgrind',
default=False, action="store_true")
(options, args) = parser.parse_args()
if not is_windows and options.valgrind:
valgrind_exe = "valgrind "
else:
valgrind_exe = ""
if not is_windows:
ispc_exe = "./ispc"
else:
ispc_exe = "Release/ispc.exe"
is_generic_target = options.target.find("generic-") != -1
if is_generic_target and options.include_file == None:
if options.target == "generic-4":
sys.stderr.write("No generics #include specified; using examples/intrinsics/sse4.h\n")
options.include_file = "examples/intrinsics/sse4.h"
elif options.target == "generic-8":
sys.stderr.write("No generics #include specified and no default available for \"generic-8\" target.\n")
sys.exit(1)
elif options.target == "generic-16":
sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n")
options.include_file = "examples/intrinsics/generic-16.h"
if options.compiler_exe == None:
if is_windows:
options.compiler_exe = "cl"
else:
options.compiler_exe = "g++"
# if no specific test files are specified, run all of the tests in tests/
# and failing_tests/
if len(args) == 0:
files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc") + \
glob.glob("tests_errors/*ispc")
else:
files = args
files = [ ]
for f in args:
if os.path.splitext(string.lower(f))[1] != ".ispc":
print "Ignoring file %s, which doesn't have an .ispc extension." % f
else:
files += [ f ]
# randomly shuffle the tests if asked to do so
if (options.random):
@@ -47,19 +95,11 @@ if (options.random):
# counter
total_tests = 0
# We'd like to use the Lock class from the multiprocessing package to
# serialize accesses to finished_tests_counter. Unfortunately, the version of
# python that ships with OSX 10.5 has this bug:
# http://bugs.python.org/issue5261. Therefore, we use the (deprecated but
# still available) mutex class.
#finished_tests_counter_lock = multiprocessing.Lock()
if not (platform.system() == 'Windows' or
'CYGWIN_NT' in platform.system()):
finished_tests_mutex = mutex.mutex()
finished_tests_counter = multiprocessing.Value(c_int)
finished_tests_counter = multiprocessing.Value(c_int)
finished_tests_counter_lock = multiprocessing.Lock()
# utility routine to print an update on the number of tests that have been
# finished. Should be called with the mutex (or lock) held..
# finished. Should be called with the lock held..
def update_progress(fn):
finished_tests_counter.value = finished_tests_counter.value + 1
progress_str = " Done %d / %d [%s]" % (finished_tests_counter.value, total_tests, fn)
@@ -69,67 +109,80 @@ def update_progress(fn):
progress_str += '\r'
sys.stdout.write(progress_str)
sys.stdout.flush()
finished_tests_mutex.unlock()
fnull = open(os.devnull, 'w')
def run_command(cmd):
if options.verbose:
sys.stdout.write("Running: %s\n" % cmd)
sp = subprocess.Popen(shlex.split(cmd), stdin=None,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
out = sp.communicate()
output = ""
output += out[0].decode("utf-8")
output += out[1].decode("utf-8")
return (sp.returncode, output)
# run the commands in cmd_list
def run_cmds(cmd_list, filename, expect_failure):
output = ""
for cmd in cmd_list:
sp = subprocess.Popen(shlex.split(cmd), stdin=None,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
out = sp.communicate()
output += out[0]
output += out[1]
failed = (sp.returncode != 0)
if failed:
break
def run_cmds(compile_cmds, run_cmd, filename, expect_failure):
for cmd in compile_cmds:
(return_code, output) = run_command(cmd)
compile_failed = (return_code != 0)
if compile_failed:
sys.stdout.write("Compilation of test %s failed \n" % filename)
if output != "":
sys.stdout.write("%s" % output)
return (1, 0)
surprise = ((expect_failure and not failed) or
(not expect_failure and failed))
(return_code, output) = run_command(run_cmd)
run_failed = (return_code != 0)
surprise = ((expect_failure and not run_failed) or
(not expect_failure and run_failed))
if surprise == True:
print "Test %s %s (return code %d) " % \
sys.stderr.write("Test %s %s (return code %d) \n" % \
(filename, "unexpectedly passed" if expect_failure else "failed",
sp.returncode)
return_code))
if output != "":
print "%s" % output
return surprise
sys.stdout.write("%s\n" % output)
if surprise == True:
return (0, 1)
else:
return (0, 0)
def run_test(filename):
global is_windows
if is_windows:
input_prefix = "../"
else:
input_prefix = ""
# is this a test to make sure an error is issued?
error_count = 0
want_error = (filename.find("tests_errors") != -1)
if want_error == True:
ispc_cmd = "ispc --werror --nowrap %s --arch=%s --target=%s" % \
(filename, options.arch, options.target)
sp = subprocess.Popen(shlex.split(ispc_cmd), stdin=None,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
out = sp.communicate()
output = ""
output += out[0]
output += out[1]
got_error = (sp.returncode != 0)
ispc_cmd = ispc_exe + " --werror --nowrap %s --arch=%s --target=%s" % \
(input_prefix + filename, options.arch, options.target)
(return_code, output) = run_command(ispc_cmd)
got_error = (return_code != 0)
# figure out the error message we're expecting
file = open(filename, 'r')
file = open(input_prefix + filename, 'r')
firstline = file.readline()
firstline = string.replace(firstline, "//", "")
firstline = string.lstrip(firstline)
firstline = string.rstrip(firstline)
firstline = firstline.replace("//", "")
firstline = firstline.lstrip()
firstline = firstline.rstrip()
file.close()
if (output.find(firstline) == -1):
print "OUT %s" % filename
print "Didnt see expected error message %s from test %s.\nActual output:\n%s" % \
(firstline, filename, output)
error_count += 1
sys.stderr.write("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \
(firstline, filename, output))
return (1, 0)
elif got_error == False:
print "Unexpectedly no errors issued from test %s" % filename
error_count += 1
sys.stderr.write("Unexpectedly no errors issued from test %s\n" % filename)
return (1, 0)
else:
return (0, 0)
else:
# do we expect this test to fail?
should_fail = (filename.find("failing_") != -1)
@@ -138,7 +191,7 @@ def run_test(filename):
# function that this test has.
sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3,
"f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }
file = open(filename, 'r')
file = open(input_prefix + filename, 'r')
match = -1
for line in file:
# look for lines with 'export'...
@@ -146,74 +199,107 @@ def run_test(filename):
continue
# one of them should have a function with one of the
# declarations in sig2def
for pattern, ident in sig2def.items():
for pattern, ident in list(sig2def.items()):
if line.find(pattern) != -1:
match = ident
break
file.close()
if match == -1:
print "Fatal error: unable to find function signature " + \
"in test %s" % filename
error_count += 1
sys.stderr.write("Fatal error: unable to find function signature " + \
"in test %s\n" % filename)
return (1, 0)
else:
if (platform.system() == 'Windows' or
'CYGWIN_NT' in platform.system()):
obj_name = "%s.obj" % filename
exe_name = "%s.exe" % filename
cc_cmd = "cl /nologo test_static.cpp /DTEST_SIG=%d %s.obj /Fe%s" % \
(match, filename, exe_name)
is_generic_target = options.target.find("generic-") != -1
if is_generic_target:
obj_name = "%s.cpp" % filename
if is_windows:
if not is_generic_target:
obj_name = "%s%s.obj" % (input_prefix, filename)
exe_name = "%s%s.exe" % (input_prefix, filename)
cc_cmd = "%s /I. /Iwinstuff /Zi /nologo /DTEST_SIG=%d %stest_static.cpp %s /Fe%s" % \
(options.compiler_exe, match, input_prefix, obj_name, exe_name)
if should_fail:
cc_cmd += " /DEXPECT_FAILURE"
else:
obj_name = "%s.o" % filename
if not is_generic_target:
obj_name = "%s.o" % filename
exe_name = "%s.run" % filename
if options.arch == 'x86':
gcc_arch = '-m32'
else:
gcc_arch = '-m64'
cc_cmd = "g++ %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
(gcc_arch, match, filename, exe_name)
cc_cmd = "%s -O2 -msse4.2 -I. %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \
(options.compiler_exe, gcc_arch, match, obj_name, exe_name)
if platform.system() == 'Darwin':
cc_cmd += ' -Wl,-no_pie'
if should_fail:
cc_cmd += " -DEXPECT_FAILURE"
ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
(filename, obj_name, options.arch, options.target)
ispc_cmd = ispc_exe + " --woff %s -o %s --arch=%s --target=%s" % \
(input_prefix+filename, obj_name, options.arch, options.target)
if options.no_opt:
ispc_cmd += " -O0"
if is_generic_target:
ispc_cmd += " --emit-c++ --c++-include-file=%s" % options.include_file
# compile the ispc code, make the executable, and run it...
error_count += run_cmds([ispc_cmd, cc_cmd, exe_name], \
filename, should_fail)
global valgrind_exe
(compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd],
valgrind_exe + " " + exe_name, \
filename, should_fail)
# clean up after running the test
try:
os.unlink(exe_name)
if not run_error:
os.unlink(exe_name)
if is_windows:
os.unlink(filename + ".pdb")
os.unlink(filename + ".ilk")
os.unlink(obj_name)
except:
None
return error_count
return (compile_error, run_error)
# pull tests to run from the given queue and run them. Multiple copies of
# this function will be running in parallel across all of the CPU cores of
# the system.
def run_tasks_from_queue(queue):
error_count = 0
def run_tasks_from_queue(queue, queue_ret):
if is_windows:
tmpdir = "tmp%d" % os.getpid()
os.mkdir(tmpdir)
os.chdir(tmpdir)
else:
olddir = ""
compile_error_files = [ ]
run_error_files = [ ]
while True:
filename = queue.get()
if (filename == 'STOP'):
sys.exit(error_count)
queue_ret.put((compile_error_files, run_error_files))
if is_windows:
try:
os.remove("test_static.obj")
os.remove("/vc100.pdb")
os.chdir("..")
os.rmdir(tmpdir)
except:
None
sys.exit(0)
error_count += run_test(filename)
# If not for http://bugs.python.org/issue5261 on OSX, we'd like to do this:
#with finished_tests_counter_lock:
#update_progress(filename)
# but instead we do this...
finished_tests_mutex.lock(update_progress, filename)
(compile_error, run_error) = run_test(filename)
if compile_error != 0:
compile_error_files += [ filename ]
if run_error != 0:
run_error_files += [ filename ]
with finished_tests_counter_lock:
update_progress(filename)
task_threads = []
@@ -224,57 +310,51 @@ def sigint(signum, frame):
if __name__ == '__main__':
total_tests = len(files)
error_count = 0
if (platform.system() == 'Windows' or
'CYGWIN_NT' in platform.system()):
# cl.exe gets itself all confused if we have multiple instances of
# it running concurrently and operating on the same .cpp file
# (test_static.cpp), even if we are generating a differently-named
# exe in the end. So run serially. :-(
nthreads = 1
num_done = 0
print "Running %d tests." % (total_tests)
for fn in files:
error_count += run_test(fn)
compile_error_files = [ ]
run_error_files = [ ]
num_done += 1
progress_str = " Done %d / %d [%s]" % (num_done, total_tests, fn)
# spaces to clear out detritus from previous printing...
for x in range(30):
progress_str += ' '
progress_str += '\r'
sys.stdout.write(progress_str)
sys.stdout.flush()
else:
nthreads = multiprocessing.cpu_count()
print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)
nthreads = multiprocessing.cpu_count()
print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)
# put each of the test filenames into a queue
q = multiprocessing.Queue()
for fn in files:
q.put(fn)
for x in range(nthreads):
q.put('STOP')
# put each of the test filenames into a queue
q = multiprocessing.Queue()
for fn in files:
q.put(fn)
for x in range(nthreads):
q.put('STOP')
qret = multiprocessing.Queue()
# need to catch sigint so that we can terminate all of the tasks if
# we're interrupted
signal.signal(signal.SIGINT, sigint)
# need to catch sigint so that we can terminate all of the tasks if
# we're interrupted
signal.signal(signal.SIGINT, sigint)
# launch jobs to run tests
for x in range(nthreads):
t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,))
task_threads.append(t)
t.start()
# launch jobs to run tests
for x in range(nthreads):
t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,qret))
task_threads.append(t)
t.start()
# wait for them to all finish and then return the number that failed
# (i.e. return 0 if all is ok)
error_count = 0
for t in task_threads:
t.join()
error_count += t.exitcode
print
# wait for them to all finish and then return the number that failed
# (i.e. return 0 if all is ok)
for t in task_threads:
t.join()
print
if error_count > 0:
print "%d / %d tests FAILED!" % (error_count, total_tests)
sys.exit(error_count)
while not qret.empty():
(c, r) = qret.get()
compile_error_files += c
run_error_files += r
if len(compile_error_files) > 0:
compile_error_files.sort()
sys.stdout.write("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests))
for f in compile_error_files:
sys.stdout.write("\t%s\n" % f)
if len(run_error_files) > 0:
run_error_files.sort()
sys.stdout.write("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests))
for f in run_error_files:
sys.stdout.write("\t%s\n" % f)
sys.exit(len(compile_error_files) + len(run_error_files))

stdlib.ispc

@@ -38,6 +38,14 @@
ispc code
*/
#ifdef ISPC_TARGET_GENERIC
#define IntMaskType bool
#define UIntMaskType bool
#else
#define IntMaskType int32
#define UIntMaskType unsigned int32
#endif
///////////////////////////////////////////////////////////////////////////
// Low level primitives
@@ -86,15 +94,15 @@ static inline float broadcast(float v, uniform int i) {
}
static inline int8 broadcast(int8 v, uniform int i) {
return __broadcast_int8(v, i);
return __broadcast_i8(v, i);
}
static inline int16 broadcast(int16 v, uniform int i) {
return __broadcast_int16(v, i);
return __broadcast_i16(v, i);
}
static inline int32 broadcast(int32 v, uniform int i) {
return __broadcast_int32(v, i);
return __broadcast_i32(v, i);
}
static inline double broadcast(double v, uniform int i) {
@@ -102,7 +110,7 @@ static inline double broadcast(double v, uniform int i) {
}
static inline int64 broadcast(int64 v, uniform int i) {
return __broadcast_int64(v, i);
return __broadcast_i64(v, i);
}
static inline float rotate(float v, uniform int i) {
@@ -110,15 +118,15 @@ static inline float rotate(float v, uniform int i) {
}
static inline int8 rotate(int8 v, uniform int i) {
return __rotate_int8(v, i);
return __rotate_i8(v, i);
}
static inline int16 rotate(int16 v, uniform int i) {
return __rotate_int16(v, i);
return __rotate_i16(v, i);
}
static inline int32 rotate(int32 v, uniform int i) {
return __rotate_int32(v, i);
return __rotate_i32(v, i);
}
static inline double rotate(double v, uniform int i) {
@@ -126,7 +134,7 @@ static inline double rotate(double v, uniform int i) {
}
static inline int64 rotate(int64 v, uniform int i) {
return __rotate_int64(v, i);
return __rotate_i64(v, i);
}
static inline float shuffle(float v, int i) {
@@ -134,15 +142,15 @@ static inline float shuffle(float v, int i) {
}
static inline int8 shuffle(int8 v, int i) {
return __shuffle_int8(v, i);
return __shuffle_i8(v, i);
}
static inline int16 shuffle(int16 v, int i) {
return __shuffle_int16(v, i);
return __shuffle_i16(v, i);
}
static inline int32 shuffle(int32 v, int i) {
-    return __shuffle_int32(v, i);
+    return __shuffle_i32(v, i);
}
static inline double shuffle(double v, int i) {
@@ -150,7 +158,7 @@ static inline double shuffle(double v, int i) {
}
static inline int64 shuffle(int64 v, int i) {
-    return __shuffle_int64(v, i);
+    return __shuffle_i64(v, i);
}
static inline float shuffle(float v0, float v1, int i) {
@@ -158,15 +166,15 @@ static inline float shuffle(float v0, float v1, int i) {
}
static inline int8 shuffle(int8 v0, int8 v1, int i) {
-    return __shuffle2_int8(v0, v1, i);
+    return __shuffle2_i8(v0, v1, i);
}
static inline int16 shuffle(int16 v0, int16 v1, int i) {
-    return __shuffle2_int16(v0, v1, i);
+    return __shuffle2_i16(v0, v1, i);
}
static inline int32 shuffle(int32 v0, int32 v1, int i) {
-    return __shuffle2_int32(v0, v1, i);
+    return __shuffle2_i32(v0, v1, i);
}
static inline double shuffle(double v0, double v1, int i) {
@@ -174,7 +182,7 @@ static inline double shuffle(double v0, double v1, int i) {
}
static inline int64 shuffle(int64 v0, int64 v1, int i) {
-    return __shuffle2_int64(v0, v1, i);
+    return __shuffle2_i64(v0, v1, i);
}
// x[i]
@@ -274,13 +282,21 @@ static inline int32 sign_extend(bool v) {
static inline uniform bool any(bool v) {
// We only care about whether "any" is true for the active program instances,
// so we have to mask v with the current program mask.
+#ifdef ISPC_TARGET_GENERIC
    return __movmsk(v & __mask) != 0;
+#else
+    return __movmsk(__sext_varying_bool(v) & __mask) != 0;
+#endif
}
static inline uniform bool all(bool v) {
// As with any(), we need to explicitly mask v with the current program mask
// so we're only looking at the current lanes
+#ifdef ISPC_TARGET_GENERIC
    bool match = ((v & __mask) == __mask);
+#else
+    int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask);
+#endif
return __movmsk(match) == (1 << programCount) - 1;
}
@@ -296,19 +312,23 @@ static inline int popcnt(int v) {
int r;
for (uniform int i = 0; i < programCount; ++i)
r = insert(r, i, popcnt(extract(v, i)));
-    return (r & __mask);
+    return __mask ? r : 0;
}
static inline int popcnt(int64 v) {
int r;
for (uniform int i = 0; i < programCount; ++i)
r = insert(r, i, popcnt(extract(v, i)));
-    return (r & __mask);
+    return __mask ? r : 0;
}
static inline uniform int popcnt(bool v) {
// As with any() and all(), only count across the active lanes
+#ifdef ISPC_TARGET_GENERIC
    return __popcnt_int32(__movmsk(v & __mask));
+#else
+    return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask));
+#endif
}
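For reference (not part of the diff), a hypothetical Python model of the mask plumbing that any(), all(), and popcnt(bool) rely on above: the execution mask and __movmsk() results are integers with one bit per program instance, so "any" is a nonzero test after ANDing with the mask, and "all" compares against the all-lanes-on value (1 << programCount) - 1.

programCount = 8

def movmsk(bools):
    # pack per-lane booleans into an integer bitmask, bit i = lane i
    return sum(1 << i for i, b in enumerate(bools) if b)

def any_lanes(v, mask):
    return (movmsk(v) & mask) != 0

def all_lanes(v, mask):
    # inactive lanes compare equal trivially, so only active lanes matter
    match = [(v[i] and bool(mask >> i & 1)) == bool(mask >> i & 1)
             for i in range(programCount)]
    return movmsk(match) == (1 << programCount) - 1

def popcnt_bool(v, mask):
    return bin(movmsk(v) & mask).count("1")

mask = 0b00001111                        # lanes 0..3 active
v = [True] * 4 + [False] * 4
assert any_lanes(v, mask) and all_lanes(v, mask) and popcnt_bool(v, mask) == 4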
static inline uniform int lanemask() {
@@ -450,23 +470,27 @@ soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[]) {
static inline void
aos_to_soa3(uniform int32 a[], int32 * uniform v0, int32 * uniform v1,
int32 * uniform v2) {
-    __aos_to_soa3_int32(a, v0, v1, v2);
+    aos_to_soa3((uniform float * uniform)a, (float * uniform)v0,
+                (float * uniform)v1, (float * uniform)v2);
}
static inline void
soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[]) {
-    __soa_to_aos3_int32(v0, v1, v2, a);
+    soa_to_aos3(floatbits(v0), floatbits(v1), floatbits(v2),
+                (uniform float * uniform)a);
}
static inline void
aos_to_soa4(uniform int32 a[], int32 * uniform v0, int32 * uniform v1,
int32 * uniform v2, int32 * uniform v3) {
-    __aos_to_soa4_int32(a, v0, v1, v2, v3);
+    aos_to_soa4((uniform float * uniform)a, (float * uniform)v0,
+                (float * uniform)v1, (float * uniform)v2, (float * uniform)v3);
}
static inline void
soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[]) {
-    __soa_to_aos4_int32(v0, v1, v2, v3, a);
+    soa_to_aos4(floatbits(v0), floatbits(v1), floatbits(v2), floatbits(v3),
+                (uniform float * uniform)a);
}
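The int32 overloads above now simply reinterpret their arguments as float (the lane shuffle is bit-pattern agnostic) and forward to the float versions. A hypothetical Python sketch of what the 3-wide conversion itself does:

def aos_to_soa3(a):
    # [x0,y0,z0, x1,y1,z1, ...] -> three per-lane vectors
    return a[0::3], a[1::3], a[2::3]

def soa_to_aos3(v0, v1, v2):
    out = []
    for x, y, z in zip(v0, v1, v2):
        out += [x, y, z]
    return out

a = list(range(12))                      # four xyz triples
v0, v1, v2 = aos_to_soa3(a)
assert v0 == [0, 3, 6, 9]
assert soa_to_aos3(v0, v1, v2) == a      # lossless round trip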
///////////////////////////////////////////////////////////////////////////
@@ -569,7 +593,7 @@ static inline uniform float reduce_max(float v) {
static inline uniform int reduce_add(int x) {
// Zero out the values for lanes that aren't running
-    return __reduce_add_int32(x & __mask);
+    return __reduce_add_int32(__mask ? x : 0);
}
static inline uniform int reduce_min(int v) {
@@ -589,7 +613,7 @@ static inline uniform int reduce_max(int v) {
static inline uniform unsigned int reduce_add(unsigned int x) {
// Set values for non-running lanes to zero so they don't affect the
// result.
-    return __reduce_add_uint32(x & __mask);
+    return __reduce_add_uint32(__mask ? x : 0);
}
static inline uniform unsigned int reduce_min(unsigned int v) {
@@ -627,7 +651,7 @@ static inline uniform double reduce_max(double v) {
static inline uniform int64 reduce_add(int64 x) {
// Zero out the values for lanes that aren't running
-    return __reduce_add_int64(x & (int64)(__mask));
+    return __reduce_add_int64(__mask ? x : 0);
}
static inline uniform int64 reduce_min(int64 v) {
@@ -647,7 +671,7 @@ static inline uniform int64 reduce_max(int64 v) {
static inline uniform unsigned int64 reduce_add(unsigned int64 x) {
// Set values for non-running lanes to zero so they don't affect the
// result.
-    return __reduce_add_int64(x & (int64)(__mask));
+    return __reduce_add_int64(__mask ? x : 0);
}
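The switch from "x & __mask" to "__mask ? x : 0" in the reductions above matters because, with the generic targets, the per-lane mask can be a plain bool rather than a sign-extended all-ones integer; a per-lane select is correct for both representations. A tiny hypothetical model:

def reduce_add(x, mask):
    # only active lanes contribute; a per-lane select, not a bitwise AND
    return sum(xi if mi else 0 for xi, mi in zip(x, mask))

assert reduce_add([1, 2, 3, 4], [True, False, True, False]) == 4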
static inline uniform unsigned int64 reduce_min(unsigned int64 v) {
@@ -672,19 +696,19 @@ static inline uniform bool reduce_equal(TYPE v, uniform TYPE * uniform value) {
return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask); \
}
-REDUCE_EQUAL(int32, int32, int32)
-REDUCE_EQUAL(unsigned int32, int32, unsigned int32)
-REDUCE_EQUAL(float, float, int32)
-REDUCE_EQUAL(int64, int64, int32)
-REDUCE_EQUAL(unsigned int64, int64, unsigned int32)
-REDUCE_EQUAL(double, double, int32)
+REDUCE_EQUAL(int32, int32, IntMaskType)
+REDUCE_EQUAL(unsigned int32, int32, UIntMaskType)
+REDUCE_EQUAL(float, float, IntMaskType)
+REDUCE_EQUAL(int64, int64, IntMaskType)
+REDUCE_EQUAL(unsigned int64, int64, UIntMaskType)
+REDUCE_EQUAL(double, double, IntMaskType)
static int32 exclusive_scan_add(int32 v) {
-    return __exclusive_scan_add_i32(v, (int32)__mask);
+    return __exclusive_scan_add_i32(v, (IntMaskType)__mask);
}
static unsigned int32 exclusive_scan_add(unsigned int32 v) {
-    return __exclusive_scan_add_i32(v, __mask);
+    return __exclusive_scan_add_i32((int32)v, (IntMaskType)__mask);
}
static float exclusive_scan_add(float v) {
@@ -692,11 +716,11 @@ static float exclusive_scan_add(float v) {
}
static int64 exclusive_scan_add(int64 v) {
-    return __exclusive_scan_add_i64(v, (int32)__mask);
+    return __exclusive_scan_add_i64(v, (IntMaskType)__mask);
}
static unsigned int64 exclusive_scan_add(unsigned int64 v) {
-    return __exclusive_scan_add_i64(v, __mask);
+    return __exclusive_scan_add_i64(v, (UIntMaskType)__mask);
}
static double exclusive_scan_add(double v) {
@@ -704,35 +728,35 @@ static double exclusive_scan_add(double v) {
}
static int32 exclusive_scan_and(int32 v) {
-    return __exclusive_scan_and_i32(v, (int32)__mask);
+    return __exclusive_scan_and_i32(v, (IntMaskType)__mask);
}
static unsigned int32 exclusive_scan_and(unsigned int32 v) {
-    return __exclusive_scan_and_i32(v, __mask);
+    return __exclusive_scan_and_i32(v, (UIntMaskType)__mask);
}
static int64 exclusive_scan_and(int64 v) {
-    return __exclusive_scan_and_i64(v, (int32)__mask);
+    return __exclusive_scan_and_i64(v, (IntMaskType)__mask);
}
static unsigned int64 exclusive_scan_and(unsigned int64 v) {
-    return __exclusive_scan_and_i64(v, __mask);
+    return __exclusive_scan_and_i64(v, (UIntMaskType)__mask);
}
static int32 exclusive_scan_or(int32 v) {
-    return __exclusive_scan_or_i32(v, (int32)__mask);
+    return __exclusive_scan_or_i32(v, (IntMaskType)__mask);
}
static unsigned int32 exclusive_scan_or(unsigned int32 v) {
-    return __exclusive_scan_or_i32(v, __mask);
+    return __exclusive_scan_or_i32(v, (UIntMaskType)__mask);
}
static int64 exclusive_scan_or(int64 v) {
-    return __exclusive_scan_or_i64(v, (int32)__mask);
+    return __exclusive_scan_or_i64(v, (IntMaskType)__mask);
}
static unsigned int64 exclusive_scan_or(unsigned int64 v) {
-    return __exclusive_scan_or_i64(v, __mask);
+    return __exclusive_scan_or_i64(v, (UIntMaskType)__mask);
}
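For reference, a hypothetical Python model of what a masked exclusive scan computes: each active lane receives the operator applied over the values of the preceding active lanes, while inactive lanes are skipped (returned here as 0).

def exclusive_scan_add(v, mask):
    out, running = [], 0
    for x, m in zip(v, mask):
        out.append(running if m else 0)
        if m:
            running += x
    return out

assert exclusive_scan_add([1, 2, 3, 4], [1, 1, 0, 1]) == [0, 1, 0, 3]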
///////////////////////////////////////////////////////////////////////////
@@ -741,23 +765,23 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) {
static inline uniform int
packed_load_active(uniform unsigned int * uniform a,
unsigned int * uniform vals) {
-    return __packed_load_active(a, vals, (unsigned int32)__mask);
+    return __packed_load_active(a, vals, (UIntMaskType)__mask);
}
static inline uniform int
packed_store_active(uniform unsigned int * uniform a,
unsigned int vals) {
-    return __packed_store_active(a, vals, (unsigned int32)__mask);
+    return __packed_store_active(a, vals, (UIntMaskType)__mask);
}
static inline uniform int
packed_load_active(uniform int * uniform a, int * uniform vals) {
-    return __packed_load_active(a, vals, (int32)__mask);
+    return __packed_load_active(a, vals, (IntMaskType)__mask);
}
static inline uniform int
packed_store_active(uniform int * uniform a, int vals) {
-    return __packed_store_active(a, vals, (int32)__mask);
+    return __packed_store_active(a, vals, (IntMaskType)__mask);
}
///////////////////////////////////////////////////////////////////////////
@@ -784,8 +808,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
uniform TA value) { \
memory_barrier(); \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
-                                                            (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
memory_barrier(); \
return ret; \
} \
@@ -800,22 +823,80 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
continue; \
uniform TA * uniform p = ptrArray[i]; \
uniform TA v = extract(value, i); \
-        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
-                                                              (MASKTYPE)__mask); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
ret = insert(ret, i, r); \
} \
memory_barrier(); \
return ret; \
} \
-#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
+#define DEFINE_ATOMIC_SWAP(TA,TB) \
+static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
+    memory_barrier(); \
+    uniform int i = 0; \
+    TA ret[programCount]; \
+    TA memVal; \
+    uniform int lastSwap; \
+    uniform int mask = lanemask(); \
+    /* First, have the first running program instance (if any) perform \
+       the swap with memory with its value of "value"; record the \
+       value returned. */ \
+    for (; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
+        lastSwap = i; \
+        break; \
+    } \
+    /* Now, for all of the remaining running program instances, set the \
+       return value of the last instance that did a swap with this \
+       instance's value of "value"; this gives the same effect as if the \
+       current instance had executed a hardware atomic swap right before \
+       the last one that did a swap. */ \
+    for (; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        ret[lastSwap] = extract(value, i); \
+        lastSwap = i; \
+    } \
+    /* And the last instance that wanted to swap gets the value we \
+       originally got back from memory... */ \
+    ret[lastSwap] = memVal; \
+    memory_barrier(); \
+    return ret[programIndex]; \
+} \
+static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
+                                            uniform TA value) { \
+    memory_barrier(); \
+    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
+    memory_barrier(); \
+    return ret; \
+} \
+static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
+    uniform TA * uniform ptrArray[programCount]; \
+    ptrArray[programIndex] = ptr; \
+    memory_barrier(); \
+    TA ret; \
+    uniform int mask = lanemask(); \
+    for (uniform int i = 0; i < programCount; ++i) { \
+        if ((mask & (1 << i)) == 0) \
+            continue; \
+        uniform TA * uniform p = ptrArray[i]; \
+        uniform TA v = extract(value, i); \
+        uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
+        ret = insert(ret, i, r); \
+    } \
+    memory_barrier(); \
+    return ret; \
+}
+
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
uniform TA oneval = reduce_##OPA(value); \
TA ret; \
if (lanemask() != 0) { \
memory_barrier(); \
-        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
-                                                     (MASKTYPE)__mask); \
+        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \
memory_barrier(); \
} \
return ret; \
@@ -823,8 +904,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
uniform TA value) { \
memory_barrier(); \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
-                                                            (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
memory_barrier(); \
return ret; \
} \
@@ -840,59 +920,60 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
continue; \
uniform TA * uniform p = ptrArray[i]; \
uniform TA v = extract(value, i); \
-        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
-                                                              (MASKTYPE)__mask); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
ret = insert(ret, i, r); \
} \
memory_barrier(); \
return ret; \
}
-DEFINE_ATOMIC_OP(int32,int32,add,add,int32)
-DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,int32)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,int32)
-DEFINE_ATOMIC_OP(int32,int32,and,and,int32)
-DEFINE_ATOMIC_OP(int32,int32,or,or,int32)
-DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32)
-DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32)
+DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
+DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
+DEFINE_ATOMIC_SWAP(int32,int32)
// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,unsigned int32)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,unsigned int32)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_SWAP(unsigned int32,int32)
-DEFINE_ATOMIC_OP(float,float,swap,swap,int32)
+DEFINE_ATOMIC_SWAP(float,float)
-DEFINE_ATOMIC_OP(int64,int64,add,add,int32)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int32)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,int32)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,int32)
-DEFINE_ATOMIC_OP(int64,int64,and,and,int32)
-DEFINE_ATOMIC_OP(int64,int64,or,or,int32)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor,int32)
-DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32)
+DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
+DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
+DEFINE_ATOMIC_SWAP(int64,int64)
// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,unsigned int32)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,unsigned int32)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_SWAP(unsigned int64,int64)
-DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
+DEFINE_ATOMIC_SWAP(double,double)
#undef DEFINE_ATOMIC_OP
#undef DEFINE_ATOMIC_MINMAX_OP
+#undef DEFINE_ATOMIC_SWAP
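The DEFINE_ATOMIC_SWAP trick above relies on swap being "associative" in the sense that one real atomic swap plus a chain of lane-to-lane value forwarding is indistinguishable from every active lane performing its own hardware swap in some order (descending lane order, as it works out). A hypothetical Python simulation that checks the equivalence:

def chained_swap(mem, values, active):
    lanes = [i for i in range(len(values)) if active[i]]
    ret = list(values)                     # inactive lanes keep their input
    if not lanes:
        return mem, ret
    mem_val, mem = mem, values[lanes[0]]   # the single real swap
    last = lanes[0]
    for i in lanes[1:]:                    # forward values between lanes
        ret[last] = values[i]
        last = i
    ret[last] = mem_val                    # last swapper sees old memory
    return mem, ret

def sequential_swaps(mem, values, active):
    ret = list(values)
    for i in reversed(range(len(values))): # descending lane order
        if active[i]:
            ret[i], mem = mem, values[i]
    return mem, ret

vals, act = [10, 20, 30, 40], [True, False, True, True]
assert chained_swap(1234, vals, act) == sequential_swaps(1234, vals, act)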
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
static inline TA atomic_compare_exchange_global( \
@@ -907,18 +988,17 @@ static inline uniform TA atomic_compare_exchange_global( \
uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
memory_barrier(); \
uniform TA ret = \
-        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
-                                                        (MASKTYPE)__mask); \
+        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
memory_barrier(); \
return ret; \
}
-ATOMIC_DECL_CMPXCHG(int32, int32, int32)
-ATOMIC_DECL_CMPXCHG(unsigned int32, int32, unsigned int32)
-ATOMIC_DECL_CMPXCHG(float, float, int32)
-ATOMIC_DECL_CMPXCHG(int64, int64, int32)
-ATOMIC_DECL_CMPXCHG(unsigned int64, int64, unsigned int32)
-ATOMIC_DECL_CMPXCHG(double, double, int32)
+ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
+ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
+ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
+ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
+ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
+ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)
#undef ATOMIC_DECL_CMPXCHG
@@ -3071,16 +3151,15 @@ static inline unsigned int random(RNGState * uniform state)
{
unsigned int b;
-    // FIXME: state->z1, etc..
-    b = (((*state).z1 << 6) ^ (*state).z1) >> 13;
-    (*state).z1 = (((*state).z1 & 4294967294U) << 18) ^ b;
-    b = (((*state).z2 << 2) ^ (*state).z2) >> 27;
-    (*state).z2 = (((*state).z2 & 4294967288U) << 2) ^ b;
-    b = (((*state).z3 << 13) ^ (*state).z3) >> 21;
-    (*state).z3 = (((*state).z3 & 4294967280U) << 7) ^ b;
-    b = (((*state).z4 << 3) ^ (*state).z4) >> 12;
-    (*state).z4 = (((*state).z4 & 4294967168U) << 13) ^ b;
-    return ((*state).z1 ^ (*state).z2 ^ (*state).z3 ^ (*state).z4);
+    b = ((state->z1 << 6) ^ state->z1) >> 13;
+    state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
+    b = ((state->z2 << 2) ^ state->z2) >> 27;
+    state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
+    b = ((state->z3 << 13) ^ state->z3) >> 21;
+    state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
+    b = ((state->z4 << 3) ^ state->z4) >> 12;
+    state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
+    return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
}
static inline float frandom(RNGState * uniform state)
@@ -3096,30 +3175,30 @@ static inline uniform unsigned int __seed4(RNGState * uniform state,
uniform unsigned int c1 = 0xf0f0f0f0;
uniform unsigned int c2 = 0x0f0f0f0f;
-    (*state).z1 = insert((*state).z1, start + 0, seed);
-    (*state).z1 = insert((*state).z1, start + 1, seed ^ c1);
-    (*state).z1 = insert((*state).z1, start + 2, (seed << 3) ^ c1);
-    (*state).z1 = insert((*state).z1, start + 3, (seed << 2) ^ c2);
+    state->z1 = insert(state->z1, start + 0, seed);
+    state->z1 = insert(state->z1, start + 1, seed ^ c1);
+    state->z1 = insert(state->z1, start + 2, (seed << 3) ^ c1);
+    state->z1 = insert(state->z1, start + 3, (seed << 2) ^ c2);
seed += 131;
-    (*state).z2 = insert((*state).z2, start + 0, seed);
-    (*state).z2 = insert((*state).z2, start + 1, seed ^ c1);
-    (*state).z2 = insert((*state).z2, start + 2, (seed << 3) ^ c1);
-    (*state).z2 = insert((*state).z2, start + 3, (seed << 2) ^ c2);
+    state->z2 = insert(state->z2, start + 0, seed);
+    state->z2 = insert(state->z2, start + 1, seed ^ c1);
+    state->z2 = insert(state->z2, start + 2, (seed << 3) ^ c1);
+    state->z2 = insert(state->z2, start + 3, (seed << 2) ^ c2);
-    seed ^= extract((*state).z2, 2);
-    (*state).z3 = insert((*state).z3, start + 0, seed);
-    (*state).z3 = insert((*state).z3, start + 1, seed ^ c1);
-    (*state).z3 = insert((*state).z3, start + 2, (seed << 3) ^ c1);
-    (*state).z3 = insert((*state).z3, start + 3, (seed << 2) ^ c2);
+    seed ^= extract(state->z2, 2);
+    state->z3 = insert(state->z3, start + 0, seed);
+    state->z3 = insert(state->z3, start + 1, seed ^ c1);
+    state->z3 = insert(state->z3, start + 2, (seed << 3) ^ c1);
+    state->z3 = insert(state->z3, start + 3, (seed << 2) ^ c2);
seed <<= 4;
seed += 3;
-    seed ^= extract((*state).z1, 3);
-    (*state).z4 = insert((*state).z4, start + 0, seed);
-    (*state).z4 = insert((*state).z4, start + 1, seed ^ c1);
-    (*state).z4 = insert((*state).z4, start + 2, (seed << 3) ^ c1);
-    (*state).z4 = insert((*state).z4, start + 3, (seed << 2) ^ c2);
+    seed ^= extract(state->z1, 3);
+    state->z4 = insert(state->z4, start + 0, seed);
+    state->z4 = insert(state->z4, start + 1, seed ^ c1);
+    state->z4 = insert(state->z4, start + 2, (seed << 3) ^ c1);
+    state->z4 = insert(state->z4, start + 3, (seed << 2) ^ c2);
return seed;
}
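The generator above is a four-stream Tausworthe/LFSR construction; aside from the -> cleanup, the update itself is unchanged by this diff. A hypothetical scalar Python transcription, with & 0xffffffff standing in for 32-bit unsigned wraparound:

M = 0xffffffff

def taus_step(z1, z2, z3, z4):
    b  = ((((z1 << 6) & M) ^ z1) >> 13)
    z1 = (((z1 & 4294967294) << 18) & M) ^ b
    b  = ((((z2 << 2) & M) ^ z2) >> 27)
    z2 = (((z2 & 4294967288) << 2) & M) ^ b
    b  = ((((z3 << 13) & M) ^ z3) >> 21)
    z3 = (((z3 & 4294967280) << 7) & M) ^ b
    b  = ((((z4 << 3) & M) ^ z4) >> 12)
    z4 = (((z4 & 4294967168) << 13) & M) ^ b
    return (z1, z2, z3, z4), z1 ^ z2 ^ z3 ^ z4

state = (12345, 67890, 13579, 24680)     # arbitrary demo seeds
state, r = taus_step(*state)
assert 0 <= r <= M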


@@ -2,11 +2,17 @@
import sys

-print "char stdlib_code[] = { "
+t=str(sys.argv[1])
-for line in sys.stdin:
-    for c in line:
-        print ord(c)
-        print ", "
+sys.stdout.write("char stdlib_" + t + "_code[] = {\n")
-print "0 };"
+width = 16
+data = sys.stdin.read()
+for i in range(0, len(data), 1):
+    sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))
+    if i%width == (width-1):
+        sys.stdout.write("\n")
+sys.stdout.write("0x00 };\n\n")

stmt.cpp

(diff too large to display)

stmt.h

@@ -60,8 +60,10 @@ public:
virtual void Print(int indent) const = 0;
// Redeclare these methods with Stmt * return values, rather than
-    // ASTNode *s, as in the original ASTNode declarations of them.
-    virtual Stmt *Optimize() = 0;
+    // ASTNode *s, as in the original ASTNode declarations of them.  We'll
+    // also provide a default implementation of Optimize(), since most
+    // Stmts don't have anything to do here.
+    virtual Stmt *Optimize();
virtual Stmt *TypeCheck() = 0;
};
@@ -74,7 +76,6 @@ public:
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
-    Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
@@ -117,7 +118,6 @@ public:
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
-    Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
@@ -158,7 +158,6 @@ public:
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
-    Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
@@ -179,7 +178,6 @@ public:
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
-    Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
@@ -206,7 +204,6 @@ public:
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
-    Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
@@ -228,7 +225,6 @@ public:
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
-    Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
@@ -253,7 +249,6 @@ public:
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
-    Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
@@ -275,7 +270,6 @@ public:
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
-    Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
@@ -288,6 +282,97 @@ public:
};
+/** Statement corresponding to a "case" label in the program.  In addition
+    to the value associated with the "case", this statement also stores the
+    statements following it. */
+class CaseStmt : public Stmt {
+public:
+    CaseStmt(int value, Stmt *stmt, SourcePos pos);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *TypeCheck();
+    int EstimateCost() const;
+
+    /** Integer value after the "case" statement */
+    const int value;
+    Stmt *stmts;
+};
+
+/** Statement for a "default" label (as would be found inside a "switch"
+    statement). */
+class DefaultStmt : public Stmt {
+public:
+    DefaultStmt(Stmt *stmt, SourcePos pos);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *TypeCheck();
+    int EstimateCost() const;
+
+    Stmt *stmts;
+};
+
+/** A "switch" statement in the program. */
+class SwitchStmt : public Stmt {
+public:
+    SwitchStmt(Expr *expr, Stmt *stmts, SourcePos pos);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *TypeCheck();
+    int EstimateCost() const;
+
+    /** Expression that is used to determine which label to jump to. */
+    Expr *expr;
+    /** Statement block after the "switch" expression. */
+    Stmt *stmts;
+};
+
+/** A "goto" in an ispc program. */
+class GotoStmt : public Stmt {
+public:
+    GotoStmt(const char *label, SourcePos gotoPos, SourcePos idPos);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+    int EstimateCost() const;
+
+    /** Name of the label to jump to when the goto is executed. */
+    std::string label;
+    SourcePos identifierPos;
+};
+
+/** Statement corresponding to a label (as would be used as a goto target)
+    in the program. */
+class LabeledStmt : public Stmt {
+public:
+    LabeledStmt(const char *label, Stmt *stmt, SourcePos p);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+    int EstimateCost() const;
+
+    /** Name of the label. */
+    std::string name;
+    /** Statements following the label. */
+    Stmt *stmt;
+};
/** @brief Representation of a list of statements in the program.
*/
class StmtList : public Stmt {
@@ -297,14 +382,11 @@ public:
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
-    Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
void Add(Stmt *s) { if (s) stmts.push_back(s); }
const std::vector<Stmt *> &GetStatements() { return stmts; }
private:
std::vector<Stmt *> stmts;
};
@@ -325,7 +407,6 @@ public:
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
-    Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
@@ -352,7 +433,6 @@ public:
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
-    Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
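The net effect of this header change is that Optimize() stops being pure virtual and most subclasses drop their boilerplate overrides. A hypothetical Python analogue of the pattern, not taken from the ispc sources:

class Stmt(object):
    def optimize(self):
        return self                      # default: most statements do nothing

class StmtList(Stmt):
    def __init__(self, stmts):
        self.stmts = stmts
    def optimize(self):                  # only real rewrites override
        self.stmts = [s.optimize() for s in self.stmts]
        return self

assert isinstance(StmtList([Stmt()]).optimize(), StmtList)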

sym.cpp

@@ -72,8 +72,7 @@ SymbolTable::SymbolTable() {
SymbolTable::~SymbolTable() {
// Otherwise we have mismatched push/pop scopes
-    Assert(variables.size() == 1 && functions.size() == 1 &&
-           types.size() == 1);
+    Assert(variables.size() == 1 && types.size() == 1);
PopScope();
}
@@ -81,7 +80,6 @@ SymbolTable::~SymbolTable() {
void
SymbolTable::PushScope() {
variables.push_back(new SymbolMapType);
-    functions.push_back(new FunctionMapType);
types.push_back(new TypeMapType);
}
@@ -92,10 +90,6 @@ SymbolTable::PopScope() {
delete variables.back();
variables.pop_back();
-    Assert(functions.size() > 1);
-    delete functions.back();
-    functions.pop_back();
Assert(types.size() > 1);
delete types.back();
types.pop_back();
@@ -160,7 +154,7 @@ SymbolTable::AddFunction(Symbol *symbol) {
// the symbol table
return false;
-    std::vector<Symbol *> &funOverloads = (*functions.back())[symbol->name];
+    std::vector<Symbol *> &funOverloads = functions[symbol->name];
funOverloads.push_back(symbol);
return true;
}
@@ -168,17 +162,14 @@ SymbolTable::AddFunction(Symbol *symbol) {
bool
SymbolTable::LookupFunction(const char *name, std::vector<Symbol *> *matches) {
-    for (int i = (int)functions.size() - 1; i >= 0; --i) {
-        FunctionMapType &fm = *(functions[i]);
-        FunctionMapType::iterator iter = fm.find(name);
-        if (iter != fm.end()) {
-            if (matches == NULL)
-                return true;
-            else {
-                const std::vector<Symbol *> &funcs = iter->second;
-                for (int j = 0; j < (int)funcs.size(); ++j)
-                    matches->push_back(funcs[j]);
-            }
+    FunctionMapType::iterator iter = functions.find(name);
+    if (iter != functions.end()) {
+        if (matches == NULL)
+            return true;
+        else {
+            const std::vector<Symbol *> &funcs = iter->second;
+            for (int j = 0; j < (int)funcs.size(); ++j)
+                matches->push_back(funcs[j]);
+        }
}
return matches ? (matches->size() > 0) : false;
@@ -187,15 +178,12 @@ SymbolTable::LookupFunction(const char *name, std::vector<Symbol *> *matches) {
Symbol *
SymbolTable::LookupFunction(const char *name, const FunctionType *type) {
-    for (int i = (int)functions.size() - 1; i >= 0; --i) {
-        FunctionMapType &fm = *(functions[i]);
-        FunctionMapType::iterator iter = fm.find(name);
-        if (iter != fm.end()) {
-            std::vector<Symbol *> funcs = iter->second;
-            for (int j = 0; j < (int)funcs.size(); ++j) {
-                if (Type::Equal(funcs[j]->type, type))
-                    return funcs[j];
-            }
+    FunctionMapType::iterator iter = functions.find(name);
+    if (iter != functions.end()) {
+        std::vector<Symbol *> funcs = iter->second;
+        for (int j = 0; j < (int)funcs.size(); ++j) {
+            if (Type::Equal(funcs[j]->type, type))
+                return funcs[j];
+        }
}
return NULL;
@@ -261,14 +249,11 @@ SymbolTable::ClosestVariableOrFunctionMatch(const char *str) const {
}
}
-    for (int i = 0; i < (int)functions.size(); ++i) {
-        const FunctionMapType &fm = *(functions[i]);
-        FunctionMapType::const_iterator iter;
-        for (iter = fm.begin(); iter != fm.end(); ++iter) {
-            int dist = StringEditDistance(str, iter->first, maxDelta+1);
-            if (dist <= maxDelta)
-                matches[dist].push_back(iter->first);
-        }
+    FunctionMapType::const_iterator iter;
+    for (iter = functions.begin(); iter != functions.end(); ++iter) {
+        int dist = StringEditDistance(str, iter->first, maxDelta+1);
+        if (dist <= maxDelta)
+            matches[dist].push_back(iter->first);
+    }
// Now, return the first entry of matches[] that is non-empty, if any.
@@ -346,15 +331,13 @@ SymbolTable::Print() {
}
fprintf(stderr, "Functions:\n----------------\n");
-    for (int i = 0; i < (int)functions.size(); ++i) {
-        FunctionMapType::iterator fiter = functions[i]->begin();
-        while (fiter != functions[i]->end()) {
-            fprintf(stderr, "%s\n", fiter->first.c_str());
-            std::vector<Symbol *> &syms = fiter->second;
-            for (unsigned int j = 0; j < syms.size(); ++j)
-                fprintf(stderr, "    %s\n", syms[j]->type->GetString().c_str());
-            ++fiter;
-        }
+    FunctionMapType::iterator fiter = functions.begin();
+    while (fiter != functions.end()) {
+        fprintf(stderr, "%s\n", fiter->first.c_str());
+        std::vector<Symbol *> &syms = fiter->second;
+        for (unsigned int j = 0; j < syms.size(); ++j)
+            fprintf(stderr, "    %s\n", syms[j]->type->GetString().c_str());
+        ++fiter;
+    }
depth = 0;

sym.h

@@ -257,12 +257,13 @@ private:
typedef std::map<std::string, Symbol *> SymbolMapType;
std::vector<SymbolMapType *> variables;
-    /** Function declarations are also scoped.  A STL \c vector is used to
-        store the function symbols for a given name since, due to function
-        overloading, a name can have multiple function symbols associated
-        with it. */
+    /** Function declarations are *not* scoped.  (C99, for example, allows
+        an implementation to maintain function declarations in a single
+        namespace.)  A STL \c vector is used to store the function symbols
+        for a given name since, due to function overloading, a name can
+        have multiple function symbols associated with it. */
    typedef std::map<std::string, std::vector<Symbol *> > FunctionMapType;
-    std::vector<FunctionMapType *> functions;
+    FunctionMapType functions;
/** Type definitions can also be scoped. A new \c TypeMapType
is added to the back of the \c types \c vector each time a new scope
@@ -278,15 +279,12 @@ SymbolTable::GetMatchingFunctions(Predicate pred,
std::vector<Symbol *> *matches) const {
// Iterate through all function symbols and apply the given predicate.
// If it returns true, add the Symbol * to the provided vector.
-    for (unsigned int i = 0; i < functions.size(); ++i) {
-        FunctionMapType &fm = *(functions[i]);
-        FunctionMapType::const_iterator iter;
-        for (iter = fm.begin(); iter != fm.end(); ++iter) {
-            const std::vector<Symbol *> &syms = iter->second;
-            for (unsigned int j = 0; j < syms.size(); ++j) {
-                if (pred(syms[j]))
-                    matches->push_back(syms[j]);
-            }
+    FunctionMapType::const_iterator iter;
+    for (iter = functions.begin(); iter != functions.end(); ++iter) {
+        const std::vector<Symbol *> &syms = iter->second;
+        for (unsigned int j = 0; j < syms.size(); ++j) {
+            if (pred(syms[j]))
+                matches->push_back(syms[j]);
+        }
}
}
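Taken together, the sym.cpp and sym.h changes replace a stack of per-scope function maps with one flat name-to-overload-list map, while variables remain scoped. A hypothetical Python model of the resulting structure (method and class names are illustrative, not the C++ API):

class SymbolTable(object):
    def __init__(self):
        self.variables = [{}]            # stack of per-scope maps
        self.functions = {}              # single map: name -> overloads

    def push_scope(self):
        self.variables.append({})

    def pop_scope(self):
        self.variables.pop()

    def add_function(self, name, sym):
        self.functions.setdefault(name, []).append(sym)

    def lookup_function(self, name):
        return self.functions.get(name, [])

st = SymbolTable()
st.add_function("min", "min(int,int)")
st.add_function("min", "min(float,float)")
assert len(st.lookup_function("min")) == 2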


@@ -46,7 +46,6 @@
#include <assert.h>
#include <string.h>
#include <stdio.h>
-#include <assert.h>
#include <stdint.h>
#ifdef ISPC_IS_LINUX
#include <malloc.h>

tests/atomics-swap.ispc (new file)

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
uniform int32 s = 1234;
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0;
if (programIndex & 1) {
b = atomic_swap_global(&s, programIndex);
}
RET[programIndex] = reduce_add(b) + s;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);
}

tests/goto-1.ispc (new file)

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0.; b = a;
RET[programIndex] = a+b;
goto skip;
RET[programIndex] = 0;
skip:
;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2 + 2*programIndex;
}

tests/goto-2.ispc (new file)

@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0.; b = a;
RET[programIndex] = a+b;
if (all(a != 0))
goto skip;
RET[programIndex] = 0;
skip:
;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2 + 2*programIndex;
}

tests/goto-3.ispc (new file)

@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0.; b = a;
RET[programIndex] = a+b;
if (all(a == 0))
goto skip;
RET[programIndex] = 0;
skip:
;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

tests/goto-4.ispc (new file)

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = 0.; b = a;
RET[programIndex] = 0;
encore:
++RET[programIndex];
if (any(a != 0)) {
a = max(a-1, 0);
goto encore;
}
}
export void result(uniform float RET[]) {
RET[programIndex] = programCount+1;
}


@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
a *= 1k;
RET[programIndex] = a;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1024*(programIndex+1);
}


@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = b + 2M;
RET[programIndex] = a;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2*1024*1024 + 5;
}


@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
unsigned int32 a = 3G;
a -= 2G;
a -= 1024M;
RET[programIndex] = a;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}


@@ -1,35 +0,0 @@
static float float4(uniform float a, uniform float b, uniform float c,
uniform float d) {
float ret = 0;
for (uniform int i = 0; i < programCount; i += 4) {
ret = insert(ret, i + 0, a);
ret = insert(ret, i + 1, b);
ret = insert(ret, i + 2, c);
ret = insert(ret, i + 3, d);
}
return ret;
}
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
uniform int ret = 0;
float v = float4(1,1,0,0);
bool b = (v == 1.);
ret = __movmsk((sign_extend(b)));
RET[programIndex] = ret;
}
// fixme for 16-wide...
export void result(uniform float RET[]) {
uniform int x = -1234;
if (programCount == 4) x = 3;
else if (programCount == 8) x = 0x33;
else if (programCount == 16) x = 0x3333;
RET[programIndex] = x;
}


@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
a[programIndex] = aFOO[programIndex];
uniform float * uniform ptr = a;
*(ptr+1) = 0;
RET[programIndex] = a[programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = 1+programIndex;
RET[1] = 0;
}


@@ -0,0 +1,15 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
a[programIndex] = aFOO[programIndex];
uniform float * varying ptr = a;
*(ptr+programIndex) = 0;
RET[programIndex] = a[programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

tests/switch-1.ispc (new file)

@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
switch (b) {
default:
RET[programIndex] = -1;
break;
case 5:
RET[programIndex] = 0;
}
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

tests/switch-10.ispc (new file)

@@ -0,0 +1,44 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (a) {
case 3:
return 1;
case 7:
case 6:
case 4:
case 5:
if (a & 1)
break;
return 2;
case 1: {
switch (a+b) {
case 6:
return 42;
default:
break;
}
return -1234;
}
case 32:
*((int *)NULL) = 0;
default:
return 0;
}
return 3;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 42;
RET[2] = 1;
RET[6] = RET[4] = 3;
RET[5] = RET[3] = 2;
}

tests/switch-11.ispc (new file)

@@ -0,0 +1,50 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (a) {
case 3:
return 1;
case 7:
case 6:
case 4:
case 5:
if (a & 1)
break;
return 2;
case 1: {
switch (a+b) {
case 60:
return -1234;
default:
break;
case 6:
if (b == 5)
break;
return -42;
case 12:
return -1;
}
return 42;
}
case 32:
*((int *)NULL) = 0;
default:
return 0;
}
return 3;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 42;
RET[2] = 1;
RET[6] = RET[4] = 3;
RET[5] = RET[3] = 2;
}

tests/switch-12.ispc (new file)

@@ -0,0 +1,54 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (a) {
case 3:
return 1;
case 7:
case 6:
case 4:
case 5:
if (a & 1)
break;
return 2;
case 1: {
switch (a+b) {
case 60:
return -1234;
default:
break;
case 6:
int count = 0;
for (count = 0; count < 10; ++count) {
a += b;
if (a == 11)
break;
}
return a;
case 12:
return -1;
}
return 42;
}
case 32:
*((int *)NULL) = 0;
default:
return 0;
}
return 3;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 11;
RET[2] = 1;
RET[6] = RET[4] = 3;
RET[5] = RET[3] = 2;
}

tests/switch-13.ispc (new file)

@@ -0,0 +1,28 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
int r = -1;
switch (b) {
case 5:
if (a & 1) {
r=3;
break;
}
r= 2;
break;
default:
r= 3;
}
return r;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = (programIndex & 1) ? 2 : 3;
}

tests/switch-14.ispc (new file)

@@ -0,0 +1,24 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
switch (b) {
case 5:
if (a & 1)
break;
return 2;
default:
return 42;
}
return 3;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = (programIndex & 1) ? 2 : 3;
}

tests/switch-2.ispc (new file)

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
switch (b) {
default:
RET[programIndex] = -1;
case 5:
RET[programIndex] = 0;
}
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

tests/switch-3.ispc (new file)

@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
switch (b) {
case 5:
RET[programIndex] = 0;
break;
default:
RET[programIndex] = -1;
}
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

tests/switch-4.ispc (new file)

@@ -0,0 +1,24 @@
export uniform int width() { return programCount; }
int switchit(int a, uniform int b) {
int r = 0;
switch (a) {
case 3:
r = 1;
break;
default:
r = 0;
}
return r;
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int a = aFOO[programIndex];
int x = switchit(a, b);
RET[programIndex] = x;
}
export void result(uniform float RET[]) {
RET[programIndex] = (programIndex == 2) ? 1 : 0;
}

Some files were not shown because too many files have changed in this diff.