Merge remote-tracking branch 'matt/master'

2012-01-26 10:41:13 -08:00
parent f2b99ccb08 1867b5b317
commit ee8b6ebbf6
100 changed files with 5850 additions and 2116 deletions
--- a/28
+++ b/28
@@ -3,6 +3,11 @@
 #

 ARCH_OS = $(shell uname)
+ifeq ($(ARCH_OS), Darwin)
+	ARCH_OS2 = "OSX"
+else
+	ARCH_OS2 = $(shell uname -o)
+endif
 ARCH_TYPE = $(shell arch)

 ifeq ($(shell llvm-config --version), 3.1svn)
@@ -26,7 +31,15 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
             -lclangAnalysis -lclangAST -lclangLex -lclangBasic

 ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
-	-lpthread -ldl
+	-lpthread
+
+ifeq ($(ARCH_OS),Linux)
+	ISPC_LIBS += -ldl
+endif
+
+ifeq ($(ARCH_OS2),Msys)
+	ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
+endif

 LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
 LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
@@ -58,7 +71,8 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16
+TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
+	generic-16
 BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
 	builtins/dispatch.ll
 BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
@@ -129,22 +143,22 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc

 objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
 	@echo Creating C++ source from builtins definition file $<
-	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | ./bitcode2cpp.py $< > $@
+	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | python bitcode2cpp.py $< > $@

 objs/builtins-c-32.cpp: builtins/builtins.c
 	@echo Creating C++ source from builtins definition file $<
-	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-32 > $@
+	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-32 > $@

 objs/builtins-c-64.cpp: builtins/builtins.c
 	@echo Creating C++ source from builtins definition file $<
-	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-64 > $@
+	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c-64 > $@

 objs/stdlib_generic_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for generic
 	@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
-		./stdlib2cpp.py generic > $@
+		python stdlib2cpp.py generic > $@

 objs/stdlib_x86_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for x86
 	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \
-		./stdlib2cpp.py x86 > $@
+		python stdlib2cpp.py x86 > $@
--- a/README.rst
+++ b/README.rst
@@ -47,7 +47,7 @@ remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
 code generation and optimization and is `hosted on
 github <http://github.com/ispc/ispc/>`_. It supports Windows, Mac, and
 Linux, with both x86 and x86-64 targets.  It currently supports the SSE2,
-SSE4, and AVX instruction sets.
+SSE4, AVX1, and AVX2 instruction sets.

 Features
 --------
--- a/ast.cpp
+++ b/ast.cpp
@@ -90,7 +90,11 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
        DoStmt *dos;
        ForStmt *fs;
        ForeachStmt *fes;
+        CaseStmt *cs;
+        DefaultStmt *defs;
+        SwitchStmt *ss;
        ReturnStmt *rs;
+        LabeledStmt *ls;
        StmtList *sl;
        PrintStmt *ps;
        AssertStmt *as;
@@ -130,10 +134,21 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
                                                   postFunc, data);
            fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
        }
-        else if (dynamic_cast<BreakStmt *>(node) != NULL ||
-                 dynamic_cast<ContinueStmt *>(node) != NULL) {
-            // nothing 
+        else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
+            cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
+        else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
+            defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
+        else if ((ss = dynamic_cast<SwitchStmt *>(node)) != NULL) {
+            ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
+            ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
        }
+        else if (dynamic_cast<BreakStmt *>(node) != NULL ||
+                 dynamic_cast<ContinueStmt *>(node) != NULL ||
+                 dynamic_cast<GotoStmt *>(node) != NULL) {
+            // nothing
+        }
+        else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
+            ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
        else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
            rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data);
        else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
@@ -151,7 +166,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
    else {
        ///////////////////////////////////////////////////////////////////////////
        // Handle expressions
-        assert(dynamic_cast<Expr *>(node) != NULL);
+        Assert(dynamic_cast<Expr *>(node) != NULL);
        UnaryExpr *ue;
        BinaryExpr *be;
        AssignExpr *ae;
@@ -289,3 +304,4 @@ EstimateCost(ASTNode *root) {
    WalkAST(root, lCostCallback, NULL, &cost);
    return cost;
 }
+
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -386,6 +386,7 @@ lSetInternalFunctions(llvm::Module *module) {
        "__ceil_uniform_float",
        "__ceil_varying_double",
        "__ceil_varying_float",
+        "__clock",
        "__count_trailing_zeros_i32",
        "__count_trailing_zeros_i64",
        "__count_leading_zeros_i32",
@@ -717,11 +718,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        extern int builtins_bitcode_sse4_x2_length;
        switch (g->target.vectorWidth) {
        case 4: 
-            AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length, 
+            AddBitcodeToModule(builtins_bitcode_sse4,
+                               builtins_bitcode_sse4_length, 
                               module, symbolTable);
            break;
        case 8:
-            AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length, 
+            AddBitcodeToModule(builtins_bitcode_sse4_x2, 
+                               builtins_bitcode_sse4_x2_length, 
                               module, symbolTable);
            break;
        default:
@@ -729,18 +732,39 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        }
        break;
    case Target::AVX:
-    case Target::AVX2:
        switch (g->target.vectorWidth) {
        case 8:
-            extern unsigned char builtins_bitcode_avx[];
-            extern int builtins_bitcode_avx_length;
-            AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length, 
+            extern unsigned char builtins_bitcode_avx1[];
+            extern int builtins_bitcode_avx1_length;
+            AddBitcodeToModule(builtins_bitcode_avx1, 
+                               builtins_bitcode_avx1_length, 
                               module, symbolTable);
            break;
        case 16:
-            extern unsigned char builtins_bitcode_avx_x2[];
-            extern int builtins_bitcode_avx_x2_length;
-            AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
+            extern unsigned char builtins_bitcode_avx1_x2[];
+            extern int builtins_bitcode_avx1_x2_length;
+            AddBitcodeToModule(builtins_bitcode_avx1_x2, 
+                               builtins_bitcode_avx1_x2_length,
+                               module,  symbolTable);
+            break;
+        default:
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    case Target::AVX2:
+        switch (g->target.vectorWidth) {
+        case 8:
+            extern unsigned char builtins_bitcode_avx2[];
+            extern int builtins_bitcode_avx2_length;
+            AddBitcodeToModule(builtins_bitcode_avx2, 
+                               builtins_bitcode_avx2_length, 
+                               module, symbolTable);
+            break;
+        case 16:
+            extern unsigned char builtins_bitcode_avx2_x2[];
+            extern int builtins_bitcode_avx2_x2_length;
+            AddBitcodeToModule(builtins_bitcode_avx2_x2, 
+                               builtins_bitcode_avx2_x2_length,
                               module,  symbolTable);
            break;
        default:
@@ -798,6 +822,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
                           symbolTable);

+    lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
+                       module, symbolTable);
+
    if (includeStdlibISPC) {
        // If the user wants the standard library to be included, parse the
        // serialized version of the stdlib.ispc file to get its
--- a/builtins/builtins.c
+++ b/builtins/builtins.c
@@ -149,7 +149,7 @@ void __do_print(const char *format, const char *types, int width, int mask,


 int __num_cores() {
-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(__MINGW32__)
    // This is quite a hack.  Including all of windows.h to get this definition
    // pulls in a bunch of stuff that leads to undefined symbols at link time.
    // So we don't #include <windows.h> but instead have the equivalent declarations
--- a/builtins/dispatch.ll
+++ b/builtins/dispatch.ll
@@ -48,23 +48,42 @@ declare void @abort() noreturn
 ;; corresponding to one of the Target::ISA enumerant values that gives the
 ;; most capable ISA that the current system can run.
 ;;
-;; #ifdef _MSC_VER
-;; extern void __stdcall __cpuid(int info[4], int infoType);
-;; #else
+;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum
+;; backwards compatibility for anyone building ispc with LLVM 2.9.
+;;
+;; #include <stdint.h>
+;; #include <stdlib.h>
+;; 
 ;; static void __cpuid(int info[4], int infoType) {
 ;;     __asm__ __volatile__ ("cpuid"
 ;;                           : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
 ;;                           : "0" (infoType));
 ;; }
-;; #endif
+;; 
+;; /* Save %ebx in case it's the PIC register */
+;; static void __cpuid_count(int info[4], int level, int count) {
+;;   __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
+;;                         "cpuid\n\t"
+;;                         "xchg{l}\t{%%}ebx, %1\n\t"
+;;                         : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
+;;                         : "0" (level), "2" (count));
+;; }
 ;; 
 ;; int32_t __get_system_isa() {
 ;;     int info[4];
 ;;     __cpuid(info, 1);
+;; 
 ;;     /* NOTE: the values returned below must be the same as the
 ;;        corresponding enumerant values in Target::ISA. */
-;;     if ((info[2] & (1 << 28)) != 0)
-;;         return 2; // AVX
+;;     if ((info[2] & (1 << 28)) != 0) {
+;;         // AVX1 for sure. Do we have AVX2?
+;;         // Call cpuid with eax=7, ecx=0
+;;         __cpuid_count(info, 7, 0);
+;;         if ((info[1] & (1 << 5)) != 0)
+;;             return 3; // AVX2
+;;         else
+;;             return 2; // AVX1
+;;     }
 ;;     else if ((info[2] & (1 << 19)) != 0)
 ;;         return 1; // SSE4
 ;;     else if ((info[3] & (1 << 26)) != 0)
@@ -76,33 +95,42 @@ declare void @abort() noreturn
 %0 = type { i32, i32, i32, i32 }

 define i32 @__get_system_isa() nounwind ssp {
-  %1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
-  %2 = extractvalue %0 %1, 2
-  %3 = extractvalue %0 %1, 3
-  %4 = and i32 %2, 268435456
-  %5 = icmp eq i32 %4, 0
-  br i1 %5, label %6, label %13
+entry:
+  %0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+  %asmresult9.i = extractvalue %0 %0, 2
+  %asmresult10.i = extractvalue %0 %0, 3
+  %and = and i32 %asmresult9.i, 268435456
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %if.else7, label %if.then

-; <label>:6                                       ; preds = %0
-  %7 = and i32 %2, 524288
-  %8 = icmp eq i32 %7, 0
-  br i1 %8, label %9, label %13
+if.then:                                          ; preds = %entry
+  %1 = tail call %0 asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
+  %asmresult9.i24 = extractvalue %0 %1, 1
+  %and4 = lshr i32 %asmresult9.i24, 5
+  %2 = and i32 %and4, 1
+  %3 = or i32 %2, 2
+  br label %return

-; <label>:9                                       ; preds = %6
-  %10 = and i32 %3, 67108864
-  %11 = icmp eq i32 %10, 0
-  br i1 %11, label %12, label %13
+if.else7:                                         ; preds = %entry
+  %and10 = and i32 %asmresult9.i, 524288
+  %cmp11 = icmp eq i32 %and10, 0
+  br i1 %cmp11, label %if.else13, label %return

-; <label>:12                                      ; preds = %9
+if.else13:                                        ; preds = %if.else7
+  %and16 = and i32 %asmresult10.i, 67108864
+  %cmp17 = icmp eq i32 %and16, 0
+  br i1 %cmp17, label %if.else19, label %return
+
+if.else19:                                        ; preds = %if.else13
  tail call void @abort() noreturn nounwind
  unreachable

-; <label>:13                                      ; preds = %9, %6, %0
-  %.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
-  ret i32 %.0
+return:                                           ; preds = %if.else13, %if.else7, %if.then
+  %retval.0 = phi i32 [ %3, %if.then ], [ 1, %if.else7 ], [ 0, %if.else13 ]
+  ret i32 %retval.0
 }

-
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; This function is called by each of the dispatch functions we generate;
 ;; it sets @__system_best_isa if it is unset.

--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -170,33 +170,6 @@ define <16 x float> @__min_varying_float(<16 x float>,
 }


-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
-
-define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
-  ret <16 x i32> %ret
-}
-
-define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-  ret <16 x i32> %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unsigned int min/max
-
-define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
-  ret <16 x i32> %ret
-}
-
-define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-  ret <16 x i32> %ret
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops

@@ -622,12 +595,7 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
-
-gen_gather(16, i8)
-gen_gather(16, i16)
-gen_gather(16, i32)
-gen_gather(16, i64)
+;; scatter

 gen_scatter(16, i8)
 gen_scatter(16, i16)
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -170,33 +170,6 @@ define <8 x float> @__min_varying_float(<8 x float>,
 }


-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
-
-define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
-  ret <8 x i32> %ret
-}
-
-define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-  ret <8 x i32> %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unsigned int min/max
-
-define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
-  ret <8 x i32> %ret
-}
-
-define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-  ret <8 x i32> %ret
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops

@@ -238,7 +211,7 @@ reduce_equal(8)
 ;; horizontal int32 ops

 define <8 x i32> @__add_varying_int32(<8 x i32>,
-                                               <8 x i32>) nounwind readnone alwaysinline {
+                                      <8 x i32>) nounwind readnone alwaysinline {
  %s = add <8 x i32> %0, %1
  ret <8 x i32> %s
 }
@@ -314,7 +287,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline
 ;; horizontal int64 ops

 define <8 x i64> @__add_varying_int64(<8 x i64>,
-                                               <8 x i64>) nounwind readnone alwaysinline {
+                                      <8 x i64>) nounwind readnone alwaysinline {
  %s = add <8 x i64> %0, %1
  ret <8 x i64> %s
 }
@@ -403,9 +376,6 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

-; FIXME: there is no AVX instruction for these, but we could be clever
-; by packing the bits down and setting the last 3/4 or half, respectively,
-; of the mask to zero...  Not sure if this would be a win in the end
 gen_masked_store(8, i8, 8)
 gen_masked_store(8, i16, 16)

@@ -520,12 +490,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,


 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
-
-gen_gather(8, i8)
-gen_gather(8, i16)
-gen_gather(8, i32)
-gen_gather(8, i64)
+;; scatter

 gen_scatter(8, i8)
 gen_scatter(8, i16)
--- a/builtins/target-avx1-x2.ll
+++ b/builtins/target-avx1-x2.ll
@@ -0,0 +1,77 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+include(`target-avx-x2.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+gen_gather(16, i8)
+gen_gather(16, i16)
+gen_gather(16, i32)
+gen_gather(16, i64)
+
+
--- a/builtins/target-avx1.ll
+++ b/builtins/target-avx1.ll
@@ -0,0 +1,75 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+include(`target-avx.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <8 x i32> %ret
+}
+
+define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <8 x i32> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <8 x i32> %ret
+}
+
+define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <8 x i32> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+gen_gather(8, i8)
+gen_gather(8, i16)
+gen_gather(8, i32)
+gen_gather(8, i64)
--- a/builtins/target-avx2-x2.ll
+++ b/builtins/target-avx2-x2.ll
@@ -0,0 +1,129 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+include(`target-avx-x2.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
+
+define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
+  ret <16 x i32> %m
+}
+
+define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
+  ret <16 x i32> %m
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
+
+define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
+  ret <16 x i32> %m
+}
+
+define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
+  ret <16 x i32> %m
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; the i32 operand selects the rounding mode; 0 is round to nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
+  %r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
+             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
+  %r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
+             <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
+  %r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1, 
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x float> %r
+}
+
+define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
+  %r_0 = shufflevector <16 x float> %v, <16 x float> undef,
+             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
+  %r_1 = shufflevector <16 x float> %v, <16 x float> undef,
+             <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
+  %r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1, 
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %r
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  ; round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+gen_gather(16, i8)
+gen_gather(16, i16)
+gen_gather(16, i32)
+gen_gather(16, i64)
+
+
--- a/builtins/target-avx2.ll
+++ b/builtins/target-avx2.ll
@@ -0,0 +1,110 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+include(`target-avx.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
+
+define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %smin = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)   ; AVX2 pmins.d on both operands
+  ret <8 x i32> %smin
+}
+
+define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %smax = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)   ; AVX2 pmaxs.d on both operands
+  ret <8 x i32> %smax
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
+
+define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %umin = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)   ; AVX2 pminu.d on both operands
+  ret <8 x i32> %umin
+}
+
+define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %umax = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)   ; AVX2 pmaxu.d on both operands
+  ret <8 x i32> %umax
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
+  %as_float = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)   ; all 8 elements converted in one call
+  ret <8 x float> %as_float
+}
+
+define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
+  %as_half = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)   ; i32 0 is round nearest even
+  ret <8 x i16> %as_half
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %as_vec1 = bitcast i16 %v to <1 x i16>
+  %wide = shufflevector <1 x i16> %as_vec1, <1 x i16> undef,    ; element 0 holds %v
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  %wide_f = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %wide)
+  %result = extractelement <8 x float> %wide_f, i32 0            ; only element 0 is returned
+  ret float %result
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %as_vec1 = bitcast float %v to <1 x float>
+  %wide = shufflevector <1 x float> %as_vec1, <1 x float> undef,    ; element 0 holds %v
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  ; the i32 0 operand selects round to nearest even
+  %wide_h = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %wide, i32 0)
+  %result = extractelement <8 x i16> %wide_h, i32 0                  ; only element 0 is returned
+  ret i16 %result
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+gen_gather(8, i8)
+gen_gather(8, i16)
+gen_gather(8, i32)
+gen_gather(8, i64)
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -233,7 +233,7 @@ declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) noun
 declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly

 declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
-                                <WIDTH x i1>) nounwind 
+                               <WIDTH x i1>) nounwind 
 declare void @__masked_store_16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
                                <WIDTH x i1>) nounwind 
 declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
@@ -241,8 +241,9 @@ declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
 declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
                                <WIDTH x i1> %mask) nounwind 

+ifelse(LLVM_VERSION, `LLVM_3_1svn',`
 define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
-                                     <WIDTH x i1>) nounwind {
+                                     <WIDTH x i1>) nounwind alwaysinline {
  %v = load <WIDTH x i8> * %0
  %v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
  store <WIDTH x i8> %v1, <WIDTH x i8> * %0
@@ -250,7 +251,7 @@ define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
 }

 define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
-                                     <WIDTH x i1>) nounwind {
+                                     <WIDTH x i1>) nounwind alwaysinline {
  %v = load <WIDTH x i16> * %0
  %v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
  store <WIDTH x i16> %v1, <WIDTH x i16> * %0
@@ -258,7 +259,7 @@ define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
 }

 define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
-                                     <WIDTH x i1>) nounwind {
+                                     <WIDTH x i1>) nounwind alwaysinline {
  %v = load <WIDTH x i32> * %0
  %v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
  store <WIDTH x i32> %v1, <WIDTH x i32> * %0
@@ -266,30 +267,40 @@ define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
 }

 define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
-                                     <WIDTH x i64>, <WIDTH x i1>) nounwind {
+                            <WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
  %v = load <WIDTH x i64> * %0
  %v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
  store <WIDTH x i64> %v1, <WIDTH x i64> * %0
  ret void
 }
+',`
+declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
+                                     <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
+                                      <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
+                                      <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
+                                      <WIDTH x i1> %mask) nounwind 
+')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter

 define(`gather_scatter', `
 declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
-                        i32, <WIDTH x i1>) nounwind readonly 
+                        i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly 
 declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
-                        i32, <WIDTH x i1>) nounwind readonly 
+                        i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly 
 declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>, 
                                    <WIDTH x i1>) nounwind readonly 
 declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>, 
                                    <WIDTH x i1>) nounwind readonly 

 declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
-                  i32, <WIDTH x $1>, <WIDTH x i1>) nounwind 
+                  i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind 
 declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
-                  i32, <WIDTH x $1>, <WIDTH x i1>) nounwind 
+                  i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind 
 declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
                             <WIDTH x i1>) nounwind 
 declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -47,6 +47,14 @@ int64minmax()

 include(`target-sse2-common.ll')

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -44,6 +44,14 @@ int64minmax()

 include(`target-sse2-common.ll')

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
 ;;
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -47,6 +47,14 @@ int64minmax()

 include(`target-sse4-common.ll')

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -44,6 +44,14 @@ int64minmax()

 include(`target-sse4-common.ll')

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -760,14 +760,12 @@ define(`global_atomic_uniform', `
 ifelse(LLVM_VERSION, `LLVM_2_9',`
 declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)

-define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
-                                         <$1 x MASK> %mask) nounwind alwaysinline {
+define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
  %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
  ret $3 %r
 }
 ', `
-define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
-                                         <$1 x MASK> %mask) nounwind alwaysinline {
+define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
  %r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst
  ret $3 %r
 }
@@ -786,26 +784,7 @@ declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
 declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)')

 define(`global_swap', `
-
-define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
-                                          <$1 x MASK> %mask) nounwind alwaysinline {
-  %rptr = alloca <$1 x $2>
-  %rptr32 = bitcast <$1 x $2> * %rptr to $2 *
-
-  per_lane($1, <$1 x MASK> %mask, `
-   %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
-ifelse(LLVM_VERSION, `LLVM_2_9',`
-   %r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', `
-   %r_LANE_ID = atomicrmw xchg $2 * %ptr, $2 %val_LANE_ID seq_cst')
-   %rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
-   store $2 %r_LANE_ID, $2 * %rp_LANE_ID')
-
-  %r = load <$1 x $2> * %rptr
-  ret <$1 x $2> %r
-}
-
-define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
-                                           <$1 x MASK> %mask) nounwind alwaysinline {
+define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline {
 ifelse(LLVM_VERSION, `LLVM_2_9',`
 %r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', `
 %r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst')
@@ -845,7 +824,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',`
 }

 define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
-                               $2 %val, <$1 x MASK> %mask) nounwind alwaysinline {
+                                                       $2 %val) nounwind alwaysinline {
 ifelse(LLVM_VERSION, `LLVM_2_9',`
  %r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', `
  %r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst')
@@ -1586,17 +1565,15 @@ declare void @__pseudo_masked_store_64(<WIDTH x i64> * nocapture, <WIDTH x i64>,
 ; these represent gathers from a common base pointer with offsets.  The
 ; offset_scale factor scales the offsets before they are added to the base
 ; pointer--it should have the value 1, 2, 4, or 8.  (It can always just be 1.)
-; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8 scaling
-; available in x86 addressing calculations... 
+; Then the offset_delta value (guaranteed to be a compile-time constant) is
+; added to form the final address. The 2, 4, 8 scales are used to match LLVM patterns
+; that use the free 2/4/8 scaling available in x86 addressing calculations, and
+; offset_delta feeds into the free offset calculation.
 ;
-; varying int8  __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base, 
-;                                    int{32,64} offsets, int32 offset_scale, mask)
-; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base, 
-;                                    int{32,64} offsets, int32 offset_scale, mask)
-; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base, 
-;                                    int{32,64} offsets, int32 offset_scale, mask)
-; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base, 
-;                                    int{32,64} offsets, int32 offset_scale, mask)
+; varying int{8,16,32,64}
+; __pseudo_gather_base_offsets{32,64}_{8,16,32,64}(uniform int8 *base,
+;                                    int{32,64} offsets, uniform int32 offset_scale, 
+;                                    int{32,64} offset_delta, mask)
 ;
 ; Then, the GSImprovementsPass optimizations finds these and either
 ; converts them to native gather functions or converts them to vector
@@ -1612,22 +1589,22 @@ declare <WIDTH x i16> @__pseudo_gather64_16(<WIDTH x i64>, <WIDTH x MASK>) nounw
 declare <WIDTH x i32> @__pseudo_gather64_32(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
 declare <WIDTH x i64> @__pseudo_gather64_64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly

-declare <WIDTH x i8>  @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32,
-                                                     <WIDTH x MASK>) nounwind readonly
-declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32,
+declare <WIDTH x i8>  @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
+                                                        <WIDTH x MASK>) nounwind readonly
+declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                      <WIDTH x MASK>) nounwind readonly
-declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32,
+declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                      <WIDTH x MASK>) nounwind readonly
-declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32,
+declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                      <WIDTH x MASK>) nounwind readonly

-declare <WIDTH x i8>  @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32,
+declare <WIDTH x i8>  @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                     <WIDTH x MASK>) nounwind readonly
-declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32,
+declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                      <WIDTH x MASK>) nounwind readonly
-declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32,
+declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                      <WIDTH x MASK>) nounwind readonly
-declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32,
+declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                      <WIDTH x MASK>) nounwind readonly

 ; Similarly to the pseudo-gathers defined above, we also declare undefined
@@ -1642,13 +1619,9 @@ declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i3
 ; transforms them to scatters like:
 ;
 ; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base, 
-;             varying int32 offsets, int32 offset_scale, varying int8 values, mask)
-; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base, 
-;             varying int32 offsets, int32 offset_scale, varying int16 values, mask)
-; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base, 
-;             varying int32 offsets, int32 offset_scale, varying int32 values, mask)
-; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base, 
-;             varying int32 offsets, int32 offset_scale, varying int64 values, mask)
+;             varying int{32,64} offsets, uniform int32 offset_scale,
+;             varying int{32,64} offset_delta, varying int8 values, mask)
+; (and similarly for 16/32/64 bit values)
 ;
 ; And the GSImprovementsPass in turn converts these to actual native
 ; scatters or masked stores.  
@@ -1663,22 +1636,22 @@ declare void @__pseudo_scatter64_16(<WIDTH x i64>, <WIDTH x i16>, <WIDTH x MASK>
 declare void @__pseudo_scatter64_32(<WIDTH x i64>, <WIDTH x i32>, <WIDTH x MASK>) nounwind
 declare void @__pseudo_scatter64_64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind

-declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32,
+declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                <WIDTH x i8>, <WIDTH x MASK>) nounwind
-declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32,
+declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                 <WIDTH x i16>, <WIDTH x MASK>) nounwind
-declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32,
+declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                 <WIDTH x i32>, <WIDTH x MASK>) nounwind
-declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32,
+declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
                                                 <WIDTH x i64>, <WIDTH x MASK>) nounwind

-declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32,
+declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                <WIDTH x i8>, <WIDTH x MASK>) nounwind
-declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32,
+declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                 <WIDTH x i16>, <WIDTH x MASK>) nounwind
-declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32,
+declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                 <WIDTH x i32>, <WIDTH x MASK>) nounwind
-declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32,
+declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                 <WIDTH x i64>, <WIDTH x MASK>) nounwind

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1832,6 +1805,22 @@ ok:
  ret void
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; read hw clock
+
+define i64 @__clock() nounwind uwtable ssp {
+entry:
+  tail call void asm sideeffect "xorl %eax,%eax \0A    cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
+  %tsc = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
+  %lo32 = extractvalue { i32, i32 } %tsc, 0        ; first asm output - constraint ={ax}
+  %hi32 = extractvalue { i32, i32 } %tsc, 1        ; second asm output - constraint ={dx}
+  %lo64 = zext i32 %lo32 to i64
+  %hi64 = zext i32 %hi32 to i64
+  %hi_shifted = shl nuw i64 %hi64, 32              ; move the dx half into bits 32-63
+  %ticks = or i64 %hi_shifted, %lo64
+  ret i64 %ticks
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; stdlib transcendentals
 ;;
@@ -1997,38 +1986,18 @@ global_atomic_uniform(WIDTH, umax, i64, uint64)
 global_swap(WIDTH, i32, int32)
 global_swap(WIDTH, i64, int64)

-define <WIDTH x float> @__atomic_swap_float_global(float * %ptr, <WIDTH x float> %val,
-                                                   <WIDTH x MASK> %mask) nounwind alwaysinline {
-  %iptr = bitcast float * %ptr to i32 *
-  %ival = bitcast <WIDTH x float> %val to <WIDTH x i32>
-  %iret = call <WIDTH x i32> @__atomic_swap_int32_global(i32 * %iptr, <WIDTH x i32> %ival, <WIDTH x MASK> %mask)
-  %ret = bitcast <WIDTH x i32> %iret to <WIDTH x float>
-  ret <WIDTH x float> %ret
-}
-
-define <WIDTH x double> @__atomic_swap_double_global(double * %ptr, <WIDTH x double> %val,
-                                                   <WIDTH x MASK> %mask) nounwind alwaysinline {
-  %iptr = bitcast double * %ptr to i64 *
-  %ival = bitcast <WIDTH x double> %val to <WIDTH x i64>
-  %iret = call <WIDTH x i64> @__atomic_swap_int64_global(i64 * %iptr, <WIDTH x i64> %ival, <WIDTH x MASK> %mask)
-  %ret = bitcast <WIDTH x i64> %iret to <WIDTH x double>
-  ret <WIDTH x double> %ret
-}
-
-define float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
-                                                   <WIDTH x MASK> %mask) nounwind alwaysinline {
+define float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline {
  %iptr = bitcast float * %ptr to i32 *
  %ival = bitcast float %val to i32
-  %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <WIDTH x MASK> %mask)
+  %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival)
  %ret = bitcast i32 %iret to float
  ret float %ret
 }

-define double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
-                                                   <WIDTH x MASK> %mask) nounwind alwaysinline {
+define double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline {
  %iptr = bitcast double * %ptr to i64 *
  %ival = bitcast double %val to i64
-  %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <WIDTH x MASK> %mask)
+  %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival)
  %ret = bitcast i64 %iret to double
  ret double %ret
 }
@@ -2058,24 +2027,23 @@ define <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
  ret <WIDTH x double> %ret
 }

-define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
-                                                   <WIDTH x MASK> %mask) nounwind alwaysinline {
+define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
+                                                             float %val) nounwind alwaysinline {
  %iptr = bitcast float * %ptr to i32 *
  %icmp = bitcast float %cmp to i32
  %ival = bitcast float %val to i32
  %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
-                                                                   i32 %ival, <WIDTH x MASK> %mask)
+                                                                   i32 %ival)
  %ret = bitcast i32 %iret to float
  ret float %ret
 }

 define double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
-                                            double %val, <WIDTH x MASK> %mask) nounwind alwaysinline {
+                                                               double %val) nounwind alwaysinline {
  %iptr = bitcast double * %ptr to i64 *
  %icmp = bitcast double %cmp to i64
  %ival = bitcast double %val to i64
-  %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
-                                                                   i64 %ival, <WIDTH x MASK> %mask)
+  %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, i64 %ival)
  %ret = bitcast i64 %iret to double
  ret double %ret
 }
@@ -2727,7 +2695,8 @@ define(`gen_gather', `
 ;; Define the utility function to do the gather operation for a single element
 ;; of the type
 define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
-                                    <$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
+                                    <$1 x i32> %offset_delta, <$1 x $2> %ret,
+                                    i32 %lane) nounwind readonly alwaysinline {
  ; compute address for this one from the base
  %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
  ; the order and details of the next 4 lines are important--they match LLVMs 
@@ -2737,15 +2706,20 @@ define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_
  %offset = mul i64 %offset64, %scale64
  %ptroffset = getelementptr i8 * %ptr, i64 %offset

+  %delta = extractelement <$1 x i32> %offset_delta, i32 %lane
+  %delta64 = sext i32 %delta to i64
+  %finalptr = getelementptr i8 * %ptroffset, i64 %delta64
+
  ; load value and insert into returned value
-  %ptrcast = bitcast i8 * %ptroffset to $2 *
+  %ptrcast = bitcast i8 * %finalptr to $2 *
  %val = load $2 *%ptrcast
  %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
  ret <$1 x $2> %updatedret
 }

 define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
-                                    <$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
+                                    <$1 x i64> %offset_delta, <$1 x $2> %ret,
+                                    i32 %lane) nounwind readonly alwaysinline {
  ; compute address for this one from the base
  %offset64 = extractelement <$1 x i64> %offsets, i32 %lane
  ; the order and details of the next 4 lines are important--they match LLVMs 
@@ -2754,8 +2728,11 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_
  %offset = mul i64 %offset64, %offset_scale64
  %ptroffset = getelementptr i8 * %ptr, i64 %offset

+  %delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
+  %finalptr = getelementptr i8 * %ptroffset, i64 %delta64
+
  ; load value and insert into returned value
-  %ptrcast = bitcast i8 * %ptroffset to $2 *
+  %ptrcast = bitcast i8 * %finalptr to $2 *
  %val = load $2 *%ptrcast
  %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
  ret <$1 x $2> %updatedret
@@ -2763,6 +2740,7 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_


 define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
+                                             <$1 x i32> %offset_delta,
                                             <$1 x i32> %vecmask) nounwind readonly alwaysinline {
  ; We can be clever and avoid the per-lane stuff for gathers if we are willing
  ; to require that the 0th element of the array being gathered from is always
@@ -2775,16 +2753,25 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32
                                     <$1 x i32> %vecmask)
  %newOffsets = load <$1 x i32> * %offsetsPtr

+  %deltaPtr = alloca <$1 x i32>
+  store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr   ; start from all-zero deltas
+  call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta, 
+                                     <$1 x i32> %vecmask)
+  %newDelta = load <$1 x i32> * %deltaPtr   ; stays zero wherever the blend store wrote nothing
+
   %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
-                                            i32 %offset_scale, <$1 x $2> undef, i32 0)
+                                            i32 %offset_scale, <$1 x i32> %newDelta,
+                                            <$1 x $2> undef, i32 0)
   forloop(lane, 1, eval($1-1), 
           `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, 
-                                <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
+                                <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %newDelta,
+                                <$1 x $2> %retPREV, i32 LANE)
                     ', `LANE', lane), `PREV', eval(lane-1))')
  ret <$1 x $2> %ret`'eval($1-1)
 }

 define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
+                                             <$1 x i64> %offset_delta,
                                             <$1 x i32> %vecmask) nounwind readonly alwaysinline {
  ; We can be clever and avoid the per-lane stuff for gathers if we are willing
  ; to require that the 0th element of the array being gathered from is always
@@ -2797,11 +2784,19 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32
                                     <$1 x i32> %vecmask)
  %newOffsets = load <$1 x i64> * %offsetsPtr

+  %deltaPtr = alloca <$1 x i64>
+  store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr
+  call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta, 
+                                     <$1 x i32> %vecmask)
+  %newDelta = load <$1 x i64> * %deltaPtr
+
  %ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
-                                            i32 %offset_scale, <$1 x $2> undef, i32 0)
+                                            i32 %offset_scale, <$1 x i64> %newDelta,
+                                            <$1 x $2> undef, i32 0)
  forloop(lane, 1, eval($1-1), 
          `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, 
-                                <$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
+                                <$1 x i64> %newOffsets, i32 %offset_scale, <$1 x i64> %newDelta,
+                                <$1 x $2> %retPREV, i32 LANE)
                    ', `LANE', lane), `PREV', eval(lane-1))')
  ret <$1 x $2> %ret`'eval($1-1)
 }
@@ -2852,7 +2847,8 @@ define(`gen_scatter', `
 ;; Define the function that descripes the work to do to scatter a single
 ;; value
 define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
-                                <$1 x $2> %values, i32 %lane) nounwind alwaysinline {
+                                <$1 x i32> %offset_delta, <$1 x $2> %values,
+                                i32 %lane) nounwind alwaysinline {
  %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
  ; the order and details of the next 4 lines are important--they match LLVMs 
  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
@@ -2861,42 +2857,52 @@ define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scal
  %offset = mul i64 %offset64, %scale64
  %ptroffset = getelementptr i8 * %ptr, i64 %offset

-  %ptrcast = bitcast i8 * %ptroffset to $2 *
+  %delta = extractelement <$1 x i32> %offset_delta, i32 %lane
+  %delta64 = sext i32 %delta to i64
+  %finalptr = getelementptr i8 * %ptroffset, i64 %delta64
+
+  %ptrcast = bitcast i8 * %finalptr to $2 *
  %storeval = extractelement <$1 x $2> %values, i32 %lane
  store $2 %storeval, $2 * %ptrcast
  ret void
 }

 define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
-                                <$1 x $2> %values, i32 %lane) nounwind alwaysinline {
+                                <$1 x i64> %offset_delta, <$1 x $2> %values,
+                                i32 %lane) nounwind alwaysinline {
  %offset64 = extractelement <$1 x i64> %offsets, i32 %lane
  ; the order and details of the next 4 lines are important--they match LLVMs 
  ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
  %scale64 = sext i32 %offset_scale to i64
  %offset = mul i64 %offset64, %scale64
  %ptroffset = getelementptr i8 * %ptr, i64 %offset
-  %ptrcast = bitcast i8 * %ptroffset to $2 *

+  %delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
+  %finalptr = getelementptr i8 * %ptroffset, i64 %delta64
+
+  %ptrcast = bitcast i8 * %finalptr to $2 *
  %storeval = extractelement <$1 x $2> %values, i32 %lane
  store $2 %storeval, $2 * %ptrcast
  ret void
 }

 define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale,
-                                         <$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
+                                         <$1 x i32> %offset_delta, <$1 x $2> %values,
+                                         <$1 x i32> %mask) nounwind alwaysinline {
  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
  per_lane($1, <$1 x i32> %mask, `
      call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale,
-                                    <$1 x $2> %values, i32 LANE)')
+                                    <$1 x i32> %offset_delta, <$1 x $2> %values, i32 LANE)')
  ret void
 }

 define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale,
-                                         <$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
+                                         <$1 x i64> %offset_delta, <$1 x $2> %values,
+                                         <$1 x i32> %mask) nounwind alwaysinline {
  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
  per_lane($1, <$1 x i32> %mask, `
      call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale,
-                                    <$1 x $2> %values, i32 LANE)')
+                                    <$1 x i64> %offset_delta, <$1 x $2> %values, i32 LANE)')
  ret void
 }

--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -16,6 +16,16 @@
 #warning "The C++ backend isn't supported when building with LLVM 2.9"
 #else

+#ifndef _MSC_VER
+#include <inttypes.h>
+#endif
+
+#ifndef PRIx64
+#define PRIx64 "llx"
+#endif
+
+#include "llvmutil.h"
+
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
@@ -224,6 +234,7 @@ namespace {
    unsigned NextAnonValueNumber;
    
    std::string includeName;
+    int vectorWidth;

    /// UnnamedStructIDs - This contains a unique ID for each struct that is
    /// either anonymous or has no name.
@@ -232,11 +243,13 @@ namespace {

  public:
    static char ID;
-    explicit CWriter(formatted_raw_ostream &o, const char *incname)
+      explicit CWriter(formatted_raw_ostream &o, const char *incname,
+                       int vecwidth)
      : FunctionPass(ID), Out(o), IL(0), Mang(0), LI(0),
        TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0),
        OpaqueCounter(0), NextAnonValueNumber(0), 
-        includeName(incname ? incname : "generic_defs.h") {
+        includeName(incname ? incname : "generic_defs.h"),
+        vectorWidth(vecwidth) {
      initializeLoopInfoPass(*PassRegistry::getPassRegistry());
      FPCounter = 0;
    }
@@ -376,7 +389,7 @@ namespace {
      if (I.getType() == Type::getVoidTy(I.getContext()) || !I.hasOneUse() ||
          isa<TerminatorInst>(I) || isa<CallInst>(I) || isa<PHINode>(I) ||
          isa<LoadInst>(I) || isa<VAArgInst>(I) || isa<InsertElementInst>(I) ||
-          isa<InsertValueInst>(I) || isa<ExtractValueInst>(I))
+          isa<InsertValueInst>(I) || isa<ExtractValueInst>(I) || isa<SelectInst>(I))
        // Don't inline a load across a store or other bad things!
        return false;

@@ -765,6 +778,16 @@ raw_ostream &CWriter::printType(raw_ostream &Out, Type *Ty,
    Out << "    return ret;\n";
    Out << "  }\n  ";

+    // if it's an array of i8s, also provide a version that takes a const
+    // char *
+    if (ATy->getElementType() == LLVMTypes::Int8Type) {
+        Out << "  static " << NameSoFar << " init(const char *p) {\n";
+        Out << "    " << NameSoFar << " ret;\n";
+        Out << "    strncpy((char *)ret.array, p, " << NumElements << ");\n";
+        Out << "    return ret;\n";
+        Out << "  }\n";
+    }
+
    printType(Out, ATy->getElementType(), false,
              "array[" + utostr(NumElements) + "]");
    return Out << ";\n} ";
@@ -834,7 +857,8 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
    }
    Out << '\"';
  } else {
-    Out << '{';
+    if (Static)
+      Out << '{';
    if (CPA->getNumOperands()) {
      Out << ' ';
      printConstant(cast<Constant>(CPA->getOperand(0)), Static);
@@ -843,7 +867,8 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
        printConstant(cast<Constant>(CPA->getOperand(i)), Static);
      }
    }
-    Out << " }";
+    if (Static)
+      Out << " }";
  }
 }

@@ -1280,7 +1305,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
        char Buffer[100];

        uint64_t ll = DoubleToBits(V);
-        sprintf(Buffer, "0x%llx", static_cast<long long>(ll));
+        sprintf(Buffer, "0x%" PRIx64, ll); // PRIx64 pairs with uint64_t; the space keeps it a macro under C++11

        std::string Num(&Buffer[0], &Buffer[6]);
        unsigned long Val = strtoul(Num.c_str(), 0, 16);
@@ -1313,7 +1338,8 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
    break;
  }

-  case Type::ArrayTyID:
+  case Type::ArrayTyID: {
+    ArrayType *AT = cast<ArrayType>(CPV->getType());
    if (Static)
      // arrays are wrapped in structs...
      Out << "{ ";
@@ -1326,7 +1352,6 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
      printConstantArray(CA, Static);
    } else {
      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
-      ArrayType *AT = cast<ArrayType>(CPV->getType());
      if (AT->getNumElements()) {
        Out << ' ';
        Constant *CZ = Constant::getNullValue(AT->getElementType());
@@ -1342,7 +1367,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
    else
        Out << ")";
    break;
-
+  }
  case Type::VectorTyID:
    printType(Out, CPV->getType());
    Out << "(";
@@ -1741,17 +1766,6 @@ void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) {
 //
 static void generateCompilerSpecificCode(formatted_raw_ostream& Out,
                                         const TargetData *TD) {
-  // Alloca, ...
-  Out << "#include <stdlib.h>\n"
-      << "#include <stdint.h>\n"
-      << "/* get a declaration for alloca */\n"
-      << "#ifdef _MSC_VER\n"
-      << "#include <malloc.h>\n"
-      << "#define alloca _alloca\n"
-      << "#else\n"
-      << "#include <alloca.h>\n"
-      << "#endif\n\n";
-
  // We output GCC specific attributes to preserve 'linkonce'ness on globals.
  // If we aren't being compiled with GCC, just drop these attributes.
  Out << "#ifndef __GNUC__  /* Can only support \"linkonce\" vars with GCC */\n"
@@ -1976,7 +1990,6 @@ bool CWriter::doInitialization(Module &M) {
  Out << "  DO NOT EDIT THIS FILE DIRECTLY\n";
  Out << " *******************************************************************/\n\n";

-  // get declaration for alloca
  Out << "/* Provide Declarations */\n";
  Out << "#include <stdarg.h>\n";      // Varargs support
  Out << "#include <setjmp.h>\n";      // Unwind support
@@ -1987,6 +2000,15 @@ bool CWriter::doInitialization(Module &M) {
  Out << "  #define NOMINMAX\n";
  Out << "  #include <windows.h>\n";
  Out << "#endif // _MSC_VER\n";
+  Out << "#include <stdlib.h>\n";
+  Out << "#include <stdint.h>\n";
+  Out << "/* get a declaration for alloca */\n";
+  Out << "#ifdef _MSC_VER\n";
+  Out << "  #include <malloc.h>\n";
+  Out << "  #define alloca _alloca\n";
+  Out << "#else\n";
+  Out << "  #include <alloca.h>\n";
+  Out << "#endif\n\n";

  Out << "#include \"" << includeName << "\"\n";

@@ -2198,7 +2220,7 @@ bool CWriter::doInitialization(Module &M) {
        // FIXME common linkage should avoid this problem.
        if (!I->getInitializer()->isNullValue()) {
          Out << " = " ;
-          writeOperand(I->getInitializer(), true);
+          writeOperand(I->getInitializer(), false);
        } else if (I->hasWeakLinkage()) {
          // We have to specify an initializer, but it doesn't have to be
          // complete.  If the value is an aggregate, print out { 0 }, and let
@@ -2213,7 +2235,7 @@ bool CWriter::doInitialization(Module &M) {
            Out << "{ { 0 } }";
          } else {
            // Just print it out normally.
-            writeOperand(I->getInitializer(), true);
+            writeOperand(I->getInitializer(), false);
          }
        }
        Out << ";\n";
@@ -2887,7 +2909,21 @@ void CWriter::visitBinaryOperator(Instruction &I) {
      Out << "(";
      writeOperand(I.getOperand(0));
      Out << ", ";
-      writeOperand(I.getOperand(1));
+      if ((I.getOpcode() == Instruction::Shl ||
+           I.getOpcode() == Instruction::LShr ||
+           I.getOpcode() == Instruction::AShr)) {
+          std::vector<PHINode *> phis;
+          if (LLVMVectorValuesAllEqual(I.getOperand(1),
+                                       vectorWidth, phis)) {
+              Out << "__extract_element(";
+              writeOperand(I.getOperand(1));
+              Out << ", 0) ";
+          }
+          else
+              writeOperand(I.getOperand(1));
+      }
+      else
+          writeOperand(I.getOperand(1));
      Out << ")";
      return;
  }
@@ -3628,7 +3664,7 @@ std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
 #endif

  std::string E;
-  if (const Target *Match = TargetRegistry::lookupTarget(Triple, E))
+  if (const llvm::Target *Match = TargetRegistry::lookupTarget(Triple, E))
    TargetAsm = Match->createMCAsmInfo(Triple);
  else
    return c.Codes[0];
@@ -4330,7 +4366,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
    pm.add(new BitcastCleanupPass);
    pm.add(createDeadCodeEliminationPass()); // clean up after smear pass
 //CO    pm.add(createPrintModulePass(&fos));
-    pm.add(new CWriter(fos, includeName));
+    pm.add(new CWriter(fos, includeName, vectorWidth));
    pm.add(createGCInfoDeleter());
 //CO    pm.add(createVerifierPass());

--- a/ctx.cpp
+++ b/ctx.cpp
@@ -74,18 +74,35 @@ struct CFInfo {
                              llvm::Value *savedContinueLanesPtr,
                              llvm::Value *savedMask, llvm::Value *savedLoopMask);

+    static CFInfo *GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
+                             llvm::BasicBlock *continueTarget, 
+                             llvm::Value *savedBreakLanesPtr,
+                             llvm::Value *savedContinueLanesPtr,
+                             llvm::Value *savedMask, llvm::Value *savedLoopMask,
+                             llvm::Value *switchExpr,
+                             llvm::BasicBlock *bbDefault,
+                             const std::vector<std::pair<int, llvm::BasicBlock *> > *bbCases,
+                             const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbNext,
+                             bool scUniform);
+    
    bool IsIf() { return type == If; }
    bool IsLoop() { return type == Loop; }
    bool IsForeach() { return type == Foreach; }
-    bool IsVaryingType() { return !isUniform; }
+    bool IsSwitch() { return type == Switch; }
+    bool IsVarying() { return !isUniform; }
    bool IsUniform() { return isUniform; }

-    enum CFType { If, Loop, Foreach };
+    enum CFType { If, Loop, Foreach, Switch };
    CFType type;
    bool isUniform;
    llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
    llvm::Value *savedBreakLanesPtr, *savedContinueLanesPtr;
    llvm::Value *savedMask, *savedLoopMask;
+    llvm::Value *savedSwitchExpr;
+    llvm::BasicBlock *savedDefaultBlock;
+    const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCaseBlocks;
+    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNextBlocks;
+    bool savedSwitchConditionWasUniform;

 private:
    CFInfo(CFType t, bool uniformIf, llvm::Value *sm) {
@@ -95,11 +112,18 @@ private:
        savedBreakTarget = savedContinueTarget = NULL;
        savedBreakLanesPtr = savedContinueLanesPtr = NULL;
        savedMask = savedLoopMask = sm;
+        savedSwitchExpr = NULL;
+        savedDefaultBlock = NULL;
+        savedCaseBlocks = NULL;
+        savedNextBlocks = NULL;
    }
    CFInfo(CFType t, bool iu, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
           llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
-           llvm::Value *lm) {
-        Assert(t == Loop);
+           llvm::Value *lm, llvm::Value *sse = NULL, llvm::BasicBlock *bbd = NULL, 
+           const std::vector<std::pair<int, llvm::BasicBlock *> > *bbc = NULL,
+           const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbn = NULL,
+           bool scu = false) {
+        Assert(t == Loop || t == Switch);
        type = t;
        isUniform = iu;
        savedBreakTarget = bt;
@@ -108,6 +132,11 @@ private:
        savedContinueLanesPtr = sc;
        savedMask = sm;
        savedLoopMask = lm;
+        savedSwitchExpr = sse;
+        savedDefaultBlock = bbd;
+        savedCaseBlocks = bbc;
+        savedNextBlocks = bbn;
+        savedSwitchConditionWasUniform = scu;
    }
    CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
           llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
@@ -121,6 +150,10 @@ private:
        savedContinueLanesPtr = sc;
        savedMask = sm;
        savedLoopMask = lm;
+        savedSwitchExpr = NULL;
+        savedDefaultBlock = NULL;
+        savedCaseBlocks = NULL;
+        savedNextBlocks = NULL;
    }
 };

@@ -154,12 +187,30 @@ CFInfo::GetForeach(llvm::BasicBlock *breakTarget,
                      savedMask, savedForeachMask);
 }

+/** Creates the CFInfo record (type Switch) that StartSwitch() pushes onto controlFlowInfo. */
+CFInfo *
+CFInfo::GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
+                  llvm::BasicBlock *continueTarget, 
+                  llvm::Value *savedBreakLanesPtr,
+                  llvm::Value *savedContinueLanesPtr, llvm::Value *savedMask,
+                  llvm::Value *savedLoopMask, llvm::Value *savedSwitchExpr,
+                  llvm::BasicBlock *savedDefaultBlock,
+                  const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCases,
+                  const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNext,
+                  bool savedSwitchConditionUniform) {
+    return new CFInfo(Switch, isUniform, breakTarget, continueTarget, 
+                      savedBreakLanesPtr, savedContinueLanesPtr,
+                      savedMask, savedLoopMask, savedSwitchExpr, savedDefaultBlock, 
+                      savedCases, savedNext, savedSwitchConditionUniform);
+}
+
 ///////////////////////////////////////////////////////////////////////////

 FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
-                                         llvm::Function *llvmFunction,
+                                         llvm::Function *lf,
                                         SourcePos firstStmtPos) {
    function = func;
+    llvmFunction = lf;

    /* Create a new basic block to store all of the allocas */
    allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", llvmFunction, 0);
@@ -181,6 +232,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
    breakLanesPtr = continueLanesPtr = NULL;
    breakTarget = continueTarget = NULL;

+    switchExpr = NULL;
+    caseBlocks = NULL;
+    defaultBlock = NULL;
+    nextBlocks = NULL;
+
    returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
    StoreInst(LLVMMaskAllOff, returnedLanesPtr);

@@ -421,51 +477,61 @@ FunctionEmitContext::StartVaryingIf(llvm::Value *oldMask) {

 void
 FunctionEmitContext::EndIf() {
+    CFInfo *ci = popCFState();
    // Make sure we match up with a Start{Uniform,Varying}If().
-    Assert(controlFlowInfo.size() > 0 && controlFlowInfo.back()->IsIf());
-    CFInfo *ci = controlFlowInfo.back();
-    controlFlowInfo.pop_back();
+    Assert(ci->IsIf());

    // 'uniform' ifs don't change the mask so we only need to restore the
    // mask going into the if for 'varying' if statements
-    if (!ci->IsUniform() && bblock != NULL) {
-        // We can't just restore the mask as it was going into the 'if'
-        // statement.  First we have to take into account any program
-        // instances that have executed 'return' statements; the restored
-        // mask must be off for those lanes.
-        restoreMaskGivenReturns(ci->savedMask);
+    if (ci->IsUniform() || bblock == NULL)
+        return;

-        // If the 'if' statement is inside a loop with a 'varying'
-        // consdition, we also need to account for any break or continue
-        // statements that executed inside the 'if' statmeent; we also must
-        // leave the lane masks for the program instances that ran those
-        // off after we restore the mask after the 'if'.  The code below
-        // ends up being optimized out in the case that there were no break
-        // or continue statements (and breakLanesPtr and continueLanesPtr
-        // have their initial 'all off' values), so we don't need to check
-        // for that here.
-        if (continueLanesPtr != NULL) {
-            // We want to compute:
-            // newMask = (oldMask & ~(breakLanes | continueLanes))
-            llvm::Value *oldMask = GetInternalMask();
-            llvm::Value *continueLanes = LoadInst(continueLanesPtr,
-                                                  "continue_lanes");
-            llvm::Value *bcLanes = continueLanes;
+    // We can't just restore the mask as it was going into the 'if'
+    // statement.  First we have to take into account any program
+    // instances that have executed 'return' statements; the restored
+    // mask must be off for those lanes.
+    restoreMaskGivenReturns(ci->savedMask);

-            if (breakLanesPtr != NULL) {
-                // breakLanesPtr will be NULL if we're inside a 'foreach' loop
-                llvm::Value *breakLanes = LoadInst(breakLanesPtr, "break_lanes");
-                bcLanes = BinaryOperator(llvm::Instruction::Or, breakLanes, 
-                                         continueLanes, "break|continue_lanes");
-            }
+    // If the 'if' statement is inside a loop with a 'varying'
+    // condition, we also need to account for any break or continue
+    // statements that executed inside the 'if' statement; we also must
+    // leave the lane masks for the program instances that ran those
+    // off after we restore the mask after the 'if'.  The code below
+    // ends up being optimized out in the case that there were no break
+    // or continue statements (and breakLanesPtr and continueLanesPtr
+    // have their initial 'all off' values), so we don't need to check
+    // for that here.
+    // 
+    // There are three general cases to deal with here:
+    // - Loops: both break and continue are allowed, and thus the corresponding
+    //   lane mask pointers are non-NULL
+    // - Foreach: only continueLanesPtr may be non-NULL
+    // - Switch: only breakLanesPtr may be non-NULL
+    if (continueLanesPtr != NULL || breakLanesPtr != NULL) {
+        // We want to compute:
+        // newMask = (oldMask & ~(breakLanes | continueLanes)),
+        // treating breakLanes or continueLanes as "all off" if the
+        // corresponding pointer is NULL.
+        llvm::Value *bcLanes = NULL;

-            llvm::Value *notBreakOrContinue = 
-                NotOperator(bcLanes, "!(break|continue)_lanes");
-            llvm::Value *newMask = 
-                BinaryOperator(llvm::Instruction::And, oldMask, 
-                               notBreakOrContinue, "new_mask");
-            SetInternalMask(newMask);
+        if (continueLanesPtr != NULL)
+            bcLanes = LoadInst(continueLanesPtr, "continue_lanes");
+        else
+            bcLanes = LLVMMaskAllOff;
+
+        if (breakLanesPtr != NULL) {
+            llvm::Value *breakLanes = LoadInst(breakLanesPtr, "break_lanes");
+            bcLanes = BinaryOperator(llvm::Instruction::Or, bcLanes, 
+                                     breakLanes, "|break_lanes");
        }
+
+        llvm::Value *notBreakOrContinue = 
+            NotOperator(bcLanes, "!(break|continue)_lanes");
+        llvm::Value *oldMask = GetInternalMask();
+        llvm::Value *newMask = 
+            BinaryOperator(llvm::Instruction::And, oldMask, 
+                           notBreakOrContinue, "new_mask");
+        SetInternalMask(newMask);
    }
 }

@@ -501,17 +567,8 @@ FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct,

 void
 FunctionEmitContext::EndLoop() {
-    Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsLoop());
-    CFInfo *ci = controlFlowInfo.back();
-    controlFlowInfo.pop_back();
-
-    // Restore the break/continue state information to what it was before
-    // we went into this loop.
-    breakTarget = ci->savedBreakTarget;
-    continueTarget = ci->savedContinueTarget;
-    breakLanesPtr = ci->savedBreakLanesPtr;
-    continueLanesPtr = ci->savedContinueLanesPtr;
-    loopMask = ci->savedLoopMask;
+    CFInfo *ci = popCFState();
+    Assert(ci->IsLoop());

    if (!ci->IsUniform())
        // If the loop had a 'uniform' test, then it didn't make any
@@ -524,7 +581,7 @@ FunctionEmitContext::EndLoop() {


 void
-FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
+FunctionEmitContext::StartForeach() {
    // Store the current values of various loop-related state so that we
    // can restore it when we exit this loop.
    llvm::Value *oldMask = GetInternalMask();
@@ -536,7 +593,7 @@ FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {

    continueLanesPtr = AllocaInst(LLVMTypes::MaskType, "foreach_continue_lanes");
    StoreInst(LLVMMaskAllOff, continueLanesPtr);
-    continueTarget = ct;
+    continueTarget = NULL; // should be set by SetContinueTarget()

    loopMask = NULL;
 }
@@ -544,17 +601,8 @@ FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {

 void
 FunctionEmitContext::EndForeach() {
-    Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsForeach());
-    CFInfo *ci = controlFlowInfo.back();
-    controlFlowInfo.pop_back();
-
-    // Restore the break/continue state information to what it was before
-    // we went into this loop.
-    breakTarget = ci->savedBreakTarget;
-    continueTarget = ci->savedContinueTarget;
-    breakLanesPtr = ci->savedBreakLanesPtr;
-    continueLanesPtr = ci->savedContinueLanesPtr;
-    loopMask = ci->savedLoopMask;
+    CFInfo *ci = popCFState();
+    Assert(ci->IsForeach());
 }


@@ -575,28 +623,64 @@ FunctionEmitContext::restoreMaskGivenReturns(llvm::Value *oldMask) {
 }


+/** Reports whether the innermost enclosing control flow construct other
+    than an "if" statement is a "switch" statement.
+*/
+bool
+FunctionEmitContext::inSwitchStatement() const {
+    // Nested scopes are appended to the back of controlFlowInfo, so scan
+    // from the back and stop at the first entry that isn't an "if".
+    for (int depth = controlFlowInfo.size() - 1; depth >= 0; --depth) {
+        if (controlFlowInfo[depth]->IsIf())
+            continue;
+        return controlFlowInfo[depth]->IsSwitch();
+    }
+    // Every enclosing scope (if there is any) is an "if" statement.
+    return false;
+}
+
+
 void
 FunctionEmitContext::Break(bool doCoherenceCheck) {
+    Assert(controlFlowInfo.size() > 0);
    if (breakTarget == NULL) {
        Error(currentPos, "\"break\" statement is illegal outside of "
-              "for/while/do loops.");
+              "for/while/do loops and \"switch\" statements.");
+        return;
+    }
+
+    if (bblock == NULL)
+        return;
+
+    if (inSwitchStatement() == true &&
+        switchConditionWasUniform == true && 
+        ifsInCFAllUniform(CFInfo::Switch)) {
+        // We know that all program instances are executing the break, so
+        // just jump to the block immediately after the switch.
+        Assert(breakTarget != NULL);
+        BranchInst(breakTarget);
+        bblock = NULL;
        return;
    }

    // If all of the enclosing 'if' tests in the loop have uniform control
    // flow or if we can tell that the mask is all on, then we can just
    // jump to the break location.
-    if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
+    if (inSwitchStatement() == false && 
+        (ifsInCFAllUniform(CFInfo::Loop) || 
+         GetInternalMask() == LLVMMaskAllOn)) {
        BranchInst(breakTarget);
-        if (ifsInLoopAllUniform() && doCoherenceCheck)
-            Warning(currentPos, "Coherent break statement not necessary in fully uniform "
-                    "control flow.");
+        if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
+            Warning(currentPos, "Coherent break statement not necessary in "
+                    "fully uniform control flow.");
        // Set bblock to NULL since the jump has terminated the basic block
        bblock = NULL;
    }
    else {
-        // Otherwise we need to update the mask of the lanes that have
-        // executed a 'break' statement:
+        // Varying switch, uniform switch where the 'break' is under
+        // varying control flow, or a loop with varying 'if's above the
+        // break.  In these cases, we need to update the mask of the lanes
+        // that have executed a 'break' statement: 
        // breakLanes = breakLanes | mask
        Assert(breakLanesPtr != NULL);
        llvm::Value *mask = GetInternalMask();
@@ -612,16 +696,20 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
        // an 'if' statement and restore the mask then.
        SetInternalMask(LLVMMaskAllOff);

-        if (doCoherenceCheck)
-            // If the user has indicated that this is a 'coherent' break
-            // statement, then check to see if the mask is all off.  If so,
-            // we have to conservatively jump to the continueTarget, not
-            // the breakTarget, since part of the reason the mask is all
-            // off may be due to 'continue' statements that executed in the
-            // current loop iteration.  
-            // FIXME: if the loop only has break statements and no
-            // continues, we can jump to breakTarget in that case.
-            jumpIfAllLoopLanesAreDone(continueTarget);
+        if (doCoherenceCheck) {
+            if (continueTarget != NULL)
+                // If the user has indicated that this is a 'coherent'
+                // break statement, then check to see if the mask is all
+                // off.  If so, we have to conservatively jump to the
+                // continueTarget, not the breakTarget, since part of the
+                // reason the mask is all off may be due to 'continue'
+                // statements that executed in the current loop iteration.
+                jumpIfAllLoopLanesAreDone(continueTarget);
+            else if (breakTarget != NULL)
+                // Similarly handle these for switch statements, where we
+                // only have a break target.
+                jumpIfAllLoopLanesAreDone(breakTarget);
+        }
    }
 }

@@ -634,12 +722,12 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
        return;
    }

-    if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
+    if (ifsInCFAllUniform(CFInfo::Loop) || GetInternalMask() == LLVMMaskAllOn) {
        // Similarly to 'break' statements, we can immediately jump to the
        // continue target if we're only in 'uniform' control flow within
        // loop or if we can tell that the mask is all on.
        AddInstrumentationPoint("continue: uniform CF, jumped");
-        if (ifsInLoopAllUniform() && doCoherenceCheck)
+        if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
            Warning(currentPos, "Coherent continue statement not necessary in "
                    "fully uniform control flow.");
        BranchInst(continueTarget);
@@ -652,8 +740,9 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
        llvm::Value *mask = GetInternalMask();
        llvm::Value *continueMask = 
            LoadInst(continueLanesPtr, "continue_mask");
-        llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or,
-                                              mask, continueMask, "mask|continueMask");
+        llvm::Value *newMask = 
+            BinaryOperator(llvm::Instruction::Or, mask, continueMask,
+                           "mask|continueMask");
        StoreInst(newMask, continueLanesPtr);

        // And set the current mask to be all off in case there are any
@@ -670,22 +759,23 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {


 /** This function checks to see if all of the 'if' statements (if any)
-    between the current scope and the first enclosing loop have 'uniform'
-    tests.
+    between the current scope and the first enclosing control flow scope of
+    the given type (e.g. CFInfo::Loop) have 'uniform' tests.
 */
 bool
-FunctionEmitContext::ifsInLoopAllUniform() const {
+FunctionEmitContext::ifsInCFAllUniform(int type) const {  // 'type' is compared against CFInfo::type
    Assert(controlFlowInfo.size() > 0);
    // Go backwards through controlFlowInfo, since we add new nested scopes
-    // to the back.  Stop once we come to the first enclosing loop.
+    // to the back.  Stop once we come to the first enclosing control flow
+    // structure of the desired type.
    int i = controlFlowInfo.size() - 1;
-    while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Loop) {
+    while (i >= 0 && controlFlowInfo[i]->type != type) {
        if (controlFlowInfo[i]->isUniform == false)
            // Found a scope due to an 'if' statement with a varying test
            return false;
        --i;
    }
-    Assert(i >= 0); // else we didn't find a loop!
+    Assert(i >= 0); // else we didn't find the expected control flow type!
    return true;
 }

@@ -758,11 +848,249 @@ FunctionEmitContext::RestoreContinuedLanes() {
 }


+void
+FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) {
+    llvm::Value *oldMask = GetInternalMask();
+    controlFlowInfo.push_back(CFInfo::GetSwitch(cfIsUniform, breakTarget, 
+                                                continueTarget, breakLanesPtr,
+                                                continueLanesPtr, oldMask, 
+                                                loopMask, switchExpr, defaultBlock, 
+                                                caseBlocks, nextBlocks,
+                                                switchConditionWasUniform));  // restored by popCFState()
+
+    breakLanesPtr = AllocaInst(LLVMTypes::MaskType, "break_lanes_memory");
+    StoreInst(LLVMMaskAllOff, breakLanesPtr);  // no lanes have executed 'break' yet
+    breakTarget = bbBreak;  // the block that follows the switch statement
+
+    continueLanesPtr = NULL;
+    continueTarget = NULL;
+    loopMask = NULL;
+
+    // These (and switchConditionWasUniform) are set by SwitchInst()
+    switchExpr = NULL;
+    defaultBlock = NULL;
+    caseBlocks = NULL;
+    nextBlocks = NULL;
+}
+
+
+void
+FunctionEmitContext::EndSwitch() {
+    Assert(bblock != NULL);
+
+    CFInfo *ci = popCFState();  // puts back the state saved by StartSwitch()
+    if (ci->IsVarying() && bblock != NULL)  // NOTE(review): bblock was already asserted non-NULL above
+        restoreMaskGivenReturns(ci->savedMask);
+}
+
+
+/** Emit code that tests 'mask' at a case or default label in a "switch"
+    statement and skips ahead to the following label if no lanes are on.
+ */
+void
+FunctionEmitContext::addSwitchMaskCheck(llvm::Value *mask) {
+    llvm::Value *allOff = None(mask);
+    llvm::BasicBlock *bbSome = CreateBasicBlock("case_default_on");
+
+    // Find the basic block for the case or default label immediately after
+    // the current one in the switch statement--that's where we want to
+    // jump if the mask is all off at this label.
+    Assert(nextBlocks->find(bblock) != nextBlocks->end());
+    llvm::BasicBlock *bbNext = nextBlocks->find(bblock)->second;
+
+    // Jump to the next one if the mask is all off; otherwise jump to the
+    // newly created block that will hold the actual code for this label.
+    BranchInst(bbNext, bbSome, allOff);
+    SetCurrentBasicBlock(bbSome);
+}
+
+
+/** Returns the execution mask at entry to the first enclosing "switch"
+    statement. */
+llvm::Value *
+FunctionEmitContext::getMaskAtSwitchEntry() {
+    Assert(controlFlowInfo.size() > 0);
+    int i = controlFlowInfo.size() - 1;  // start at the most recently pushed scope
+    while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Switch)
+        --i;
+    Assert(i != -1);  // else there is no enclosing switch scope
+    return controlFlowInfo[i]->savedMask;
+}
+
+
+void
+FunctionEmitContext::EmitDefaultLabel(bool checkMask, SourcePos pos) {
+    if (inSwitchStatement() == false) {
+        Error(pos, "\"default\" label illegal outside of \"switch\" "
+              "statement.");
+        return;
+    }
+
+    // If there's a default label in the switch, a basic block for it
+    // should have been provided in the previous call to SwitchInst().
+    Assert(defaultBlock != NULL);
+
+    if (bblock != NULL)
+        // The previous case in the switch fell through, or we're in a
+        // varying switch; terminate the current block with a jump to the
+        // block for the code for the default label.
+        BranchInst(defaultBlock);
+    SetCurrentBasicBlock(defaultBlock);
+
+    if (switchConditionWasUniform)
+        // Nothing more to do for this case; return back to the caller,
+        // which will then emit the code for the default case.
+        return;
+
+    // For a varying switch, we need to update the execution mask.
+    //
+    // First, compute the mask that corresponds to which program instances
+    // should execute the "default" code; this corresponds to the set of
+    // program instances that don't match any of the case statements.
+    // Therefore, we generate code that compares the value of the switch
+    // expression to the value associated with each of the "case"
+    // statements such that the surviving lanes didn't match any of them.
+    llvm::Value *matchesDefault = getMaskAtSwitchEntry();  // start from the switch-entry mask
+    for (int i = 0; i < (int)caseBlocks->size(); ++i) {
+        int value = (*caseBlocks)[i].first;
+        llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
+            LLVMInt32Vector(value) : LLVMInt64Vector(value);
+        // TODO: for AVX2 at least, the following generates better code
+        // than doing ICMP_NE and skipping the NotOperator() below; file a
+        // LLVM bug?
+        llvm::Value *matchesCaseValue = 
+            CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
+                    valueVec, "cmp_case_value");
+        matchesCaseValue = I1VecToBoolVec(matchesCaseValue);
+
+        llvm::Value *notMatchesCaseValue = NotOperator(matchesCaseValue);
+        matchesDefault = BinaryOperator(llvm::Instruction::And, matchesDefault, 
+                                        notMatchesCaseValue, "default&~case_match");
+    }
+
+    // The mask may have some lanes on, which corresponds to the previous
+    // label falling through; compute the updated mask by ORing the lanes
+    // that match "default" into the current mask.
+    llvm::Value *oldMask = GetInternalMask();
+    llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask, 
+                                          matchesDefault, "old_mask|matches_default");
+    SetInternalMask(newMask);
+
+    if (checkMask)
+        addSwitchMaskCheck(newMask);
+}
+
+
+void
+FunctionEmitContext::EmitCaseLabel(int value, bool checkMask, SourcePos pos) {
+    if (inSwitchStatement() == false) {
+        Error(pos, "\"case\" label illegal outside of \"switch\" statement.");
+        return;
+    }
+
+    // Find the basic block for this case statement.
+    llvm::BasicBlock *bbCase = NULL;
+    Assert(caseBlocks != NULL);
+    for (int i = 0; i < (int)caseBlocks->size(); ++i)
+        if ((*caseBlocks)[i].first == value) {
+            bbCase = (*caseBlocks)[i].second;
+            break;
+        }
+    Assert(bbCase != NULL);
+
+    if (bblock != NULL)
+        // fall through from the previous case
+        BranchInst(bbCase);
+    SetCurrentBasicBlock(bbCase);
+
+    if (switchConditionWasUniform)
+        return;  // uniform switch condition: the mask is left untouched
+
+    // update the mask: first, get a mask that indicates which program
+    // instances have a value for the switch expression that matches this
+    // case statement.
+    llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
+        LLVMInt32Vector(value) : LLVMInt64Vector(value);
+    llvm::Value *matchesCaseValue = 
+        CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
+                valueVec, "cmp_case_value");
+    matchesCaseValue = I1VecToBoolVec(matchesCaseValue);
+
+    // If a lane was off going into the switch, we don't care if it has a
+    // value in the switch expression that happens to match this case.
+    llvm::Value *entryMask = getMaskAtSwitchEntry();
+    matchesCaseValue = BinaryOperator(llvm::Instruction::And, entryMask,
+                                      matchesCaseValue, "entry_mask&case_match");
+
+    // Take the surviving lanes and turn on the mask for them.
+    llvm::Value *oldMask = GetInternalMask();
+    llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask, 
+                                          matchesCaseValue, "mask|case_match");
+    SetInternalMask(newMask);
+
+    if (checkMask)
+        addSwitchMaskCheck(newMask);
+}
+
+
+void
+FunctionEmitContext::SwitchInst(llvm::Value *expr, llvm::BasicBlock *bbDefault,
+                const std::vector<std::pair<int, llvm::BasicBlock *> > &bbCases,
+                const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &bbNext) {
+    // The calling code should have called StartSwitch() before calling
+    // SwitchInst().
+    Assert(controlFlowInfo.size() &&
+           controlFlowInfo.back()->IsSwitch());
+    // NOTE(review): confirm who frees the heap copies made below.
+    switchExpr = expr;
+    defaultBlock = bbDefault;
+    caseBlocks = new std::vector<std::pair<int, llvm::BasicBlock *> >(bbCases);
+    nextBlocks = new std::map<llvm::BasicBlock *, llvm::BasicBlock *>(bbNext);
+    switchConditionWasUniform = 
+        (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(expr->getType()) == false);
+
+    if (switchConditionWasUniform == true) {
+        // For a uniform switch condition, just wire things up to the LLVM
+        // switch instruction.
+        llvm::SwitchInst *s = llvm::SwitchInst::Create(expr, bbDefault, 
+                                                       bbCases.size(), bblock);
+        for (int i = 0; i < (int)bbCases.size(); ++i) {
+            if (expr->getType() == LLVMTypes::Int32Type)
+                s->addCase(LLVMInt32(bbCases[i].first), bbCases[i].second);
+            else {
+                Assert(expr->getType() == LLVMTypes::Int64Type);
+                s->addCase(LLVMInt64(bbCases[i].first), bbCases[i].second);
+            }
+        }
+
+        AddDebugPos(s);
+        // switch is a terminator
+        bblock = NULL;
+    }
+    else {
+        // For a varying switch, we first turn off all lanes of the mask
+        SetInternalMask(LLVMMaskAllOff);
+
+        if (nextBlocks->size() > 0) {
+            // If there are any labels inside the switch, jump to the first
+            // one; any code before the first label won't be executed by
+            // anyone.
+            std::map<llvm::BasicBlock *, llvm::BasicBlock *>::const_iterator iter;
+            iter = nextBlocks->find(NULL);  // the NULL key maps to the first label's block
+            Assert(iter != nextBlocks->end());
+            llvm::BasicBlock *bbFirst = iter->second;
+            BranchInst(bbFirst);
+            bblock = NULL;
+        }
+    }
+}
+
+
 int
 FunctionEmitContext::VaryingCFDepth() const { 
    int sum = 0;
    for (unsigned int i = 0; i < controlFlowInfo.size(); ++i)
-        if (controlFlowInfo[i]->IsVaryingType())
+        if (controlFlowInfo[i]->IsVarying())  // count every scope that reports itself as varying
            ++sum;
    return sum;
 }
@@ -777,6 +1105,41 @@ FunctionEmitContext::InForeachLoop() const {
 }


+bool
+FunctionEmitContext::initLabelBBlocks(ASTNode *node, void *data) {
+    LabeledStmt *ls = dynamic_cast<LabeledStmt *>(node);
+    if (ls == NULL)
+        return true;  // not a labeled statement; nothing to record
+
+    FunctionEmitContext *ctx = (FunctionEmitContext *)data;  // 'this' as passed by InitializeLabelMap()
+
+    if (ctx->labelMap.find(ls->name) != ctx->labelMap.end())
+        Error(ls->pos, "Multiple labels named \"%s\" in function.",
+              ls->name.c_str());
+    else {
+        llvm::BasicBlock *bb = ctx->CreateBasicBlock(ls->name.c_str());
+        ctx->labelMap[ls->name] = bb;  // looked up later by GetLabeledBasicBlock()
+    }
+    return true;  // presumably tells WalkAST() to keep traversing -- TODO confirm
+}
+
+
+void
+FunctionEmitContext::InitializeLabelMap(Stmt *code) {
+    labelMap.erase(labelMap.begin(), labelMap.end());  // start from an empty map
+    WalkAST(code, initLabelBBlocks, NULL, this);  // 'this' reaches the callback as its data pointer
+}
+
+
+llvm::BasicBlock *
+FunctionEmitContext::GetLabeledBasicBlock(const std::string &label) {
+    if (labelMap.find(label) != labelMap.end())
+        return labelMap[label];  // key is known to exist, so operator[] inserts nothing
+    else
+        return NULL;  // no label with this name is in the map
+}
+
+
 void
 FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
    const Type *returnType = function->GetReturnType();
@@ -869,6 +1232,14 @@ FunctionEmitContext::All(llvm::Value *mask) {
 }


+llvm::Value *
+FunctionEmitContext::None(llvm::Value *mask) {
+    llvm::Value *mmval = LaneMask(mask);  // i32 with the i'th bit set iff lane i is on
+    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
+                   LLVMInt32(0), "none_mm_cmp");  // zero means every lane is off
+}
+
+
 llvm::Value *
 FunctionEmitContext::LaneMask(llvm::Value *v) {
    // Call the target-dependent movmsk function to turn the vector mask
@@ -920,8 +1291,7 @@ FunctionEmitContext::GetStringPtr(const std::string &str) {

 llvm::BasicBlock *
 FunctionEmitContext::CreateBasicBlock(const char *name) {
-    llvm::Function *function = bblock->getParent();
-    return llvm::BasicBlock::Create(*g->ctx, name, function);
+    return llvm::BasicBlock::Create(*g->ctx, name, llvmFunction);  // no longer dereferences bblock
 }


@@ -2597,3 +2967,37 @@ FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,

    return BinaryOperator(llvm::Instruction::Add, ptr, offset);
 }
+
+
+CFInfo *
+FunctionEmitContext::popCFState() {
+    Assert(controlFlowInfo.size() > 0);
+    CFInfo *ci = controlFlowInfo.back();  // most recently pushed scope
+    controlFlowInfo.pop_back();
+
+    if (ci->IsSwitch()) {  // restore the loop-related state plus the switch-specific state
+        breakTarget = ci->savedBreakTarget;
+        continueTarget = ci->savedContinueTarget;
+        breakLanesPtr = ci->savedBreakLanesPtr;
+        continueLanesPtr = ci->savedContinueLanesPtr;
+        loopMask = ci->savedLoopMask;
+        switchExpr = ci->savedSwitchExpr;
+        defaultBlock = ci->savedDefaultBlock;
+        caseBlocks = ci->savedCaseBlocks;
+        nextBlocks = ci->savedNextBlocks;
+        switchConditionWasUniform = ci->savedSwitchConditionWasUniform;
+    }
+    else if (ci->IsLoop() || ci->IsForeach()) {  // restore only the loop-related state
+        breakTarget = ci->savedBreakTarget;
+        continueTarget = ci->savedContinueTarget;
+        breakLanesPtr = ci->savedBreakLanesPtr;
+        continueLanesPtr = ci->savedContinueLanesPtr;
+        loopMask = ci->savedLoopMask;
+    }
+    else {
+        Assert(ci->IsIf());
+        // 'if' scopes: no member state is restored here
+    }
+
+    return ci;  // the popped entry is handed back to the caller
+}
--- a/ctx.h
+++ b/ctx.h
@@ -39,6 +39,7 @@
 #define ISPC_CTX_H 1

 #include "ispc.h"
+#include <map>
 #include <llvm/InstrTypes.h>
 #include <llvm/Instructions.h>
 #include <llvm/Analysis/DIBuilder.h>
@@ -160,10 +161,8 @@ public:
    void EndLoop();

    /** Indicates that code generation for a 'foreach' or 'foreach_tiled'
-        loop is about to start.  The provided basic block pointer indicates
-        where control flow should go if a 'continue' statement is executed
-        in the loop. */
-    void StartForeach(llvm::BasicBlock *continueTarget);
+        loop is about to start. */
+    void StartForeach();
    void EndForeach();

    /** Emit code for a 'break' statement in a loop.  If doCoherenceCheck
@@ -186,12 +185,62 @@ public:
        previous iteration. */
    void RestoreContinuedLanes();

+    /** Indicates that code generation for a "switch" statement is about to
+        start.  isUniform indicates whether the "switch" value is uniform,
+        and bbAfterSwitch gives the basic block immediately following the
+        "switch" statement.  (For example, if the switch condition is
+        uniform, we jump here upon executing a "break" statement.) */
+    void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
+    /** Indicates the end of code generation for a "switch" statement. */
+    void EndSwitch();
+
+    /** Emits code for a "switch" statement in the program.
+        @param expr         Gives the value of the expression after the "switch"
+        @param defaultBlock Basic block to execute for the "default" case.  This
+                            should be NULL if there is no "default" label inside
+                            the switch.
+        @param caseBlocks   vector that stores the mapping from label values
+                            after "case" statements to basic blocks corresponding
+                            to the "case" labels.
+        @param nextBlocks   For each basic block for a "case" or "default" 
+                            label, this gives the basic block for the 
+                            immediately-following "case" or "default" label (or
+                            the basic block after the "switch" statement for the
+                            last label.)
+    */
+    void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
+                    const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
+                    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);
+
+    /** Generates code for a "default" label after a "switch" statement.
+        The checkMask parameter indicates whether additional code should be
+        generated to check to see if the execution mask is all off after
+        the default label (in which case a jump to the following label will
+        be issued). */
+    void EmitDefaultLabel(bool checkMask, SourcePos pos);
+
+    /** Generates code for a "case" label after a "switch" statement.  See
+        the documentation for EmitDefaultLabel() for discussion of the
+        checkMask parameter. */
+    void EmitCaseLabel(int value, bool checkMask, SourcePos pos);
+
    /** Returns the current number of nested levels of 'varying' control
        flow */
    int VaryingCFDepth() const;

    bool InForeachLoop() const;

+    void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
+
+    /** Step through the code and find label statements; create a basic
+        block for each one, so that subsequent calls to
+        GetLabeledBasicBlock() return the corresponding basic block. */
+    void InitializeLabelMap(Stmt *code);
+
+    /** If there is a label in the function with the given name, return the
+        new basic block that it starts. */
+    llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
+
    /** Called to generate code for 'return' statement; value is the
        expression in the return statement (if non-NULL), and
        doCoherenceCheck indicates whether instructions should be generated
@@ -211,6 +260,10 @@ public:
        i1 value that indicates if all of the mask lanes are on. */
    llvm::Value *All(llvm::Value *mask);

+    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
+        i1 value that indicates if all of the mask lanes are off. */
+    llvm::Value *None(llvm::Value *mask);
+
    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
        i32 value wherein the i'th bit is on if and only if the i'th lane
        of the mask is on. */
@@ -446,6 +499,9 @@ private:
    /** Pointer to the Function for which we're currently generating code. */
    Function *function;

+    /** LLVM function representation for the current function. */
+    llvm::Function *llvmFunction;
+
    /** The basic block into which we add any alloca instructions that need
        to go at the very start of the function. */
    llvm::BasicBlock *allocaBlock;
@@ -479,10 +535,10 @@ private:
        the loop. */
    llvm::Value *loopMask;

-    /** If currently in a loop body, this is a pointer to memory to store a
-        mask value that represents which of the lanes have executed a
-        'break' statement.  If we're not in a loop body, this should be
-        NULL. */
+    /** If currently in a loop body or switch statement, this is a pointer
+        to memory to store a mask value that represents which of the lanes
+        have executed a 'break' statement.  If we're not in a loop body or
+        switch, this should be NULL. */
    llvm::Value *breakLanesPtr;

    /** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
@@ -490,16 +546,49 @@ private:
        'continue' statement. */
    llvm::Value *continueLanesPtr;

-    /** If we're inside a loop, this gives the basic block immediately
-        after the current loop, which we will jump to if all of the lanes
-        have executed a break statement or are otherwise done with the
-        loop. */
+    /** If we're inside a loop or switch statement, this gives the basic
+        block immediately after the current loop or switch, which we will
+        jump to if all of the lanes have executed a break statement or are
+        otherwise done with it. */
    llvm::BasicBlock *breakTarget;

    /** If we're inside a loop, this gives the block to jump to if all of
        the running lanes have executed a 'continue' statement. */
    llvm::BasicBlock *continueTarget;

+    /** @name Switch statement state
+
+        These variables store various state that's active when we're
+        generating code for a switch statement.  The pointers among them
+        should all be NULL outside of a switch.
+        @{
+    */
+
+    /** The value of the expression used to determine which case in the
+        statements after the switch to execute. */
+    llvm::Value *switchExpr;
+
+    /** Map from case label numbers to the basic block that will hold code
+        for that case. */
+    const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;
+
+    /** The basic block of code to run for the "default" label in the
+        switch statement. */
+    llvm::BasicBlock *defaultBlock;
+
+    /** For each basic block for the code for cases (and the default label,
+        if present), this map gives the basic block for the immediately
+        following case/default label. */
+    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;
+
+    /** Records whether the switch condition was uniform; this is a
+        distinct notion from whether the switch represents uniform or
+        varying control flow; we may have varying control flow from a
+        uniform switch condition if there is a 'break' inside the switch
+        that's under varying control flow. */
+    bool switchConditionWasUniform;
+    /** @} */
+
    /** A pointer to memory that records which of the program instances
        have executed a 'return' statement (and are thus really truly done
        running any more instructions in this functions. */
@@ -537,9 +626,13 @@ private:
        tasks launched from the current function. */
    llvm::Value *launchGroupHandlePtr;

+    std::map<std::string, llvm::BasicBlock *> labelMap;  // label name -> basic block created for it
+
+    static bool initLabelBBlocks(ASTNode *node, void *data);  // WalkAST() callback that fills labelMap
+
    llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
    static void addGSMetadata(llvm::Value *inst, SourcePos pos);
-    bool ifsInLoopAllUniform() const;
+    bool ifsInCFAllUniform(int cfType) const;
    void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
    llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);

@@ -547,6 +640,11 @@ private:
                                 const Type *ptrType);

    void restoreMaskGivenReturns(llvm::Value *oldMask);
+    void addSwitchMaskCheck(llvm::Value *mask);
+    bool inSwitchStatement() const;
+    llvm::Value *getMaskAtSwitchEntry();
+
+    CFInfo *popCFState();

    void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType, 
                 llvm::Value *mask);
--- a/decl.cpp
+++ b/decl.cpp
@@ -46,6 +46,18 @@
 #include <stdio.h>
 #include <set>

+static void
+lPrintTypeQualifiers(int typeQualifiers) {  // prints the keyword for each TYPEQUAL_* bit that is set
+    if (typeQualifiers & TYPEQUAL_INLINE)    printf("inline ");
+    if (typeQualifiers & TYPEQUAL_CONST)     printf("const ");
+    if (typeQualifiers & TYPEQUAL_UNIFORM)   printf("uniform ");
+    if (typeQualifiers & TYPEQUAL_VARYING)   printf("varying ");
+    if (typeQualifiers & TYPEQUAL_TASK)      printf("task ");
+    if (typeQualifiers & TYPEQUAL_SIGNED)    printf("signed ");
+    if (typeQualifiers & TYPEQUAL_UNSIGNED)  printf("unsigned ");
+}
+
+
 /** Given a Type and a set of type qualifiers, apply the type qualifiers to
    the type, returning the type that is the result. 
 */
@@ -54,6 +66,16 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
    if (type == NULL)
        return NULL;

+    if ((typeQualifiers & TYPEQUAL_CONST) != 0)
+        type = type->GetAsConstType();
+
+    if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
+        type = type->GetAsUniformType();
+    else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
+        type = type->GetAsVaryingType();
+    else
+        type = type->GetAsUnboundVariabilityType();
+
    if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
        if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
            Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
@@ -64,29 +86,13 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
            type = unsignedType;
        else
            Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
-              type->GetString().c_str());
-
+                  type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
    }

    if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
        Error(pos, "\"signed\" qualifier is illegal with non-integer type "
-              "\"%s\".", type->GetString().c_str());
-
-    if ((typeQualifiers & TYPEQUAL_CONST) != 0)
-        type = type->GetAsConstType();
-
-    if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
-        type = type->GetAsUniformType();
-    else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
-        type = type->GetAsVaryingType();
-    else {
-        // otherwise, structs are uniform by default and everything
-        // else is varying by default
-        if (dynamic_cast<const StructType *>(type->GetBaseType()) != NULL)
-            type = type->GetAsUniformType();
-        else
-            type = type->GetAsVaryingType();
-    }
+              "\"%s\".", 
+              type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());

    return type;
 }
@@ -138,21 +144,14 @@ lGetStorageClassName(StorageClass storageClass) {

 void
 DeclSpecs::Print() const {
-    printf("%s ", lGetStorageClassName(storageClass));
+    printf("Declspecs: [%s ", lGetStorageClassName(storageClass));  // bracket is closed at the end

    if (soaWidth > 0) printf("soa<%d> ", soaWidth);
-
-    if (typeQualifiers & TYPEQUAL_INLINE)    printf("inline ");
-    if (typeQualifiers & TYPEQUAL_CONST)     printf("const ");
-    if (typeQualifiers & TYPEQUAL_UNIFORM)   printf("uniform ");
-    if (typeQualifiers & TYPEQUAL_VARYING)   printf("varying ");
-    if (typeQualifiers & TYPEQUAL_TASK)      printf("task ");
-    if (typeQualifiers & TYPEQUAL_SIGNED)    printf("signed ");
-    if (typeQualifiers & TYPEQUAL_UNSIGNED)  printf("unsigned ");
-
-    printf("%s", baseType->GetString().c_str());
+    lPrintTypeQualifiers(typeQualifiers);
+    printf("base type: %s", baseType->GetString().c_str());

    if (vectorSize > 0) printf("<%d>", vectorSize);
+    printf("]");  // closes the bracket opened by the first printf
 }


@@ -192,19 +191,46 @@ Declarator::GetSymbol() const {


 void
-Declarator::Print() const {
+Declarator::Print(int indent) const {
+    printf("%*cdeclarator: [", indent, ' ');  // %*c pads the ' ' to 'indent' columns (at least one)
+    pos.Print();
+
+    lPrintTypeQualifiers(typeQualifiers);
    Symbol *sym = GetSymbol();
    if (sym != NULL)
        printf("%s", sym->name.c_str());
    else
        printf("(null symbol)");

+    printf(", array size = %d", arraySize);
+
+    printf(", kind = ");
+    switch (kind) {
+    case DK_BASE:      printf("base");      break;
+    case DK_POINTER:   printf("pointer");   break;
+    case DK_REFERENCE: printf("reference"); break;
+    case DK_ARRAY:     printf("array");     break;
+    case DK_FUNCTION:  printf("function");  break;
+    default:           FATAL("Unhandled declarator kind");
+    }
+
    if (initExpr != NULL) {
        printf(" = (");
        initExpr->Print();
        printf(")");
    }
-    pos.Print();
+
+    if (functionParams.size() > 0) {
+        for (unsigned int i = 0; i < functionParams.size(); ++i) {
+            printf("\n%*cfunc param %d:\n", indent, ' ', i);  // NOTE(review): 'i' is unsigned; %u would match
+            functionParams[i]->Print(indent+4);
+        }
+    }
+
+    if (child != NULL)
+        child->Print(indent + 4);  // the child declarator is printed four columns deeper
+
+    printf("]\n");
 }


@@ -235,11 +261,13 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
    Assert(d != NULL);

    for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
-        Declaration *pdecl = d->functionParams[i];
-        Assert(pdecl->declarators.size() == 1);
-        funArgs->push_back(pdecl->declarators[0]->GetSymbol());
+        Symbol *sym = d->GetSymbolForFunctionParameter(i);
+        sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
+        funArgs->push_back(sym);
    }

+    funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
+
    return funSym;
 }

@@ -258,6 +286,12 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
    if (kind != DK_FUNCTION && isTask)
        Error(pos, "\"task\" qualifier illegal in variable declaration.");

+    Type::Variability variability = Type::Unbound;
+    if (hasUniformQual)
+        variability = Type::Uniform;
+    else if (hasVaryingQual)
+        variability = Type::Varying;
+
    const Type *type = base;
    switch (kind) {
    case DK_BASE:
@@ -268,7 +302,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
        return type;

    case DK_POINTER:
-        type = new PointerType(type, hasUniformQual, isConst);
+        type = new PointerType(type, variability, isConst);
        if (child != NULL)
            return child->GetType(type, ds);
        else
@@ -316,25 +350,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
        for (unsigned int i = 0; i < functionParams.size(); ++i) {
            Declaration *d = functionParams[i];

-            char buf[32];
-            Symbol *sym;
-            if (d->declarators.size() == 0) {
-                // function declaration like foo(float), w/o a name for
-                // the parameter
-                sprintf(buf, "__anon_parameter_%d", i);
-                sym = new Symbol(buf, pos);
-                sym->type = d->declSpecs->GetBaseType(pos);
-            }
-            else {
-                sym = d->declarators[0]->GetSymbol();
-                if (sym == NULL) {
-                    // Handle more complex anonymous declarations like
-                    // float (float **).
-                    sprintf(buf, "__anon_parameter_%d", i);
-                    sym = new Symbol(buf, d->declarators[0]->pos);
-                    sym->type = d->declarators[0]->GetType(d->declSpecs);
-                }
-            }
+            Symbol *sym = GetSymbolForFunctionParameter(i);

            if (d->declSpecs->storageClass != SC_NONE)
                Error(sym->pos, "Storage class \"%s\" is illegal in "
@@ -397,7 +413,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
            Error(pos, "No return type provided in function declaration.");
            return NULL;
        }
-
+        
        bool isExported = ds && (ds->storageClass == SC_EXPORT);
        bool isExternC =  ds && (ds->storageClass == SC_EXTERN_C);
        bool isTask =     ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
@@ -418,9 +434,10 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
            return NULL;
        }

-        Type *functionType = 
-            new FunctionType(returnType, args, pos, argNames, argDefaults,
+        const Type *functionType = 
+            new FunctionType(returnType, args, argNames, argDefaults,
                             argPos, isTask, isExported, isExternC);
+        functionType = functionType->ResolveUnboundVariability(Type::Varying);
        return child->GetType(functionType, ds);
    }
    default:
@@ -461,6 +478,35 @@ Declarator::GetType(DeclSpecs *ds) const {
 }


+Symbol *
+Declarator::GetSymbolForFunctionParameter(int paramNum) const {
+    Assert(paramNum < (int)functionParams.size());
+    Declaration *d = functionParams[paramNum];
+    // Return the parameter's Symbol, synthesizing one when it is unnamed.
+    char buf[32];  // holds a synthesized "__anon_parameter_<paramNum>" name
+    Symbol *sym;
+    if (d->declarators.size() == 0) {
+        // No declarator at all, e.g. foo(float): make up a name and take
+        // the type directly from the declaration specifiers.
+        sprintf(buf, "__anon_parameter_%d", paramNum);
+        sym = new Symbol(buf, pos);
+        sym->type = d->declSpecs->GetBaseType(pos);
+    }
+    else {
+        Assert(d->declarators.size() == 1);  // one declarator per parameter
+        sym = d->declarators[0]->GetSymbol();
+        if (sym == NULL) {
+            // Declarator present but unnamed, e.g. float (float **): make
+            // up a name and derive the type from the declarator itself.
+            sprintf(buf, "__anon_parameter_%d", paramNum);
+            sym = new Symbol(buf, d->declarators[0]->pos);
+            sym->type = d->declarators[0]->GetType(d->declSpecs);
+        }
+    }
+    return sym;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // Declaration

@@ -489,19 +535,15 @@ Declaration::GetVariableDeclarations() const {
    std::vector<VariableDeclaration> vars;

    for (unsigned int i = 0; i < declarators.size(); ++i) {
-        if (declarators[i] == NULL)
-            continue;
        Declarator *decl = declarators[i];
        if (decl == NULL)
            // Ignore earlier errors
            continue;

        Symbol *sym = decl->GetSymbol();
-        if (dynamic_cast<const FunctionType *>(sym->type) != NULL) {
-            // function declaration
-            m->symbolTable->AddFunction(sym);
-        }
-        else {
+        sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
+
+        if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
            m->symbolTable->AddVariable(sym);
            vars.push_back(VariableDeclaration(sym, decl->initExpr));
        }
@@ -511,16 +553,36 @@ Declaration::GetVariableDeclarations() const {


 void
-Declaration::Print() const {
-    printf("Declaration: specs [");
-    declSpecs->Print();
-    printf("], declarators [");
-    for (unsigned int i = 0 ; i < declarators.size(); ++i) {
-        declarators[i]->Print();
-        printf("%s", (i == declarators.size() - 1) ? "]" : ", ");
+Declaration::DeclareFunctions() {
+    Assert(declSpecs->storageClass != SC_TYPEDEF);
+
+    for (unsigned int i = 0; i < declarators.size(); ++i) {
+        Declarator *decl = declarators[i];
+        if (decl == NULL)
+            // Ignore earlier errors
+            continue;
+
+        Symbol *sym = decl->GetSymbol();
+        sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
+
+        if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
+            continue;
+
+        bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
+        m->AddFunctionDeclaration(sym, isInline);
    }
 }

+// Debug dump; NULL declarators (left by earlier errors) are skipped.
+void
+Declaration::Print(int indent) const {
+    printf("%*cDeclaration: specs [", indent, ' ');
+    declSpecs->Print();
+    printf("], declarators:\n");
+    for (unsigned int i = 0 ; i < declarators.size(); ++i)
+        if (declarators[i] != NULL) declarators[i]->Print(indent+4);
+}
+
 ///////////////////////////////////////////////////////////////////////////

 void
@@ -539,7 +601,7 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
        DeclSpecs ds(type);
        if (type->IsUniformType()) 
            ds.typeQualifiers |= TYPEQUAL_UNIFORM;
-        else
+        else if (type->IsVaryingType())
            ds.typeQualifiers |= TYPEQUAL_VARYING;

        for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
--- a/decl.h
+++ b/decl.h
@@ -153,10 +153,12 @@ public:
        declarator and symbols for its arguments in *args. */
    Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);

+    Symbol *GetSymbolForFunctionParameter(int paramNum) const;
+
    /** Returns the symbol associated with the declarator. */
    Symbol *GetSymbol() const;

-    void Print() const;
+    void Print(int indent) const;

    /** Position of the declarator in the source program. */
    const SourcePos pos;
@@ -199,7 +201,7 @@ public:
    Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
    Declaration(DeclSpecs *ds, Declarator *d);

-    void Print() const;
+    void Print(int indent) const;

    /** This method walks through all of the Declarators in a declaration
        and returns a fully-initialized Symbol and (possibly) and
@@ -208,6 +210,10 @@ public:
        Declarator representation.) */
    std::vector<VariableDeclaration> GetVariableDeclarations() const;

+    /** For any function declarations in the Declaration, add the
+        declaration to the module. */
+    void DeclareFunctions();
+
    DeclSpecs *declSpecs;
    std::vector<Declarator *> declarators;
 };
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,43 @@
+=== v1.1.3 === (20 January 2012)
+
+With this release, the language now supports "switch" statements, with the
+same semantics and syntax as in C.
+
+This release includes fixes for two important performance related issues:
+the quality of code generated for "foreach" statements has been
+substantially improved (https://github.com/ispc/ispc/issues/151), and a
+performance regression with code for "gathers" that was introduced in
+v1.1.2 has been fixed in this release. 
+
+A number of other small bugs were fixed in this release as well, including
+one where invalid memory would sometimes be incorrectly accessed
+(https://github.com/ispc/ispc/issues/160).
+
+Thanks to Jean-Luc Duprat for a number of patches that improve support for
+building on various platforms, and to Pierre-Antoine Lacaze for patches so
+that ispc builds under MinGW.
+
+=== v1.1.2 === (9 January 2012)
+
+The major new feature in this release is support for "generic" C++
+vectorized output; in other words, ispc can emit C++ code that corresponds
+to the vectorized computation that the ispc program represents.  See the
+examples/intrinsics directory in the ispc distribution for two example
+implementations of the set of functions that must be provided to map the
+vector calls generated by ispc to target specific functions.
+
+ispc now has partial support for 'goto' statements; specifically, goto is
+allowed if any enclosing control flow statements (if/for/while/do) have
+'uniform' test expressions, but not if they have 'varying' tests.
+
+A number of improvements have been made to the code generated for gathers
+and scatters--one of them (better matching x86's "free" scale by 2/4/8 for
+addressing calculations) improved the performance of the noise example by
+14%.
+
+Many small bugs have been fixed in this release as well, including issue
+numbers 138, 129, 135, 127, 149, and 142.
+
 === v1.1.1 === (15 December 2011)

 This release doesn't include any significant new functionality, but does
--- a/docs/build.sh
+++ b/docs/build.sh
@@ -2,11 +2,11 @@

 for i in ispc perfguide faq; do
    rst2html.py --template=template.txt --link-stylesheet \
-        --stylesheet-path=css/style.css $i.txt > $i.html
+        --stylesheet-path=css/style.css $i.rst > $i.html
 done

 rst2html.py --template=template-perf.txt --link-stylesheet \
-        --stylesheet-path=css/style.css perf.txt > perf.html
+        --stylesheet-path=css/style.css perf.rst > perf.html

 #rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
 #pdflatex ispc.tex
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -1,10 +1,10 @@
-=============================================================
-Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
-=============================================================
+=====================================
+Frequently Asked Questions About ispc
+=====================================

 This document includes a number of frequently (and not frequently) asked
 questions about ispc, the Intel® SPMD Program Compiler.  The source to this
-document is in the file ``docs/faq.txt`` in the ``ispc`` source
+document is in the file ``docs/faq.rst`` in the ``ispc`` source
 distribution.

 * Understanding ispc's Output
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -99,7 +99,9 @@ Contents:
  + `Control Flow`_

    * `Conditional Statements: "if"`_
+    * `Conditional Statements: "switch"`_
    * `Basic Iteration Statements: "for", "while", and "do"`_
+    * `Unstructured Control Flow: "goto"`_
    * `"Coherent" Control Flow Statements: "cif" and Friends`_
    * `Parallel Iteration Statements: "foreach" and "foreach_tiled"`_
    * `Parallel Iteration with "programIndex" and "programCount"`_
@@ -1140,7 +1142,7 @@ in C:

 * Expression syntax and basic types
 * Syntax for variable declarations
-* Control flow structures: if, for, while, do
+* Control flow structures: ``if``, ``for``, ``while``, ``do``, and ``switch``.
 * Pointers, including function pointers, ``void *``, and C's array/pointer
  duality (arrays are converted to pointers when passed to functions, etc.)
 * Structs and arrays
@@ -1184,7 +1186,7 @@ but are likely to be supported in future releases:
  ``int64`` types
 * Character constants
 * String constants and arrays of characters as strings
-* ``switch`` and ``goto`` statements
+* ``goto`` statements are partially supported (see `Unstructured Control Flow: "goto"`_)
 * ``union`` types
 * Bitfield members of ``struct`` types
 * Variable numbers of arguments to functions
@@ -1245,6 +1247,18 @@ Here are three ways of specifying the integer value "15":
   int fifteen_hex     = 0xf;
   int fifteen_binary  = 0b1111;

+A number of suffixes can be provided with integer numeric constants.
+First, "u" denotes that the constant is unsigned, and "ll" denotes a 64-bit
+integer constant (while "l" denotes a 32-bit integer constant).  It is also
+possible to denote units of 1024, 1024*1024, or 1024*1024*1024 with the
+SI-inspired suffixes "k", "M", and "G" respectively:
+
+::
+
+   int two_kb = 2k;   // 2048
+   int two_megs = 2M; // 2 * 1024 * 1024
+   int one_gig = 1G;  // 1024 * 1024 * 1024
+
 Floating-point constants can be specified in one of three ways.  First,
 they may be a sequence of zero or more digits from 0 to 9, followed by a
 period, followed by zero or more digits from 0 to 9. (There must be at
@@ -1980,6 +1994,31 @@ executes if the condition is false.
    else
        x *= 2.;

+Conditional Statements: "switch"
+--------------------------------
+
+The ``switch`` conditional statement is also available, again with the same
+behavior as in C; the expression used in the ``switch`` must be of integer
+type (but it can be uniform or varying).  As in C, if there is no ``break``
+statement at the end of the code for a given case, execution "falls
+through" to the following case.  These features are demonstrated in the
+code below.
+
+::
+
+    int x = ...;
+    switch (x) {
+    case 0:
+    case 1:
+        foo(x);
+        /* fall through */
+    case 5:
+        x = 0;
+        break;
+    default:
+        x *= x;
+    }
+
 Basic Iteration Statements: "for", "while", and "do"
 ----------------------------------------------------

@@ -2005,6 +2044,37 @@ one of them executes a ``continue`` statement, other program instances
 executing code in the loop body that didn't execute the ``continue`` will
 be unaffected by it.

+Unstructured Control Flow: "goto"
+---------------------------------
+
+``goto`` statements are allowed in ``ispc`` programs under limited
+circumstances; specifically, only when the compiler can determine that if
+any program instance executes a ``goto`` statement, then all of the program
+instances will be running at that statement, such that all will follow the
+``goto``.
+
+Put another way: it's illegal for there to be "varying" control flow
+statements in scopes that enclose a ``goto`` statement.  An error is issued
+if a ``goto`` is used in this situation.
+
+The syntax for adding labels to ``ispc`` programs and jumping to them with
+``goto`` is the same as in C.  The following code shows a ``goto`` based
+equivalent of a ``for`` loop where the induction variable ``i`` goes from
+zero to ten.
+
+::
+
+      uniform int i = 0;
+    check:
+      if (i > 10)
+          goto done;
+      // loop body
+      ++i;
+      goto check;
+    done:
+      // ...
+
+
 "Coherent" Control Flow Statements: "cif" and Friends
 -----------------------------------------------------

@@ -3374,12 +3444,27 @@ pointer types.
 System Information
 ------------------

-A routine is available to find the number of CPU cores available in the
-system:
+The value of a high-precision hardware clock counter is returned by the
+``clock()`` routine; its value increments by one each processor cycle.
+Thus, taking the difference between the values returned by ``clock()`` at
+different points in program execution gives the number of cycles between
+those points in the program.

 ::

-    int num_cores()
+    uniform int64 clock()
+
+Note that ``clock()`` flushes the processor pipeline.  It has an overhead
+of a hundred or so cycles, so for very fine-grained measurements, it may be
+worthwhile to measure the cost of calling ``clock()`` and subtracting that
+value from reported results.
+    
+A routine is also available to find the number of CPU cores available in
+the system:
+
+::
+
+    uniform int num_cores()

 This value can be useful for adapting the granularity of parallel task
 decomposition depending on the number of processors in the system.
--- a/docs/perf.rst
+++ b/docs/perf.rst
--- a/docs/perfguide.rst
+++ b/docs/perfguide.rst
--- a/docs/template-perf.txt
+++ b/docs/template-perf.txt
@@ -45,8 +45,7 @@
              developers mailing list</a></li>
              <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
              <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
-              <li><a href="doxygen/index.html">Doxygen documentation of
-              <tt>ispc</tt> source code</a></li>
+              <li><a href="doxygen/index.html">Doxygen</a></li>
            </ul>
        </div>
      </div>
--- a/docs/template.txt
+++ b/docs/template.txt
@@ -45,8 +45,7 @@
              developers mailing list</a></li>
              <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
              <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
-              <li><a href="doxygen/index.html">Doxygen documentation of
-              <tt>ispc</tt> source code</a></li>
+              <li><a href="doxygen/index.html">Doxygen</a></li>
            </ul>
        </div>
      </div>
--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.

-PROJECT_NUMBER         = 1.1.1
+PROJECT_NUMBER         = 1.1.3

 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -82,7 +82,7 @@ static inline void vnormalize(vec &v) {
 }


-static inline void
+static void
 ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);
@@ -124,7 +124,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
 }


-static inline void
+static void
 orthoBasis(vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
@@ -147,7 +147,7 @@ orthoBasis(vec basis[3], vec n) {
 }


-static inline float
+static float
 ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3], 
                  RNGState &rngstate) {
    float eps = 0.0001f;
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -14,13 +14,13 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ ao

-ao: dirs objs/ao.o objs/instrument.o objs/ao_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread
+ao: objs/ao.o objs/instrument.o objs/ao_ispc.o ../tasksys.cpp
+	$(CXX) $(CXXFLAGS) -o $@ $^ -lm -lpthread

-objs/%.o: %.cpp
+objs/%.o: %.cpp dirs
 	$(CXX) $< $(CXXFLAGS) -c -o $@

 objs/ao.o: objs/ao_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
-	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
+objs/%_ispc.h objs/%_ispc.o: %.ispc dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_instrumented_ispc.h
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -251,6 +251,14 @@ static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) {       \
    return cond ? a : b;                                            \
 }

+#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP)                         \
+static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {                   \
+   TYPE ret;                                                        \
+   for (int i = 0; i < 16; ++i)                                     \
+       ret.v[i] = (CAST)(a.v[i]) OP b;                              \
+   return ret;                                                      \
+}
+
 #define SMEAR(VTYPE, NAME, STYPE)               \
 static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {        \
    VTYPE ret;                                  \
@@ -307,6 +315,12 @@ static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) {
    return mask.v;
 }

+static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
+    __vec16_i1 r;
+    r.v = (a.v & b.v) | (~a.v & ~b.v);  // XNOR: a bit is set where a and b agree
+    return r;
+}
+
 static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) {
    __vec16_i1 r;
    r.v = a.v & b.v;
@@ -380,6 +394,10 @@ BINARY_OP_CAST(__vec16_i8, int8_t,  __srem, %)
 BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>)
 BINARY_OP_CAST(__vec16_i8, int8_t,  __ashr, >>)

+SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
+SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
+SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)
+
 CMP_OP(__vec16_i8, int8_t,  __equal, ==)
 CMP_OP(__vec16_i8, int8_t,  __not_equal, !=)
 CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
@@ -419,6 +437,10 @@ BINARY_OP_CAST(__vec16_i16, int16_t,  __srem, %)
 BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>)
 BINARY_OP_CAST(__vec16_i16, int16_t,  __ashr, >>)

+SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
+SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
+SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)
+
 CMP_OP(__vec16_i16, int16_t,  __equal, ==)
 CMP_OP(__vec16_i16, int16_t,  __not_equal, !=)
 CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
@@ -458,6 +480,10 @@ BINARY_OP_CAST(__vec16_i32, int32_t,  __srem, %)
 BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>)
 BINARY_OP_CAST(__vec16_i32, int32_t,  __ashr, >>)

+SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
+SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
+SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)
+
 CMP_OP(__vec16_i32, int32_t,  __equal, ==)
 CMP_OP(__vec16_i32, int32_t,  __not_equal, !=)
 CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
@@ -497,6 +523,10 @@ BINARY_OP_CAST(__vec16_i64, int64_t,  __srem, %)
 BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>)
 BINARY_OP_CAST(__vec16_i64, int64_t,  __ashr, >>)

+SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
+SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
+SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
+
 CMP_OP(__vec16_i64, int64_t,  __equal, ==)
 CMP_OP(__vec16_i64, int64_t,  __not_equal, !=)
 CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
@@ -932,7 +962,7 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
 ///////////////////////////////////////////////////////////////////////////
 // masked load/store

-static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
+static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
                                              __vec16_i1 mask) {
    __vec16_i8 ret;
    int8_t *ptr = (int8_t *)p;
@@ -942,7 +972,7 @@ static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
    return ret;
 }

-static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
+static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
                                                __vec16_i1 mask) {
    __vec16_i16 ret;
    int16_t *ptr = (int16_t *)p;
@@ -952,7 +982,7 @@ static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
    return ret;
 }

-static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
+static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
                                                __vec16_i1 mask) {
    __vec16_i32 ret;
    int32_t *ptr = (int32_t *)p;
@@ -962,7 +992,7 @@ static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
    return ret;
 }

-static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
+static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
                                                __vec16_i1 mask) {
    __vec16_i64 ret;
    int64_t *ptr = (int64_t *)p;
@@ -972,7 +1002,7 @@ static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
    return ret;
 }

-static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val,
+static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val,
                                         __vec16_i1 mask) {
    int8_t *ptr = (int8_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -980,7 +1010,7 @@ static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val,
            ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val,
+static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val,
                                          __vec16_i1 mask) {
    int16_t *ptr = (int16_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -988,7 +1018,7 @@ static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val,
            ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val,
+static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val,
                                          __vec16_i1 mask) {
    int32_t *ptr = (int32_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -996,7 +1026,7 @@ static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val,
            ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
+static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
                                          __vec16_i1 mask) {
    int64_t *ptr = (int64_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -1004,19 +1034,41 @@ static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
            ptr[i] = val.v[i];
 }

+static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val,
+                                               __vec16_i1 mask) {
+    __masked_store_8(p, val, mask);  // blend variant just forwards to the plain masked store
+}
+
+static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val,
+                                                __vec16_i1 mask) {
+    __masked_store_16(p, val, mask);  // blend variant just forwards to the plain masked store
+}
+
+static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val,
+                                                __vec16_i1 mask) {
+    __masked_store_32(p, val, mask);  // blend variant just forwards to the plain masked store
+}
+
+static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
+                                                __vec16_i1 mask) {
+    __masked_store_64(p, val, mask);  // blend variant just forwards to the plain masked store
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // gather/scatter

 // offsets * offsetScale is in bytes (for all of these)

 #define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                  \
-static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
-                         __vec16_i1 mask) {                             \
+static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset,    \
+                              uint32_t scale, OTYPE constOffset, \
+                              __vec16_i1 mask) {                        \
    VTYPE ret;                                                          \
    int8_t *base = (int8_t *)b;                                         \
    for (int i = 0; i < 16; ++i)                                        \
        if ((mask.v & (1 << i)) != 0) {                                 \
-            STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]);        \
+            STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] +  \
+                                   constOffset.v[i]);                   \
            ret.v[i] = *ptr;                                            \
        }                                                               \
    return ret;                                                         \
@@ -1054,13 +1106,15 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)

 // scatter

-#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                 \
-static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
+#define SCATTER_BASE_VARYINGOFFSET(VTYPE, STYPE, OTYPE, FUNC)           \
+static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset,     \
+                             uint32_t scale, OTYPE constOffset,         \
                             VTYPE val, __vec16_i1 mask) {              \
    int8_t *base = (int8_t *)b;                                         \
    for (int i = 0; i < 16; ++i)                                        \
        if ((mask.v & (1 << i)) != 0) {                                 \
-            STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]);        \
+            STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] +  \
+                                   constOffset.v[i]);                   \
            *ptr = val.v[i];                                            \
        }                                                               \
 }
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -51,8 +51,8 @@
 #define FORCEINLINE __attribute__((always_inline)) inline
 #endif

-//CO#undef FORCEINLINE
-//CO#define FORCEINLINE
+#undef FORCEINLINE
+#define FORCEINLINE

 typedef float __vec1_f;
 typedef double __vec1_d;
@@ -228,6 +228,10 @@ static FORCEINLINE uint32_t __movmsk(__vec4_i1 mask) {
    return _mm_movemask_ps(mask.v);
 }

+static FORCEINLINE __vec4_i1 __equal(__vec4_i1 a, __vec4_i1 b) {
+    return _mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v));  // compare raw mask bits per 32-bit lane
+}
+
 static FORCEINLINE __vec4_i1 __and(__vec4_i1 a, __vec4_i1 b) {
    return _mm_and_ps(a.v, b.v);
 }
@@ -299,6 +303,13 @@ static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, __vec4_i8 b) {
                     _mm_extract_epi8(a.v, 3) << _mm_extract_epi8(b.v, 3));
 }

+static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, int32_t b) {
+    return __vec4_i8(_mm_extract_epi8(a.v, 0) << b,  // every lane uses the same count b
+                     _mm_extract_epi8(a.v, 1) << b,
+                     _mm_extract_epi8(a.v, 2) << b,
+                     _mm_extract_epi8(a.v, 3) << b);
+}
+
 static FORCEINLINE __vec4_i8 __udiv(__vec4_i8 a, __vec4_i8 b) {
    return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) / 
                     (uint8_t)_mm_extract_epi8(b.v, 0),
@@ -354,6 +365,13 @@ static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, __vec4_i8 b) {
                     (uint8_t)_mm_extract_epi8(b.v, 3));
 }

+static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, int32_t b) {
+    return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) >> b,  // uint8_t cast: >> fills with zeros
+                     (uint8_t)_mm_extract_epi8(a.v, 1) >> b,
+                     (uint8_t)_mm_extract_epi8(a.v, 2) >> b,
+                     (uint8_t)_mm_extract_epi8(a.v, 3) >> b);
+}
+
 static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
    return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >>
                     (int8_t)_mm_extract_epi8(b.v, 0),
@@ -365,6 +383,13 @@ static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
                     (int8_t)_mm_extract_epi8(b.v, 3));
 }

+static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, int32_t b) {
+    return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >> b,  // int8_t cast: lane is sign-extended before >>
+                     (int8_t)_mm_extract_epi8(a.v, 1) >> b,
+                     (int8_t)_mm_extract_epi8(a.v, 2) >> b,
+                     (int8_t)_mm_extract_epi8(a.v, 3) >> b);
+}
+
 static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) {
    __m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
    return __vec4_i1(_mm_extract_epi8(cmp, 0),
@@ -543,6 +568,10 @@ static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, __vec4_i16 b) {
                      _mm_extract_epi16(a.v, 3) << _mm_extract_epi16(b.v, 3));
 }

+static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, int32_t b) {
+    return _mm_sll_epi16(a.v, _mm_set_epi32(0, 0, 0, b));  // b sits in the low element of the count operand
+}
+
 static FORCEINLINE __vec4_i16 __udiv(__vec4_i16 a, __vec4_i16 b) {
    return __vec4_i16((uint16_t)_mm_extract_epi16(a.v, 0) /
                      (uint16_t)_mm_extract_epi16(b.v, 0),
@@ -598,6 +627,10 @@ static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, __vec4_i16 b) {
                      (uint16_t)_mm_extract_epi16(b.v, 3));
 }

+static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, int32_t b) {
+    return _mm_srl_epi16(a.v, _mm_set_epi32(0, 0, 0, b));  // logical right shift, same count b for all lanes
+}
+
 static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
    return __vec4_i16((int16_t)_mm_extract_epi16(a.v, 0) >>
                      (int16_t)_mm_extract_epi16(b.v, 0),
@@ -609,6 +642,10 @@ static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
                      (int16_t)_mm_extract_epi16(b.v, 3));
 }

+static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, int32_t b) {
+    return _mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, b));  // arithmetic right shift, same count b for all lanes
+}
+
 static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) {
    __m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
    return __vec4_i1(_mm_extract_epi16(cmp, 0),
@@ -785,9 +822,6 @@ static FORCEINLINE __vec4_i32 __xor(__vec4_i32 a, __vec4_i32 b) {
 }

 static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, __vec4_i32 b) {
-    // FIXME: if we can determine at compile time that b has the same value
-    // across all elements, then we can use _mm_sll_epi32.
-
    /* fixme: llvm generates thie code for shift left, which is presumably
       more efficient than doing each component individually as below.

@@ -809,57 +843,92 @@ _f___ii:                                ## @f___ii
        ret

     */
-    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) << _mm_extract_epi32(b.v, 0),
-                      (uint32_t)_mm_extract_epi32(a.v, 1) << _mm_extract_epi32(b.v, 1),
-                      (uint32_t)_mm_extract_epi32(a.v, 2) << _mm_extract_epi32(b.v, 2),
-                      (uint32_t)_mm_extract_epi32(a.v, 3) << _mm_extract_epi32(b.v, 3));
+    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) << 
+                      _mm_extract_epi32(b.v, 0),
+                      (uint32_t)_mm_extract_epi32(a.v, 1) << 
+                      _mm_extract_epi32(b.v, 1),
+                      (uint32_t)_mm_extract_epi32(a.v, 2) << 
+                      _mm_extract_epi32(b.v, 2),
+                      (uint32_t)_mm_extract_epi32(a.v, 3) << 
+                      _mm_extract_epi32(b.v, 3));
+}
+
+static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, int32_t b) { // shift-left overload taking one scalar count b
+    return _mm_sll_epi32(a.v, _mm_set_epi32(0, 0, 0, b)); // b goes in the lowest 32-bit element; the others are zero
 }

 static FORCEINLINE __vec4_i32 __udiv(__vec4_i32 a, __vec4_i32 b) {
-    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) / (uint32_t)_mm_extract_epi32(b.v, 0),
-                      (uint32_t)_mm_extract_epi32(a.v, 1) / (uint32_t)_mm_extract_epi32(b.v, 1),
-                      (uint32_t)_mm_extract_epi32(a.v, 2) / (uint32_t)_mm_extract_epi32(b.v, 2),
-                      (uint32_t)_mm_extract_epi32(a.v, 3) / (uint32_t)_mm_extract_epi32(b.v, 3));
+    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) / // element 0: both operands cast to uint32_t, so the division is unsigned
+                      (uint32_t)_mm_extract_epi32(b.v, 0),
+                      (uint32_t)_mm_extract_epi32(a.v, 1) / // element 1
+                      (uint32_t)_mm_extract_epi32(b.v, 1),
+                      (uint32_t)_mm_extract_epi32(a.v, 2) / // element 2
+                      (uint32_t)_mm_extract_epi32(b.v, 2),
+                      (uint32_t)_mm_extract_epi32(a.v, 3) / // element 3
+                      (uint32_t)_mm_extract_epi32(b.v, 3));
 }

 static FORCEINLINE __vec4_i32 __sdiv(__vec4_i32 a, __vec4_i32 b) {
-    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) / (int32_t)_mm_extract_epi32(b.v, 0),
-                      (int32_t)_mm_extract_epi32(a.v, 1) / (int32_t)_mm_extract_epi32(b.v, 1),
-                      (int32_t)_mm_extract_epi32(a.v, 2) / (int32_t)_mm_extract_epi32(b.v, 2),
-                      (int32_t)_mm_extract_epi32(a.v, 3) / (int32_t)_mm_extract_epi32(b.v, 3));
+    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) / // element 0: both operands cast to int32_t, so the division is signed
+                      (int32_t)_mm_extract_epi32(b.v, 0),
+                      (int32_t)_mm_extract_epi32(a.v, 1) / // element 1
+                      (int32_t)_mm_extract_epi32(b.v, 1),
+                      (int32_t)_mm_extract_epi32(a.v, 2) / // element 2
+                      (int32_t)_mm_extract_epi32(b.v, 2),
+                      (int32_t)_mm_extract_epi32(a.v, 3) / // element 3
+                      (int32_t)_mm_extract_epi32(b.v, 3));
 }

 static FORCEINLINE __vec4_i32 __urem(__vec4_i32 a, __vec4_i32 b) {
-    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) % (uint32_t)_mm_extract_epi32(b.v, 0),
-                      (uint32_t)_mm_extract_epi32(a.v, 1) % (uint32_t)_mm_extract_epi32(b.v, 1),
-                      (uint32_t)_mm_extract_epi32(a.v, 2) % (uint32_t)_mm_extract_epi32(b.v, 2),
-                      (uint32_t)_mm_extract_epi32(a.v, 3) % (uint32_t)_mm_extract_epi32(b.v, 3));
+    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) % // element 0: both operands cast to uint32_t, so the remainder is unsigned
+                      (uint32_t)_mm_extract_epi32(b.v, 0),
+                      (uint32_t)_mm_extract_epi32(a.v, 1) % // element 1
+                      (uint32_t)_mm_extract_epi32(b.v, 1),
+                      (uint32_t)_mm_extract_epi32(a.v, 2) % // element 2
+                      (uint32_t)_mm_extract_epi32(b.v, 2),
+                      (uint32_t)_mm_extract_epi32(a.v, 3) % // element 3
+                      (uint32_t)_mm_extract_epi32(b.v, 3));
 }

 static FORCEINLINE __vec4_i32 __srem(__vec4_i32 a, __vec4_i32 b) {
-    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) % (int32_t)_mm_extract_epi32(b.v, 0),
-                      (int32_t)_mm_extract_epi32(a.v, 1) % (int32_t)_mm_extract_epi32(b.v, 1),
-                      (int32_t)_mm_extract_epi32(a.v, 2) % (int32_t)_mm_extract_epi32(b.v, 2),
-                      (int32_t)_mm_extract_epi32(a.v, 3) % (int32_t)_mm_extract_epi32(b.v, 3));
+    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) % // element 0: both operands cast to int32_t, so the remainder is signed
+                      (int32_t)_mm_extract_epi32(b.v, 0),
+                      (int32_t)_mm_extract_epi32(a.v, 1) % // element 1
+                      (int32_t)_mm_extract_epi32(b.v, 1),
+                      (int32_t)_mm_extract_epi32(a.v, 2) % // element 2
+                      (int32_t)_mm_extract_epi32(b.v, 2),
+                      (int32_t)_mm_extract_epi32(a.v, 3) % // element 3
+                      (int32_t)_mm_extract_epi32(b.v, 3));
 }

 static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, __vec4_i32 b) {
-    // FIXME: if we can determine at compile time that b has the same value
-    // across all elements, e.g. using gcc's __builtin_constant_p, then we
-    // can use _mm_srl_epi32.
-    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0),
-                      (uint32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1),
-                      (uint32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2),
-                      (uint32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3));
+    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >> // element 0: value cast to uint32_t before >>, each element uses its own count from b
+                      _mm_extract_epi32(b.v, 0),
+                      (uint32_t)_mm_extract_epi32(a.v, 1) >> // element 1
+                      _mm_extract_epi32(b.v, 1),
+                      (uint32_t)_mm_extract_epi32(a.v, 2) >> // element 2
+                      _mm_extract_epi32(b.v, 2),
+                      (uint32_t)_mm_extract_epi32(a.v, 3) >> // element 3
+                      _mm_extract_epi32(b.v, 3));
+}
+
+static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, int32_t b) { // right-shift overload taking one scalar count b
+    return _mm_srl_epi32(a.v, _mm_set_epi32(0, 0, 0, b)); // srl = logical shift; b goes in the lowest 32-bit element
 }

 static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, __vec4_i32 b) {
-    // FIXME: if we can determine at compile time that b has the same value
-    // across all elements, then we can use _mm_sra_epi32.
-    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0),
-                      (int32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1),
-                      (int32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2),
-                      (int32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3));
+    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >> // element 0: value cast to int32_t before >>, each element uses its own count from b
+                      _mm_extract_epi32(b.v, 0),
+                      (int32_t)_mm_extract_epi32(a.v, 1) >> // element 1
+                      _mm_extract_epi32(b.v, 1),
+                      (int32_t)_mm_extract_epi32(a.v, 2) >> // element 2
+                      _mm_extract_epi32(b.v, 2),
+                      (int32_t)_mm_extract_epi32(a.v, 3) >> // element 3
+                      _mm_extract_epi32(b.v, 3));
+}
+
+static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, int32_t b) { // right-shift overload taking one scalar count b
+    return _mm_sra_epi32(a.v, _mm_set_epi32(0, 0, 0, b)); // sra = arithmetic shift; b goes in the lowest 32-bit element
 }

 static FORCEINLINE __vec4_i1 __equal(__vec4_i32 a, __vec4_i32 b) {
@@ -1012,6 +1081,12 @@ static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, __vec4_i64 b) {
                      _mm_extract_epi64(a.v[1], 1) << _mm_extract_epi64(b.v[1], 1));
 }

+static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, int32_t b) { // shift-left overload taking one scalar count b
+    __m128i amt = _mm_set_epi32(0, 0, 0, b); // count vector built once: b in the lowest 32-bit element, zeros above
+    return __vec4_i64(_mm_sll_epi64(a.v[0], amt), // a.v[0] and a.v[1] are shifted separately with the same count
+                      _mm_sll_epi64(a.v[1], amt));
+}
+
 static FORCEINLINE __vec4_i64 __udiv(__vec4_i64 a, __vec4_i64 b) {
    return __vec4_i64((uint64_t)_mm_extract_epi64(a.v[0], 0) /
                      (uint64_t)_mm_extract_epi64(b.v[0], 0),
@@ -1067,6 +1142,12 @@ static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, __vec4_i64 b) {
                      (uint64_t)_mm_extract_epi64(b.v[1], 1));
 }

+static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, int32_t b) { // right-shift overload taking one scalar count b
+    __m128i amt = _mm_set_epi32(0, 0, 0, b); // count vector built once: b in the lowest 32-bit element, zeros above
+    return __vec4_i64(_mm_srl_epi64(a.v[0], amt), // srl = logical shift; a.v[0] and a.v[1] use the same count
+                      _mm_srl_epi64(a.v[1], amt));
+}
+
 static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
    return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >>
                      (int64_t)_mm_extract_epi64(b.v[0], 0),
@@ -1078,6 +1159,13 @@ static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
                      (int64_t)_mm_extract_epi64(b.v[1], 1));
 }

+static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, int32_t b) { // right-shift overload taking one scalar count b
+    return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >> b, // each 64-bit element is extracted, cast to int64_t and shifted with scalar >>
+                      (int64_t)_mm_extract_epi64(a.v[0], 1) >> b, // NOTE(review): presumably scalar because no 64-bit _mm_sra intrinsic is used in this file - confirm
+                      (int64_t)_mm_extract_epi64(a.v[1], 0) >> b,
+                      (int64_t)_mm_extract_epi64(a.v[1], 1) >> b);
+}
+
 static FORCEINLINE __vec4_i1 __equal(__vec4_i64 a, __vec4_i64 b) {
    __m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]);
    __m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]);
@@ -2324,7 +2412,7 @@ static FORCEINLINE uint64_t __reduce_max_uint64(__vec4_i64 v) {
 ///////////////////////////////////////////////////////////////////////////
 // masked load/store

-static FORCEINLINE __vec4_i8 __masked_load_8(unsigned char *p, 
+static FORCEINLINE __vec4_i8 __masked_load_8(void *p, 
                                             __vec4_i1 mask) {
    int8_t r[4];
    int8_t *ptr = (int8_t *)p;
@@ -2344,7 +2432,7 @@ static FORCEINLINE __vec4_i8 __masked_load_8(unsigned char *p,
    return __vec4_i8(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_i16 __masked_load_16(unsigned char *p, 
+static FORCEINLINE __vec4_i16 __masked_load_16(void *p, 
                                               __vec4_i1 mask) {
    int16_t r[4];
    int16_t *ptr = (int16_t *)p;
@@ -2368,7 +2456,7 @@ static FORCEINLINE __vec4_i16 __masked_load_16(unsigned char *p,
    return __vec4_i16(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_i32 __masked_load_32(unsigned char *p, 
+static FORCEINLINE __vec4_i32 __masked_load_32(void *p, 
                                               __vec4_i1 mask) {
    __m128i r = _mm_set_epi32(0, 0, 0, 0);
    int32_t *ptr = (int32_t *)p;
@@ -2391,7 +2479,7 @@ static FORCEINLINE __vec4_i32 __masked_load_32(unsigned char *p,
    return r;
 }

-static FORCEINLINE __vec4_i64 __masked_load_64(unsigned char *p, 
+static FORCEINLINE __vec4_i64 __masked_load_64(void *p, 
                                               __vec4_i1 mask) {
    uint64_t r[4];
    uint64_t *ptr = (uint64_t *)p;
@@ -2414,7 +2502,7 @@ static FORCEINLINE __vec4_i64 __masked_load_64(unsigned char *p,
    return __vec4_i64(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE void __masked_store_8(unsigned char *p, __vec4_i8 val, 
+static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val, 
                                         __vec4_i1 mask) {
    int8_t *ptr = (int8_t *)p;

@@ -2435,7 +2523,8 @@ static FORCEINLINE void __masked_store_8(unsigned char *p, __vec4_i8 val,
        ptr[3] = _mm_extract_epi8(val.v, 3);
 }

-static FORCEINLINE void __masked_store_16(unsigned char *p, __vec4_i16 val, __vec4_i1 mask) {
+static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val,
+                                          __vec4_i1 mask) {
    int16_t *ptr = (int16_t *)p;

    uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2455,7 +2544,7 @@ static FORCEINLINE void __masked_store_16(unsigned char *p, __vec4_i16 val, __ve
        ptr[3] = _mm_extract_epi16(val.v, 3);
 }

-static FORCEINLINE void __masked_store_32(unsigned char *p, __vec4_i32 val, 
+static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val, 
                                          __vec4_i1 mask) {
    int32_t *ptr = (int32_t *)p;
    uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2475,7 +2564,7 @@ static FORCEINLINE void __masked_store_32(unsigned char *p, __vec4_i32 val,
        ptr[3] = _mm_extract_epi32(val.v, 3);
 }

-static FORCEINLINE void __masked_store_64(unsigned char *p, __vec4_i64 val, 
+static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val, 
                                          __vec4_i1 mask) {
    int64_t *ptr = (int64_t *)p;
    uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2495,58 +2584,82 @@ static FORCEINLINE void __masked_store_64(unsigned char *p, __vec4_i64 val,
        ptr[3] = _mm_extract_epi64(val.v[1], 1);
 }

+static FORCEINLINE void __masked_store_blend_8(void *p, __vec4_i8 val, // "blend"-named entry point for 8-bit masked stores
+                                               __vec4_i1 mask) {
+    __masked_store_8(p, val, mask); // forwards all three arguments unchanged to __masked_store_8
+}
+
+static FORCEINLINE void __masked_store_blend_16(void *p, __vec4_i16 val, // "blend"-named entry point for 16-bit masked stores
+                                                __vec4_i1 mask) {
+    __masked_store_16(p, val, mask); // forwards all three arguments unchanged to __masked_store_16
+}
+
+static FORCEINLINE void __masked_store_blend_32(void *p, __vec4_i32 val, // "blend"-named entry point for 32-bit masked stores
+                                                __vec4_i1 mask) {
+    // FIXME: do a load, blendvps, store here...
+    __masked_store_32(p, val, mask); // until the FIXME is done, this is just the plain masked store
+}
+
+static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val, // "blend"-named entry point for 64-bit masked stores
+                                                __vec4_i1 mask) {
+    // FIXME: do a 2x (load, blendvps, store) here...
+    __masked_store_64(p, val, mask); // until the FIXME is done, this is just the plain masked store
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // gather/scatter
 // offsets * offsetScale is in bytes (for all of these)

 template<typename RetVec, typename RetScalar>
 static FORCEINLINE RetVec
-lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
-                     __vec4_i32 offsets, uint32_t scale, __vec4_i1 mask) {
+lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets, 
+                     uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
    RetScalar r[4];
 #if 1
    // "Fast gather" trick...
    offsets = __select(mask, offsets, __smear_i32(0));
-    int offset = scale * _mm_extract_epi32(offsets.v, 0);
+    constOffset = __select(mask, constOffset, __smear_i32(0));
+
+    int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
    RetScalar *ptr = (RetScalar *)(p + offset);
    r[0] = *ptr;

-    offset = scale * _mm_extract_epi32(offsets.v, 1);
+    offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
    ptr = (RetScalar *)(p + offset);
    r[1] = *ptr;

-    offset = scale * _mm_extract_epi32(offsets.v, 2);
+    offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
    ptr = (RetScalar *)(p + offset);
    r[2] = *ptr;

-    offset = scale * _mm_extract_epi32(offsets.v, 3);
+    offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
    ptr = (RetScalar *)(p + offset);
    r[3] = *ptr;
 #else
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 0);
+        int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[0] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 1);
+        int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[1] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 2);
+        int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[2] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 3);
+        int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[3] = *ptr;
    }
@@ -2554,54 +2667,57 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
    return RetVec(r[0], r[1], r[2], r[3]);
 }

+
 template<typename RetVec, typename RetScalar>
 static FORCEINLINE RetVec
 lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
-                     uint32_t scale, __vec4_i1 mask) {
+                     uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
    RetScalar r[4];
 #if 1
    // "Fast gather" trick...
    offsets = __select(mask, offsets, __smear_i64(0));
-    int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
+    constOffset = __select(mask, constOffset, __smear_i64(0));
+
+    int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
    RetScalar *ptr = (RetScalar *)(p + offset);
    r[0] = *ptr;

-    offset = scale * _mm_extract_epi64(offsets.v[0], 1);
+    offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
    ptr = (RetScalar *)(p + offset);
    r[1] = *ptr;

-    offset = scale * _mm_extract_epi64(offsets.v[1], 0);
+    offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
    ptr = (RetScalar *)(p + offset);
    r[2] = *ptr;

-    offset = scale * _mm_extract_epi64(offsets.v[1], 1);
+    offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
    ptr = (RetScalar *)(p + offset);
    r[3] = *ptr;
 #else
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[0] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[1] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[2] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[3] = *ptr;
    }
@@ -2612,80 +2728,89 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,

 static FORCEINLINE __vec4_i8
 __gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
-                           uint32_t scale,  __vec4_i1 mask) {
+                           uint32_t scale,  __vec4_i32 constOffset, __vec4_i1 mask) { // constOffset is passed straight through to lGatherBaseOffsets32
     return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale, 
-                                mask);
+                                constOffset, mask); // the __vec4_i8()/uint8_t() arguments only select the helper's template types
 }

 static FORCEINLINE __vec4_i8
 __gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
-                           uint32_t scale, __vec4_i1 mask) {
+                           uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { // constOffset is passed straight through to lGatherBaseOffsets64
     return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale, 
-                                mask);
+                                constOffset, mask); // the __vec4_i8()/uint8_t() arguments only select the helper's template types
 }

 static FORCEINLINE __vec4_i16
 __gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { // constOffset is passed straight through to lGatherBaseOffsets32
     return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale, 
-                                mask);
+                                constOffset, mask); // the __vec4_i16()/uint16_t() arguments only select the helper's template types
 }

 static FORCEINLINE __vec4_i16
 __gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
-                             uint32_t scale, __vec4_i1 mask) {
+                             uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { // constOffset is passed straight through to lGatherBaseOffsets64
     return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale, 
-                                mask);
+                                constOffset, mask); // the __vec4_i16()/uint16_t() arguments only select the helper's template types
 }

 static FORCEINLINE __vec4_i32
-__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
+                            __vec4_i32 constOffset, __vec4_i1 mask) {
    __m128i r = _mm_set_epi32(0, 0, 0, 0);
 #if 1
    // "Fast gather"...
    offsets = __select(mask, offsets, __smear_i32(0));
+    constOffset = __select(mask, constOffset, __smear_i32(0));

-    int offset = scale * _mm_extract_epi32(offsets.v, 0);
+    int offset = scale * _mm_extract_epi32(offsets.v, 0) +
+        _mm_extract_epi32(constOffset.v, 0);
    uint32_t *ptr = (uint32_t *)(p + offset);
    r = _mm_insert_epi32(r, *ptr, 0);

-    offset = scale * _mm_extract_epi32(offsets.v, 1);
+    offset = scale * _mm_extract_epi32(offsets.v, 1) +
+        _mm_extract_epi32(constOffset.v, 1);
    ptr = (uint32_t *)(p + offset);
    r = _mm_insert_epi32(r, *ptr, 1);

-    offset = scale * _mm_extract_epi32(offsets.v, 2);
+    offset = scale * _mm_extract_epi32(offsets.v, 2) +
+        _mm_extract_epi32(constOffset.v, 2);
    ptr = (uint32_t *)(p + offset);
    r = _mm_insert_epi32(r, *ptr, 2);

-    offset = scale * _mm_extract_epi32(offsets.v, 3);
+    offset = scale * _mm_extract_epi32(offsets.v, 3) +
+        _mm_extract_epi32(constOffset.v, 3);
    ptr = (uint32_t *)(p + offset);
    r = _mm_insert_epi32(r, *ptr, 3);
 #else
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 0);
+        int offset = scale * _mm_extract_epi32(offsets.v, 0) +
+            _mm_extract_epi32(constOffset.v, 0);
        uint32_t *ptr = (uint32_t *)(p + offset);
        r = _mm_insert_epi32(r, *ptr, 0);
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 1);
+        int offset = scale * _mm_extract_epi32(offsets.v, 1) +
+            _mm_extract_epi32(constOffset.v, 1);
        uint32_t *ptr = (uint32_t *)(p + offset);
        r = _mm_insert_epi32(r, *ptr, 1);
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 2);
+        int offset = scale * _mm_extract_epi32(offsets.v, 2) +
+            _mm_extract_epi32(constOffset.v, 2);
        uint32_t *ptr = (uint32_t *)(p + offset);
        r = _mm_insert_epi32(r, *ptr, 2);
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 3);
+        int offset = scale * _mm_extract_epi32(offsets.v, 3) +
+            _mm_extract_epi32(constOffset.v, 3);
        uint32_t *ptr = (uint32_t *)(p + offset);
        r = _mm_insert_epi32(r, *ptr, 3);
    }
@@ -2695,23 +2820,23 @@ __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,

 static FORCEINLINE __vec4_i32
 __gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) { // delta is handed to the helper as its constOffset argument
     return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale, 
-                                mask);
+                                delta, mask); // the __vec4_i32()/uint32_t() arguments only select the helper's template types
 }

 static FORCEINLINE __vec4_i64
 __gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) { // delta is handed to the helper as its constOffset argument
     return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale, 
-                                mask);
+                                delta, mask); // the __vec4_i64()/uint64_t() arguments only select the helper's template types
 }

 static FORCEINLINE __vec4_i64
 __gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) { // delta is handed to the helper as its constOffset argument
     return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale, 
-                                mask);
+                                delta, mask); // the __vec4_i64()/uint64_t() arguments only select the helper's template types
 }

 template<typename RetVec, typename RetScalar>
@@ -2858,217 +2983,108 @@ static FORCEINLINE __vec4_i64 __gather64_i64(__vec4_i64 ptrs, __vec4_i1 mask) {

 // scatter
  
-static FORCEINLINE void
-__scatter_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, 
-                            uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
-        *ptr = _mm_extract_epi8(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
-        *ptr = _mm_extract_epi8(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
-        *ptr = _mm_extract_epi8(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
-        *ptr = _mm_extract_epi8(val.v, 3);
-    }
+// Generates the masked 4-wide __scatter_base_offsets{32,64}_<SUFFIX> pair: each
+// lane with a non-zero mask stores EXTRACT(val.v, lane) as a TYPE at
+// base + scale * offsets[lane] + constOffset[lane].  The 32-bit form narrows
+// the unsigned product to int32_t first so negative offsets stay negative.
+#define SCATTER32_64(SUFFIX, TYPE, EXTRACT)                         \
+static FORCEINLINE void                                             \
+__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
+                                   uint32_t scale, __vec4_i32 constOffset, \
+                                   __vec4_##SUFFIX val, __vec4_i1 mask) { \
+    if (_mm_extract_ps(mask.v, 0) != 0) {                               \
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) +      \
+            _mm_extract_epi32(constOffset.v, 0);                        \
+        TYPE *ptr = (TYPE *)(b + offset);                               \
+        *ptr = EXTRACT(val.v, 0);                                       \
+    }                                                                   \
+    if (_mm_extract_ps(mask.v, 1) != 0) {                               \
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) +      \
+            _mm_extract_epi32(constOffset.v, 1);                        \
+        TYPE *ptr = (TYPE *)(b + offset);                               \
+        *ptr = EXTRACT(val.v, 1);                                       \
+    }                                                                   \
+    if (_mm_extract_ps(mask.v, 2) != 0) {                               \
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) +      \
+            _mm_extract_epi32(constOffset.v, 2);                        \
+        TYPE *ptr = (TYPE *)(b + offset);                               \
+        *ptr = EXTRACT(val.v, 2);                                       \
+    }                                                                   \
+    if (_mm_extract_ps(mask.v, 3) != 0) {                               \
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) +      \
+            _mm_extract_epi32(constOffset.v, 3);                        \
+        TYPE *ptr = (TYPE *)(b + offset);                               \
+        *ptr = EXTRACT(val.v, 3);                                       \
+    }                                                                   \
+}                                                                       \
+static FORCEINLINE void                                                \
+__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
+                                  uint32_t scale, __vec4_i64 constOffset, \
+                                  __vec4_##SUFFIX val, __vec4_i1 mask) { \
+    if (_mm_extract_ps(mask.v, 0) != 0) {                              \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +  \
+            _mm_extract_epi64(constOffset.v[0], 0);                    \
+        TYPE *ptr = (TYPE *)(p + offset);                              \
+        *ptr = EXTRACT(val.v, 0);                                      \
+    }                                                                  \
+    if (_mm_extract_ps(mask.v, 1) != 0) {                              \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +  \
+            _mm_extract_epi64(constOffset.v[0], 1);                    \
+        TYPE *ptr = (TYPE *)(p + offset);                              \
+        *ptr = EXTRACT(val.v, 1);                                      \
+    }                                                                  \
+    if (_mm_extract_ps(mask.v, 2) != 0) {                              \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +  \
+            _mm_extract_epi64(constOffset.v[1], 0);                    \
+        TYPE *ptr = (TYPE *)(p + offset);                              \
+        *ptr = EXTRACT(val.v, 2);                                      \
+    }                                                                  \
+    if (_mm_extract_ps(mask.v, 3) != 0) {                              \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +  \
+            _mm_extract_epi64(constOffset.v[1], 1);                    \
+        TYPE *ptr = (TYPE *)(p + offset);                              \
+        *ptr = EXTRACT(val.v, 3);                                      \
+    }                                                                  \
 }

-static FORCEINLINE void
-__scatter_base_offsets64_i8(unsigned char *p, __vec4_i64 offsets, 
-                            uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 0);
-    }

-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 1);
-    }
+SCATTER32_64(i8, int8_t, _mm_extract_epi8)
+SCATTER32_64(i16, int16_t, _mm_extract_epi16)
+SCATTER32_64(i32, int32_t, _mm_extract_epi32)

-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
-                             uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
-        *ptr = _mm_extract_epi16(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
-        *ptr = _mm_extract_epi16(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
-        *ptr = _mm_extract_epi16(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
-        *ptr = _mm_extract_epi16(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets64_i16(unsigned char *p, __vec4_i64 offsets, 
-                             uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets32_i32(unsigned char *b, __vec4_i32 offsets,
-                             uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 0));
-        *ptr = _mm_extract_epi32(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 1));
-        *ptr = _mm_extract_epi32(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 2));
-        *ptr = _mm_extract_epi32(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 3));
-        *ptr = _mm_extract_epi32(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, 
-                             uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 3);
-    }
-}

 static FORCEINLINE void
 __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, 
-                             uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
+                             uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val, 
+                             __vec4_i1 mask) {
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 0);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) +
+            _mm_extract_epi32(constOffset.v, 0);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[0], 0);
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 1);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) +
+            _mm_extract_epi32(constOffset.v, 1);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[0], 1);
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 2);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) +
+            _mm_extract_epi32(constOffset.v, 2);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[1], 0);
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 3);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) +
+            _mm_extract_epi32(constOffset.v, 3);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[1], 1);
    }
@@ -3076,31 +3092,36 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,

 static FORCEINLINE void
 __scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, 
-                             uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
+                             uint32_t scale, __vec4_i64 constOffset,
+                             __vec4_i64 val, __vec4_i1 mask) {
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +
+            _mm_extract_epi64(constOffset.v[0], 0);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[0], 0);
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +
+            _mm_extract_epi64(constOffset.v[0], 1);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[0], 1);
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +
+            _mm_extract_epi64(constOffset.v[1], 0);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[1], 0);
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +
+            _mm_extract_epi64(constOffset.v[1], 1);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[1], 1);
    }
--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -104,8 +104,8 @@ static void generateRay(uniform const float raster2camera[4][4],
 }


-static inline bool BBoxIntersect(const uniform float bounds[2][3], 
-                                 const Ray &ray) {
+static bool BBoxIntersect(const uniform float bounds[2][3], 
+                          const Ray &ray) {
    uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
    uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
    float t0 = ray.mint, t1 = ray.maxt;
@@ -143,7 +143,7 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3],



-static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
+static bool TriIntersect(const Triangle &tri, Ray &ray) {
    uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
    uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
    uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -129,8 +129,8 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
 }


-static inline float Density(float3 Pobj, float3 pMin, float3 pMax, 
-                            uniform float density[], uniform int nVoxels[3]) {
+static float Density(float3 Pobj, float3 pMin, float3 pMax, 
+                     uniform float density[], uniform int nVoxels[3]) {
    if (!Inside(Pobj, pMin, pMax)) 
        return 0;
    // Compute voxel coordinates and offsets for _Pobj_
--- a/expr.cpp
+++ b/expr.cpp
@@ -36,12 +36,22 @@
 */

 #include "expr.h"
+#include "ast.h"
 #include "type.h"
 #include "sym.h"
 #include "ctx.h"
 #include "module.h"
 #include "util.h"
 #include "llvmutil.h"
+#ifndef _MSC_VER
+#include <inttypes.h>
+#endif
+#ifndef PRId64
+#define PRId64 "lld"
+#endif
+#ifndef PRIu64
+#define PRIu64 "llu"
+#endif

 #include <list>
 #include <set>
@@ -224,7 +234,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
            eltType = eltType->GetAsConstType();
        if (Type::Equal(toPointerType, 
                        new PointerType(eltType,
-                                        toPointerType->IsUniformType(),
+                                        toPointerType->GetVariability(),
                                        toPointerType->IsConstType())))
            goto typecast_ok;
        else {
@@ -466,7 +476,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,

 typecast_ok:
    if (expr != NULL)
-        *expr = new TypeCastExpr(toType, *expr, false, pos);
+        *expr = new TypeCastExpr(toType, *expr, pos);
    return true;
 }

@@ -638,6 +648,9 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {

 static llvm::Value *
 lMaskForSymbol(Symbol *baseSym, FunctionEmitContext *ctx) {
+    if (baseSym == NULL)
+        return ctx->GetFullMask();
+
    if (dynamic_cast<const PointerType *>(baseSym->type) != NULL ||
        dynamic_cast<const ReferenceType *>(baseSym->type) != NULL)
        // FIXME: for pointers, we really only want to do this for
@@ -658,10 +671,11 @@ lMaskForSymbol(Symbol *baseSym, FunctionEmitContext *ctx) {
 static void
 lStoreAssignResult(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
                   FunctionEmitContext *ctx, Symbol *baseSym) {
-    Assert(baseSym != NULL &&
+    Assert(baseSym == NULL ||
           baseSym->varyingCFDepth <= ctx->VaryingCFDepth());
    if (!g->opt.disableMaskedStoreToStore &&
        !g->opt.disableMaskAllOnOptimizations &&
+        baseSym != NULL &&
        baseSym->varyingCFDepth == ctx->VaryingCFDepth() &&
        baseSym->storageClass != SC_STATIC &&
        dynamic_cast<const ReferenceType *>(baseSym->type) == NULL &&
@@ -2016,14 +2030,13 @@ AssignExpr::GetValue(FunctionEmitContext *ctx) const {
    ctx->SetDebugPos(pos);

    Symbol *baseSym = lvalue->GetBaseSymbol();
-    // Should be caught during type-checking...
-    assert(baseSym != NULL);

    switch (op) {
    case Assign: {
        llvm::Value *lv = lvalue->GetLValue(ctx);
        if (lv == NULL) {
-            Assert(m->errorCount > 0);
+            Error(lvalue->pos, "Left hand side of assignment expression can't "
+                  "be assigned to.");
            return NULL;
        }
        const Type *lvalueType = lvalue->GetLValueType();
@@ -2146,13 +2159,13 @@ AssignExpr::TypeCheck() {
        }
    }

-    if (lvalue->GetBaseSymbol() == NULL) {
-        Error(lvalue->pos, "Left hand side of assignment statement can't be "
-              "assigned to.");
+    const Type *lhsType = lvalue->GetType();
+    if (lhsType->IsConstType()) {
+        Error(lvalue->pos, "Can't assign to type \"%s\" on left-hand side of "
+              "expression.", lhsType->GetString().c_str());
        return NULL;
    }

-    const Type *lhsType = lvalue->GetType();
    if (dynamic_cast<const PointerType *>(lhsType) != NULL) {
        if (op == AddAssign || op == SubAssign) {
            if (PointerType::IsVoidPointer(lhsType)) {
@@ -2186,12 +2199,6 @@ AssignExpr::TypeCheck() {
    if (rvalue == NULL)
        return NULL;

-    if (lhsType->IsConstType()) {
-        Error(pos, "Can't assign to type \"%s\" on left-hand side of "
-              "expression.", lhsType->GetString().c_str());
-        return NULL;
-    }
-
    // Make sure we're not assigning to a struct that has a constant member
    const StructType *st = dynamic_cast<const StructType *>(lhsType);
    if (st != NULL && lCheckForConstStructMember(pos, st, st))
@@ -2709,7 +2716,7 @@ FunctionCallExpr::TypeCheck() {
                    !(argCouldBeNULL[i] == true &&
                      dynamic_cast<const PointerType *>(paramType) != NULL)) {
                    Error(args->exprs[i]->pos, "Can't convert argument of "
-                          "type \"%s\" to type \"%s\" for funcion call "
+                          "type \"%s\" to type \"%s\" for function call "
                          "argument.", argTypes[i]->GetString().c_str(),
                          paramType->GetString().c_str());
                    return NULL;
@@ -3525,6 +3532,12 @@ VectorMemberExpr::getElementType() const {
 MemberExpr *
 MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos,
                   bool derefLValue) {
+    // FIXME: we need to call TypeCheck() here so that we can call
+    // e->GetType() in the following.  But really we just shouldn't try to
+    // resolve this now but just have a generic MemberExpr type that
+    // handles all cases so that this is unnecessary.
+    e = ::TypeCheck(e);
+
    const Type *exprType;
    if (e == NULL || (exprType = e->GetType()) == NULL)
        return NULL;
@@ -4536,18 +4549,10 @@ ConstExpr::Print() const {
             printf("%f", floatVal[i]);
             break;
         case AtomicType::TYPE_INT64:
-#ifdef ISPC_IS_LINUX
-            printf("%ld", int64Val[i]);
-#else
-            printf("%lld", int64Val[i]);
-#endif
+            printf("%" PRId64, int64Val[i]); // C++11 needs this space
             break;
         case AtomicType::TYPE_UINT64:
-#ifdef ISPC_IS_LINUX
-            printf("%lu", uint64Val[i]);
-#else
-            printf("%llu", uint64Val[i]);
-#endif
+            printf("%" PRIu64, uint64Val[i]); // C++11 needs this space
             break;
         case AtomicType::TYPE_DOUBLE:
             printf("%f", doubleVal[i]);
@@ -4566,11 +4571,10 @@ ConstExpr::Print() const {
 ///////////////////////////////////////////////////////////////////////////
 // TypeCastExpr

-TypeCastExpr::TypeCastExpr(const Type *t, Expr *e, bool pu, SourcePos p) 
+TypeCastExpr::TypeCastExpr(const Type *t, Expr *e, SourcePos p) 
  : Expr(p) {
    type = t;
    expr = e;
-    preserveUniformity = pu;
 }


@@ -5213,7 +5217,7 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
        if (Type::EqualIgnoringConst(arrayAsPtr->GetType(), toPointerType) == false) {
            Assert(Type::EqualIgnoringConst(arrayAsPtr->GetType()->GetAsVaryingType(),
                                            toPointerType) == true);
-            arrayAsPtr = new TypeCastExpr(toPointerType, arrayAsPtr, false, pos);
+            arrayAsPtr = new TypeCastExpr(toPointerType, arrayAsPtr, pos);
            arrayAsPtr = ::TypeCheck(arrayAsPtr);
            Assert(arrayAsPtr != NULL);
            arrayAsPtr = ::Optimize(arrayAsPtr);
@@ -5364,6 +5368,7 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {

 const Type *
 TypeCastExpr::GetType() const { 
+    Assert(type->HasUnboundVariability() == false);
    return type; 
 }

@@ -5373,7 +5378,7 @@ lDeconstifyType(const Type *t) {
    const PointerType *pt = dynamic_cast<const PointerType *>(t);
    if (pt != NULL)
        return new PointerType(lDeconstifyType(pt->GetBaseType()), 
-                               pt->IsUniformType(), false);
+                               pt->GetVariability(), false);
    else
        return t->GetAsNonConstType();
 }
@@ -5384,16 +5389,16 @@ TypeCastExpr::TypeCheck() {
    if (expr == NULL)
        return NULL;

-    const Type *toType = GetType(), *fromType = expr->GetType();
+    const Type *toType = type, *fromType = expr->GetType();
    if (toType == NULL || fromType == NULL)
        return NULL;

-    if (preserveUniformity == true && fromType->IsUniformType() &&
-        toType->IsVaryingType()) {
+    if (toType->HasUnboundVariability() && fromType->IsUniformType()) {
        TypeCastExpr *tce = new TypeCastExpr(toType->GetAsUniformType(),
-                                             expr, false, pos);
+                                             expr, pos);
        return ::TypeCheck(tce);
    }
+    type = toType = type->ResolveUnboundVariability(Type::Varying);

    fromType = lDeconstifyType(fromType);
    toType = lDeconstifyType(toType);
@@ -5862,6 +5867,8 @@ SizeOfExpr::SizeOfExpr(Expr *e, SourcePos p)

 SizeOfExpr::SizeOfExpr(const Type *t, SourcePos p)
    : Expr(p), expr(NULL), type(t) {
+    if (type->HasUnboundVariability())
+        type = type->ResolveUnboundVariability(Type::Varying);
 }


@@ -6026,7 +6033,8 @@ FunctionSymbolExpr::GetType() const {
        return NULL;
    }

-    return matchingFunc ? new PointerType(matchingFunc->type, true, true) : NULL;
+    return matchingFunc ? 
+        new PointerType(matchingFunc->type, Type::Uniform, true) : NULL;
 }


--- a/expr.h
+++ b/expr.h
@@ -314,7 +314,6 @@ public:
    std::string identifier;
    const SourcePos identifierPos;

-protected:
    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
               SourcePos identifierPos, bool derefLValue);

@@ -493,8 +492,7 @@ private:
    probably-different type. */
 class TypeCastExpr : public Expr {
 public:
-    TypeCastExpr(const Type *t, Expr *e, bool preserveUniformity,
-                 SourcePos p);
+    TypeCastExpr(const Type *t, Expr *e, SourcePos p);

    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
@@ -507,7 +505,6 @@ public:

    const Type *type;
    Expr *expr;
-    bool preserveUniformity;
 };


--- a/func.cpp
+++ b/func.cpp
@@ -290,8 +290,10 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
            llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on");
            llvm::BasicBlock *bbNotAll = ctx->CreateBasicBlock("not_all_on");

-            ctx->BranchInst(bbAllOn, bbNotAll, allOn);
+            // Set up basic blocks for goto targets
+            ctx->InitializeLabelMap(code);

+            ctx->BranchInst(bbAllOn, bbNotAll, allOn);
            // all on: we've determined dynamically that the mask is all
            // on.  Set the current mask to "all on" explicitly so that
            // codegen for this path can be improved with this knowledge in
@@ -322,14 +324,22 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
            // above
            ctx->SetCurrentBasicBlock(bbSomeOn);
            ctx->SetFunctionMask(mask);
+
+            // Set up basic blocks for goto targets again; we want to have
+            // one set of them for gotos in the 'all on' case, and a
+            // distinct set for the 'mixed mask' case.
+            ctx->InitializeLabelMap(code);
+
            code->EmitCode(ctx);
            if (ctx->GetCurrentBasicBlock())
                ctx->ReturnInst();
-
        }
-        else
+        else {
+            // Set up basic blocks for goto targets
+            ctx->InitializeLabelMap(code);
            // No check, just emit the code
            code->EmitCode(ctx);
+        }
    }

    if (ctx->GetCurrentBasicBlock()) {
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -210,7 +210,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->isa = Target::AVX2;
        t->nativeVectorWidth = 8;
        t->vectorWidth = 8;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
        t->maskingIsFree = false;
        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
@@ -219,7 +219,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->isa = Target::AVX2;
        t->nativeVectorWidth = 16;
        t->vectorWidth = 16;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
        t->maskingIsFree = false;
        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
@@ -358,10 +358,45 @@ Target::GetISAString() const {
 }


+static bool  // true if 'type' (or a type nested in it) has indeterminate layout
+lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
+    if (type->isPrimitiveType() || type->isIntegerTy())
+        return false;
+    // The bool-vector, mask and i1-vector types always count as indeterminate.
+    if (type == LLVMTypes::BoolVectorType ||
+        type == LLVMTypes::MaskType ||
+        type == LLVMTypes::Int1VectorType)
+        return true;
+    // An array is classified by its element type.
+    LLVM_TYPE_CONST llvm::ArrayType *at = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
+    if (at != NULL)
+        return lGenericTypeLayoutIndeterminate(at->getElementType());
+    // Pointers are never indeterminate, whatever they point to.
+    LLVM_TYPE_CONST llvm::PointerType *pt = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
+    if (pt != NULL)
+        return false;
+    // A struct is indeterminate as soon as any one of its members is.
+    LLVM_TYPE_CONST llvm::StructType *st =
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
+    if (st != NULL) {
+        for (int i = 0; i < (int)st->getNumElements(); ++i)
+            if (lGenericTypeLayoutIndeterminate(st->getElementType(i)))
+                return true;
+        return false;
+    }
+    // Anything left is asserted to be a vector type, which is indeterminate.
+    Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));
+    return true;
+}
+
+
 llvm::Value *
 Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type, 
               llvm::BasicBlock *insertAtEnd) {
-    if (isa == Target::GENERIC && type->isPrimitiveType() == false) {
+    if (isa == Target::GENERIC &&
+        lGenericTypeLayoutIndeterminate(type)) {
        llvm::Value *index[1] = { LLVMInt32(1) };
        LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
        llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
@@ -396,7 +431,8 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
 llvm::Value *
 Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
                     llvm::BasicBlock *insertAtEnd) {
-    if (isa == Target::GENERIC && type->isPrimitiveType() == false) {
+    if (isa == Target::GENERIC && 
+        lGenericTypeLayoutIndeterminate(type) == true) {
        llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(element) };
        LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
        llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
--- a/ispc.h
+++ b/ispc.h
@@ -98,6 +98,8 @@ namespace llvm {
 #endif

 class ArrayType;
+class AST;
+class ASTNode;
 class AtomicType;
 class FunctionEmitContext;
 class Expr;
@@ -421,6 +423,7 @@ enum {
    COST_FUNPTR_UNIFORM = 12,
    COST_FUNPTR_VARYING = 24,
    COST_GATHER = 8,
+    COST_GOTO = 4,
    COST_LOAD = 2,
    COST_REGULAR_BREAK_CONTINUE = 2,
    COST_RETURN = 4,
@@ -434,6 +437,8 @@ enum {
    COST_VARYING_IF = 3,
    COST_UNIFORM_LOOP = 4,
    COST_VARYING_LOOP = 6,
+    COST_UNIFORM_SWITCH = 4,
+    COST_VARYING_SWITCH = 12,
    COST_ASSERT = 8,

    CHECK_MASK_AT_FUNCTION_START_COST = 16,
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -18,8 +18,10 @@
    <ClCompile Include="decl.cpp" />
    <ClCompile Include="expr.cpp" />
    <ClCompile Include="func.cpp" />
-    <ClCompile Include="gen-bitcode-avx.cpp" />
-    <ClCompile Include="gen-bitcode-avx-x2.cpp" />
+    <ClCompile Include="gen-bitcode-avx1.cpp" />
+    <ClCompile Include="gen-bitcode-avx1-x2.cpp" />
+    <ClCompile Include="gen-bitcode-avx2.cpp" />
+    <ClCompile Include="gen-bitcode-avx2-x2.cpp" />
    <ClCompile Include="gen-bitcode-c-32.cpp" />
    <ClCompile Include="gen-bitcode-c-64.cpp" />
    <ClCompile Include="gen-bitcode-dispatch.cpp" />
@@ -158,29 +160,55 @@
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="builtins\target-avx.ll">
+    <CustomBuild Include="builtins\target-avx1.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll &gt; gen-bitcode-avx.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll &gt; gen-bitcode-avx.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll &gt; gen-bitcode-avx1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll &gt; gen-bitcode-avx1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="builtins\target-avx-x2.ll">
+    <CustomBuild Include="builtins\target-avx1-x2.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll &gt; gen-bitcode-avx1-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll &gt; gen-bitcode-avx1-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll &gt; gen-bitcode-avx2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll &gt; gen-bitcode-avx2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx2-x2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll &gt; gen-bitcode-avx2-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll &gt; gen-bitcode-avx2-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
--- a/lex.ll
+++ b/lex.ll
@@ -42,7 +42,7 @@
 #include <stdlib.h>
 #include <stdint.h>

-static uint64_t lParseBinary(const char *ptr, SourcePos pos);
+static uint64_t lParseBinary(const char *ptr, SourcePos pos, char **endPtr);
 static void lCComment(SourcePos *);
 static void lCppComment(SourcePos *);
 static void lHandleCppHash(SourcePos *);
@@ -67,7 +67,7 @@ inline int isatty(int) { return 0; }
 %option nounistd

 WHITESPACE [ \t\r]+
-INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))
+INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[kMG]?
 FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
 HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)

@@ -151,30 +151,44 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 {INT_NUMBER}+(u|U|l|L)*? { 
    int ls = 0, us = 0;

+    char *endPtr = NULL;
    if (yytext[0] == '0' && yytext[1] == 'b')
-        yylval->intVal = lParseBinary(yytext+2, *yylloc);
+        yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr);
    else {
-        char *endPtr = NULL;
-
-#ifdef ISPC_IS_WINDOWS
+#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
        yylval->intVal = _strtoi64(yytext, &endPtr, 0);
 #else
        // FIXME: should use strtouq and then issue an error if we can't
        // fit into 64 bits...
        yylval->intVal = strtoull(yytext, &endPtr, 0);
 #endif
-        for (; *endPtr; endPtr++) {
-           if (*endPtr == 'l' || *endPtr == 'L')
-                ls++;
-           else if (*endPtr == 'u' || *endPtr == 'U')
-                us++;
-        }
-        if (ls >= 2)
-            return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
-        else if (ls == 1)
-           return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
    }

+    bool kilo = false, mega = false, giga = false;
+    for (; *endPtr; endPtr++) {
+        if (*endPtr == 'k')
+            kilo = true;
+        else if (*endPtr == 'M')
+            mega = true;
+        else if (*endPtr == 'G')
+            giga = true;        
+        else if (*endPtr == 'l' || *endPtr == 'L')
+            ls++;
+        else if (*endPtr == 'u' || *endPtr == 'U')
+            us++;
+    }
+    if (kilo)
+        yylval->intVal *= 1024;
+    if (mega)
+        yylval->intVal *= 1024*1024;
+    if (giga)
+        yylval->intVal *= 1024*1024*1024;
+
+    if (ls >= 2)
+        return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
+    else if (ls == 1)
+        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
+
    // See if we can fit this into a 32-bit integer...
    if ((yylval->intVal & 0xffffffff) == yylval->intVal)
        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
@@ -268,14 +282,11 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 /** Return the integer version of a binary constant from a string.
 */
 static uint64_t
-lParseBinary(const char *ptr, SourcePos pos) {
+lParseBinary(const char *ptr, SourcePos pos, char **endPtr) {
    uint64_t val = 0;
    bool warned = false;

-    while (*ptr != '\0') {
-        /* if this hits, the regexp for 0b... constants is broken */
-        Assert(*ptr == '0' || *ptr == '1');
-
+    while (*ptr == '0' || *ptr == '1') {
        if ((val & (((int64_t)1)<<63)) && warned == false) {
            // We're about to shift out a set bit
            Warning(pos, "Can't represent binary constant with a 64-bit integer type");
@@ -285,6 +296,7 @@ lParseBinary(const char *ptr, SourcePos pos) {
        val = (val << 1) | (*ptr == '0' ? 0 : 1);
        ++ptr;
    }
+    *endPtr = (char *)ptr;
    return val;
 }

--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -36,7 +36,9 @@
 */

 #include "llvmutil.h"
+#include "ispc.h"
 #include "type.h"
+#include <llvm/Instructions.h>

 LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
 LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
@@ -109,7 +111,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
        LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
            llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth);
    else {
-        assert(target.maskBitCount == 32);
+        Assert(target.maskBitCount == 32);
        LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
            llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth);
    }
@@ -465,3 +467,239 @@ LLVMBoolVector(const bool *bvec) {
    }
    return llvm::ConstantVector::get(vals);
 }
+
+
+/** Conservative test to see if two llvm::Values are equal.  There are
+    (potentially many) cases where the two values actually are equal but
+    this will return false.  However, if it does return true, the two
+    values definitely are equal.
+
+    @todo This seems to catch all of the cases we currently need it for in
+    practice, but it'd be nice to make it a little more robust/general.  In
+    general, though, a little something called the halting problem means we
+    won't get all of them.
+*/
+static bool
+lValuesAreEqual(llvm::Value *v0, llvm::Value *v1, 
+                std::vector<llvm::PHINode *> &seenPhi0,
+                std::vector<llvm::PHINode *> &seenPhi1) {
+    // Thanks to the fact that LLVM hashes and returns the same pointer for
+    // constants (of all sorts, even constant expressions), this first test
+    // actually catches a lot of cases.  LLVM's SSA form also helps a lot
+    // with this..
+    if (v0 == v1)
+        return true;
+
+    Assert(seenPhi0.size() == seenPhi1.size());
+    for (unsigned int i = 0; i < seenPhi0.size(); ++i)
+        if (v0 == seenPhi0[i] && v1 == seenPhi1[i])
+            return true;
+
+    llvm::BinaryOperator *bo0 = llvm::dyn_cast<llvm::BinaryOperator>(v0);
+    llvm::BinaryOperator *bo1 = llvm::dyn_cast<llvm::BinaryOperator>(v1);
+    if (bo0 != NULL && bo1 != NULL) {
+        if (bo0->getOpcode() != bo1->getOpcode())
+            return false;
+        return (lValuesAreEqual(bo0->getOperand(0), bo1->getOperand(0),
+                                seenPhi0, seenPhi1) &&
+                lValuesAreEqual(bo0->getOperand(1), bo1->getOperand(1),
+                                seenPhi0, seenPhi1));
+    }
+
+    llvm::PHINode *phi0 = llvm::dyn_cast<llvm::PHINode>(v0);
+    llvm::PHINode *phi1 = llvm::dyn_cast<llvm::PHINode>(v1);
+    if (phi0 != NULL && phi1 != NULL) {
+        if (phi0->getNumIncomingValues() != phi1->getNumIncomingValues())
+            return false;
+
+        seenPhi0.push_back(phi0);
+        seenPhi1.push_back(phi1);
+
+        unsigned int numIncoming = phi0->getNumIncomingValues();
+        // Check all of the incoming values: if all of them are all equal,
+        // then we're good.
+        bool anyFailure = false;
+        for (unsigned int i = 0; i < numIncoming; ++i) {
+            Assert(phi0->getIncomingBlock(i) == phi1->getIncomingBlock(i));
+            if (!lValuesAreEqual(phi0->getIncomingValue(i), 
+                                 phi1->getIncomingValue(i), seenPhi0, seenPhi1)) {
+                anyFailure = true;
+                break;
+            }
+        }
+
+        seenPhi0.pop_back();
+        seenPhi1.pop_back();
+
+        return !anyFailure;
+    }
+
+    return false;
+}
+
+
+/** Given an llvm::Value known to be an integer, return its value as
+    an int64_t.
+*/
+static int64_t
+lGetIntValue(llvm::Value *offset) {
+    llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
+    Assert(intOffset && (intOffset->getBitWidth() == 32 ||
+                         intOffset->getBitWidth() == 64));
+    return intOffset->getSExtValue();
+}
+
+
+/** Flattens a chain of InsertElement instructions such as
+
+    %v0 = insertelement undef, value_0, i32 index_0
+    %v1 = insertelement %v0,   value_1, i32 index_1
+    ...
+    %vn = insertelement %vn-1, value_n, i32 index_n
+
+    so that elements[i] is the llvm::Value * in lane i of the final vector.
+    Lanes never written stay NULL for an undef base and take the base's
+    value for a ConstantVector base; elements needs vectorWidth entries.
+*/
+void
+LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
+                       llvm::Value **elements) {
+    for (int i = 0; i < vectorWidth; ++i)
+        elements[i] = NULL;
+
+    while (ie != NULL) {
+        int64_t iOffset = lGetIntValue(ie->getOperand(2));
+        Assert(iOffset >= 0 && iOffset < vectorWidth);
+        // Walking backwards from the last insert: the first value seen wins.
+        if (elements[iOffset] == NULL)
+            elements[iOffset] = ie->getOperand(1);
+        llvm::Value *insertBase = ie->getOperand(0);
+        ie = llvm::dyn_cast<llvm::InsertElementInst>(insertBase);
+        if (ie == NULL) {
+            if (llvm::isa<llvm::UndefValue>(insertBase))
+                return;
+            llvm::ConstantVector *cv =
+                llvm::dyn_cast<llvm::ConstantVector>(insertBase);
+            Assert(cv != NULL);
+            // Lanes no insert touched keep the constant base's values.
+            int nBase = (int)cv->getNumOperands();
+            for (int i = 0; i < vectorWidth && i < nBase; ++i)
+                if (elements[i] == NULL)
+                    elements[i] = cv->getOperand(i);
+        }
+    }
+}
+
+
+/** Tests to see if all of the elements of the vector in the 'v' parameter
+    are equal.  Like lValuesAreEqual(), this is a conservative test and may
+    return false for vectors whose elements are actually all equal.  */
+bool
+LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
+                         std::vector<llvm::PHINode *> &seenPhis) {
+    if (llvm::isa<llvm::ConstantAggregateZero>(v))
+        return true;
+
+    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v);
+    if (cv != NULL)
+        return (cv->getSplatValue() != NULL);
+
+    llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v);
+    if (bop != NULL)
+        return (LLVMVectorValuesAllEqual(bop->getOperand(0), vectorLength, 
+                                      seenPhis) &&
+                LLVMVectorValuesAllEqual(bop->getOperand(1), vectorLength, 
+                                      seenPhis));
+
+    llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(v);
+    if (cast != NULL)
+        return LLVMVectorValuesAllEqual(cast->getOperand(0), vectorLength, 
+                                     seenPhis);
+
+    llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(v);
+    if (ie != NULL) {
+        llvm::Value *elements[ISPC_MAX_NVEC];
+        LLVMFlattenInsertChain(ie, vectorLength, elements);
+
+        // We will ignore any values of elements[] that are NULL; as they
+        // correspond to undefined values--we just want to see if all of
+        // the defined values have the same value.
+        int lastNonNull = 0;
+        while (lastNonNull < vectorLength && elements[lastNonNull] == NULL)
+            ++lastNonNull;
+
+        if (lastNonNull == vectorLength)
+            // all of them are undef!
+            return true;
+
+        for (int i = lastNonNull; i < vectorLength; ++i) {
+            if (elements[i] == NULL)
+                continue;
+
+            std::vector<llvm::PHINode *> seenPhi0;
+            std::vector<llvm::PHINode *> seenPhi1;
+            if (lValuesAreEqual(elements[lastNonNull], elements[i], seenPhi0, 
+                                seenPhi1) == false)
+                return false;
+            lastNonNull = i;
+        }
+        return true;
+    }
+
+    llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v);
+    if (phi) {
+        for (unsigned int i = 0; i < seenPhis.size(); ++i)
+            if (seenPhis[i] == phi)
+                return true;
+
+        seenPhis.push_back(phi);
+
+        unsigned int numIncoming = phi->getNumIncomingValues();
+        // Check all of the incoming values: if all of them are all equal,
+        // then we're good.
+        for (unsigned int i = 0; i < numIncoming; ++i) {
+            if (!LLVMVectorValuesAllEqual(phi->getIncomingValue(i), vectorLength,
+                                       seenPhis)) {
+                seenPhis.pop_back();
+                return false;
+            }
+        }
+
+        seenPhis.pop_back();
+        return true;
+    }
+
+    Assert(!llvm::isa<llvm::Constant>(v));
+
+    if (llvm::isa<llvm::CallInst>(v) || llvm::isa<llvm::LoadInst>(v) ||
+        !llvm::isa<llvm::Instruction>(v))
+        return false;
+
+    llvm::ShuffleVectorInst *shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(v);
+    if (shuffle != NULL) {
+        llvm::Value *indices = shuffle->getOperand(2);
+        if (LLVMVectorValuesAllEqual(indices, vectorLength, seenPhis))
+            // The easy case--just a smear of the same element across the
+            // whole vector.
+            return true;
+
+        // TODO: handle more general cases?
+        return false;
+    }
+
+#if 0
+    fprintf(stderr, "all equal: ");
+    v->dump();
+    fprintf(stderr, "\n");
+    llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(v);
+    if (inst) {
+        inst->getParent()->dump();
+        fprintf(stderr, "\n");
+        fprintf(stderr, "\n");
+    }
+#endif
+
+    return false;
+}
+
+
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -38,12 +38,23 @@
 #ifndef ISPC_LLVMUTIL_H
 #define ISPC_LLVMUTIL_H 1

-#include "ispc.h"
 #include <llvm/LLVMContext.h>
 #include <llvm/Type.h>
 #include <llvm/DerivedTypes.h>
 #include <llvm/Constants.h>

+namespace llvm {
+    class PHINode;
+    class InsertElementInst;
+}
+
+// llvm::Type *s are no longer const in llvm 3.0
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#define LLVM_TYPE_CONST
+#else
+#define LLVM_TYPE_CONST const
+#endif
+

 /** This structure holds pointers to a variety of LLVM types; code
    elsewhere can use them from here, ratherthan needing to make more
@@ -99,6 +110,7 @@ extern llvm::Constant *LLVMTrue, *LLVMFalse;
    of LLVMTypes and the LLVMTrue/LLVMFalse constants.  However, it can't
    be called until the compilation target is known.
 */
+struct Target;
 extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);

 /** Returns an LLVM i8 constant of the given value */
@@ -205,4 +217,13 @@ extern llvm::Constant *LLVMMaskAllOn;
 /** LLVM constant value representing an 'all off' SIMD lane mask */
 extern llvm::Constant *LLVMMaskAllOff;

+/** Tests to see if all of the elements of the vector in the 'v' parameter
+    are equal.  Like lValuesAreEqual(), this is a conservative test and may
+    return false for vectors whose elements are actually all equal.  */
+extern bool LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
+                                     std::vector<llvm::PHINode *> &seenPhis);
+
+void LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
+                            llvm::Value **elements);
+
 #endif // ISPC_LLVMUTIL_H
--- a/main.cpp
+++ b/main.cpp
@@ -38,6 +38,7 @@
 #include "ispc.h"
 #include "module.h"
 #include "util.h"
+#include "type.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <llvm/Support/PrettyStackTrace.h>
@@ -53,14 +54,33 @@

 #ifdef ISPC_IS_WINDOWS
 #define strcasecmp stricmp
+#ifndef BUILD_DATE
 #define BUILD_DATE __DATE__
+#endif
 #define BUILD_VERSION ""
 #endif // ISPC_IS_WINDOWS

-static void usage(int ret) {
-    printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", 
-           BUILD_DATE, BUILD_VERSION);
-    printf("usage: ispc\n");
+static void
+lPrintVersion() {
+    printf("Intel(r) SPMD Program Compiler (ispc), build %s (%s, LLVM %s)\n", 
+           BUILD_DATE, BUILD_VERSION,
+#ifdef LLVM_2_9
+           "2.9"
+#elif defined(LLVM_3_0) || defined(LLVM_3_0svn)
+           "3.0"
+#elif defined(LLVM_3_1) || defined(LLVM_3_1svn)
+           "3.1"
+#else
+#error "Unhandled LLVM version"
+#endif 
+           );
+}
+
+
+static void
+usage(int ret) {
+    lPrintVersion();
+    printf("\nusage: ispc\n");
    printf("    [--addressing={32,64}]\t\tSelect 32- or 64-bit addressing. (Note that 32-bit\n");
    printf("                          \t\taddressing calculations are done by default, even\n");
    printf("                          \t\ton 64-bit target architectures.)\n");
@@ -188,6 +208,8 @@ int main(int Argc, char *Argv[]) {
    LLVMInitializeX86TargetMC();
 #endif

+    AtomicType::Init();
+
    char *file = NULL;
    const char *headerFileName = NULL;
    const char *outFileName = NULL;
@@ -362,8 +384,7 @@ int main(int Argc, char *Argv[]) {
            generatePIC = true;
 #endif // !ISPC_IS_WINDOWS
        else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
-            printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n", 
-                   BUILD_DATE, BUILD_VERSION);
+            lPrintVersion();
            return 0;
        }
        else if (argv[i][0] == '-') {
--- a/opt.cpp
+++ b/opt.cpp
--- a/parse.yy
+++ b/parse.yy
@@ -224,7 +224,7 @@ struct ForeachDimension {
 %type <declSpecs> declaration_specifiers 

 %type <stringVal> string_constant
-%type <constCharPtr> struct_or_union_name enum_identifier
+%type <constCharPtr> struct_or_union_name enum_identifier goto_identifier
 %type <intVal> int_constant soa_width_specifier

 %type <foreachDimension> foreach_dimension_specifier
@@ -362,13 +362,7 @@ cast_expression
    : unary_expression
    | '(' type_name ')' cast_expression
      {
-          // Pass true here to try to preserve uniformity 
-          // so that things like:
-          // uniform int y = ...;
-          // uniform float x = 1. / (float)y;
-          // don't issue an error due to (float)y being inadvertently
-          // and undesirably-to-the-user "varying"...
-          $$ = new TypeCastExpr($2, $4, true, Union(@1,@4)); 
+          $$ = new TypeCastExpr($2, $4, Union(@1,@4)); 
      }
    ;

@@ -500,6 +494,7 @@ declaration_statement
            $$ = NULL;
        }
        else {
+            $1->DeclareFunctions();
            std::vector<VariableDeclaration> vars = $1->GetVariableDeclarations();
            $$ = new DeclStmt(vars, @1);
        }
@@ -638,13 +633,13 @@ type_specifier

 atomic_var_type_specifier
    : TOKEN_VOID { $$ = AtomicType::Void; }
-    | TOKEN_BOOL { $$ = AtomicType::VaryingBool; }
-    | TOKEN_INT8 { $$ = AtomicType::VaryingInt8; }
-    | TOKEN_INT16 { $$ = AtomicType::VaryingInt16; }
-    | TOKEN_INT { $$ = AtomicType::VaryingInt32; }
-    | TOKEN_FLOAT { $$ = AtomicType::VaryingFloat; }
-    | TOKEN_DOUBLE { $$ = AtomicType::VaryingDouble; }
-    | TOKEN_INT64 { $$ = AtomicType::VaryingInt64; }
+    | TOKEN_BOOL { $$ = AtomicType::UnboundBool; }
+    | TOKEN_INT8 { $$ = AtomicType::UnboundInt8; }
+    | TOKEN_INT16 { $$ = AtomicType::UnboundInt16; }
+    | TOKEN_INT { $$ = AtomicType::UnboundInt32; }
+    | TOKEN_FLOAT { $$ = AtomicType::UnboundFloat; }
+    | TOKEN_DOUBLE { $$ = AtomicType::UnboundDouble; }
+    | TOKEN_INT64 { $$ = AtomicType::UnboundInt64; }
    ;

 short_vec_specifier
@@ -670,7 +665,7 @@ struct_or_union_specifier
          GetStructTypesNamesPositions(*$4, &elementTypes, &elementNames,
                                       &elementPositions);
          StructType *st = new StructType($2, elementTypes, elementNames,
-                                          elementPositions, false, true, @2);
+                                          elementPositions, false, Type::Unbound, @2);
          m->symbolTable->AddType($2, st, @2);
          $$ = st;
      }
@@ -681,8 +676,9 @@ struct_or_union_specifier
          std::vector<SourcePos> elementPositions;
          GetStructTypesNamesPositions(*$3, &elementTypes, &elementNames,
                                       &elementPositions);
+          // FIXME: should be unbound
          $$ = new StructType("", elementTypes, elementNames, elementPositions,
-                              false, true, @1);
+                              false, Type::Unbound, @1);
      }
    | struct_or_union '{' '}' 
      {
@@ -748,7 +744,7 @@ specifier_qualifier_list
            else if ($1 == TYPEQUAL_SIGNED) {
                if ($2->IsIntType() == false) {
                    Error(@1, "Can't apply \"signed\" qualifier to \"%s\" type.",
-                          $2->GetString().c_str());
+                          $2->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
                    $$ = $2;
                }
            }
@@ -758,7 +754,7 @@ specifier_qualifier_list
                    $$ = t;
                else {
                    Error(@1, "Can't apply \"unsigned\" qualifier to \"%s\" type. Ignoring.",
-                          $2->GetString().c_str());
+                          $2->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
                    $$ = $2;
                }
            } 
@@ -775,8 +771,11 @@ specifier_qualifier_list
            else
                FATAL("Unhandled type qualifier in parser.");
        }
-        else
+        else {
+            if (m->errorCount == 0)
+                Error(@1, "Lost type qualifier in parser.");  
            $$ = NULL;
+        }
    }
    ;

@@ -1112,8 +1111,7 @@ type_name
 abstract_declarator
    : pointer
      {
-          Declarator *d = new Declarator(DK_POINTER, @1);
-          $$ = d;
+          $$ = $1;
      }
    | direct_abstract_declarator
    | pointer direct_abstract_declarator
@@ -1262,10 +1260,22 @@ statement
    ;

 labeled_statement
-    : TOKEN_CASE constant_expression ':' statement
-      { UNIMPLEMENTED; }
+    : goto_identifier ':' statement
+    {
+        $$ = new LabeledStmt($1, $3, @1);
+    }
+    | TOKEN_CASE constant_expression ':' statement
+      { 
+          int value;
+          if ($2 != NULL && 
+              lGetConstantInt($2, &value, @2, "Case statement value")) {
+              $$ = new CaseStmt(value, $4, Union(@1, @2));
+          }
+          else
+              $$ = NULL;
+      }
    | TOKEN_DEFAULT ':' statement
-      { UNIMPLEMENTED; }
+      { $$ = new DefaultStmt($3, @1); }
    ;

 start_scope
@@ -1311,7 +1321,7 @@ selection_statement
    | TOKEN_CIF '(' expression ')' statement TOKEN_ELSE statement
      { $$ = new IfStmt($3, $5, $7, true, @1); }
    | TOKEN_SWITCH '(' expression ')' statement
-      { UNIMPLEMENTED; }
+      { $$ = new SwitchStmt($3, $5, @1); }
    ;

 for_test
@@ -1433,9 +1443,13 @@ iteration_statement
     }
    ;

+goto_identifier
+    : TOKEN_IDENTIFIER { $$ = yylval.stringVal->c_str(); }
+    ;
+
 jump_statement
-    : TOKEN_GOTO TOKEN_IDENTIFIER ';'
-      { UNIMPLEMENTED; }
+    : TOKEN_GOTO goto_identifier ';'
+      { $$ = new GotoStmt($2, @1, @2); }
    | TOKEN_CONTINUE ';'
      { $$ = new ContinueStmt(false, @1); }
    | TOKEN_BREAK ';'
@@ -1551,19 +1565,21 @@ lAddDeclaration(DeclSpecs *ds, Declarator *decl) {
        const Type *t = decl->GetType(ds);
        if (t == NULL)
            return;
+
+        Symbol *sym = decl->GetSymbol();
+        Assert(sym != NULL);
        const FunctionType *ft = dynamic_cast<const FunctionType *>(t);
        if (ft != NULL) {
-            Symbol *funSym = decl->GetSymbol();
-            Assert(funSym != NULL);
-            funSym->type = ft;
-            funSym->storageClass = ds->storageClass;
-
+            sym->type = ft;
+            sym->storageClass = ds->storageClass;
            bool isInline = (ds->typeQualifiers & TYPEQUAL_INLINE);
-            m->AddFunctionDeclaration(funSym, isInline);
+            m->AddFunctionDeclaration(sym, isInline);
+        }
+        else {
+            sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
+            bool isConst = (ds->typeQualifiers & TYPEQUAL_CONST) != 0;
+            m->AddGlobalVariable(sym, decl->initExpr, isConst);
        }
-        else
-            m->AddGlobalVariable(decl->GetSymbol(), decl->initExpr,
-                                 (ds->typeQualifiers & TYPEQUAL_CONST) != 0);
    }
 }

@@ -1589,6 +1605,7 @@ lAddFunctionParams(Declarator *decl) {
            continue;
        Assert(pdecl->declarators.size() == 1);
        Symbol *sym = pdecl->declarators[0]->GetSymbol();
+        sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
 #ifndef NDEBUG
        bool ok = m->symbolTable->AddVariable(sym);
        if (ok == false)
@@ -1754,7 +1771,7 @@ lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
               the actual enum type here and optimize it, which will have
               us end up with a ConstExpr with the desired EnumType... */
            Expr *castExpr = new TypeCastExpr(enumType, enums[i]->constValue,
-                                              false, enums[i]->pos);
+                                              enums[i]->pos);
            castExpr = Optimize(castExpr);
            enums[i]->constValue = dynamic_cast<ConstExpr *>(castExpr);
            Assert(enums[i]->constValue != NULL);
--- a/run_tests.py
+++ b/run_tests.py
@@ -15,6 +15,7 @@ import string
 import subprocess
 import shlex
 import platform
+import tempfile

 # This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard
 # git history has a workaround for that issue.
@@ -79,7 +80,12 @@ if len(args) == 0:
    files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc") + \
        glob.glob("tests_errors/*ispc")
 else:
-    files = args
+    files = [ ]  # keep only the command-line arguments that name .ispc files
+    for f in args:
+        if os.path.splitext(f)[1].lower() != ".ispc":  # case-insensitive match
+            print "Ignoring file %s, which doesn't have an .ispc extension." % f
+        else:
+            files.append(f)

 # randomly shuffle the tests if asked to do so
 if (options.random):
@@ -146,16 +152,22 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure):


 def run_test(filename):
+    global is_windows
+    if is_windows:
+        input_prefix = "../"
+    else:
+        input_prefix = ""
+        
    # is this a test to make sure an error is issued?
    want_error = (filename.find("tests_errors") != -1)
    if want_error == True:
        ispc_cmd = ispc_exe + " --werror --nowrap %s --arch=%s --target=%s" % \
-            (filename, options.arch, options.target)
+            (input_prefix + filename, options.arch, options.target)
        (return_code, output) = run_command(ispc_cmd)
        got_error = (return_code != 0)

        # figure out the error message we're expecting
-        file = open(filename, 'r')
+        file = open(input_prefix + filename, 'r')
        firstline = file.readline()
        firstline = firstline.replace("//", "")
        firstline = firstline.lstrip()
@@ -179,7 +191,7 @@ def run_test(filename):
        # function that this test has.
        sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3, 
                    "f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }
-        file = open(filename, 'r')
+        file = open(input_prefix + filename, 'r')
        match = -1
        for line in file:
            # look for lines with 'export'...
@@ -201,14 +213,13 @@ def run_test(filename):
            if is_generic_target:
                obj_name = "%s.cpp" % filename

-            global is_windows
            if is_windows:
                if not is_generic_target:
-                    obj_name = "%s.obj" % filename
-                exe_name = "%s.exe" % filename
+                    obj_name = "%s%s.obj" % (input_prefix, filename)
+                exe_name = "%s%s.exe" % (input_prefix, filename)

-                cc_cmd = "%s /I. /Iwinstuff /Zi /nologo /DTEST_SIG=%d test_static.cpp %s /Fe%s" % \
-                         (options.compiler_exe, match, obj_name, exe_name)
+                cc_cmd = "%s /I. /Iwinstuff /Zi /nologo /DTEST_SIG=%d %stest_static.cpp %s /Fe%s" % \
+                         (options.compiler_exe, match, input_prefix, obj_name, exe_name)
                if should_fail:
                    cc_cmd += " /DEXPECT_FAILURE"
            else:
@@ -220,7 +231,7 @@ def run_test(filename):
                    gcc_arch = '-m32'
                else:
                    gcc_arch = '-m64'
-                cc_cmd = "%s -msse4.2 -I. %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \
+                cc_cmd = "%s -O2 -msse4.2 -I. %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \
                         (options.compiler_exe, gcc_arch, match, obj_name, exe_name)
                if platform.system() == 'Darwin':
                    cc_cmd += ' -Wl,-no_pie'
@@ -228,7 +239,7 @@ def run_test(filename):
                    cc_cmd += " -DEXPECT_FAILURE"

            ispc_cmd = ispc_exe + " --woff %s -o %s --arch=%s --target=%s" % \
-                       (filename, obj_name, options.arch, options.target)
+                       (input_prefix+filename, obj_name, options.arch, options.target)
            if options.no_opt:
                ispc_cmd += " -O0" 
            if is_generic_target:
@@ -257,12 +268,28 @@ def run_test(filename):
 # this function will be running in parallel across all of the CPU cores of
 # the system.
 def run_tasks_from_queue(queue, queue_ret):
+    if is_windows:
+        tmpdir = "tmp%d" % os.getpid()
+        os.mkdir(tmpdir)
+        os.chdir(tmpdir)
+    else:
+        olddir = ""
+        
    compile_error_files = [ ]
    run_error_files = [ ]
    while True:
        filename = queue.get()
        if (filename == 'STOP'):
            queue_ret.put((compile_error_files, run_error_files))
+            if is_windows:
+                try:  # best-effort cleanup of this worker's scratch directory
+                    os.remove("test_static.obj")
+                    os.remove("vc100.pdb")  # relative path; presumably cl /Zi wrote it to cwd
+                    os.chdir("..")
+                    os.rmdir(tmpdir)  # raises if anything else was left behind
+                except OSError:
+                    pass
+                
            sys.exit(0)

        (compile_error, run_error) = run_test(filename)
@@ -286,61 +313,38 @@ if __name__ == '__main__':

    compile_error_files = [ ]
    run_error_files = [ ]
-    if is_windows:
-        # cl.exe gets itself all confused if we have multiple instances of
-        # it running concurrently and operating on the same .cpp file
-        # (test_static.cpp), even if we are generating a differently-named
-        # exe in the end.  So run serially. :-(
-        nthreads = 1
-        num_done = 0
-        sys.stdout.write("Running %d tests.\n" % (total_tests))
-        for fn in files:
-            fn = fn.replace("\\",'/')
-            (compile_error, run_error) = run_test(fn)
-            if compile_error != 0:
-                compile_error_files += [ fn ]
-            if run_error != 0:
-                run_error_files += [ fn ]
-            num_done += 1
-            progress_str = " Done %d / %d [%s]\n" % (num_done, total_tests, fn)
-            # spaces to clear out detrius from previous printing...
-            for x in range(30):
-                progress_str += ' '
-            progress_str += '\r'
-            sys.stdout.write(progress_str)
-            sys.stdout.flush()
-    else:
-        nthreads = multiprocessing.cpu_count()
-        sys.stdout.write("Found %d CPUs. Running %d tests.\n" % (nthreads, total_tests))

-        # put each of the test filenames into a queue
-        q = multiprocessing.Queue()
-        for fn in files:
-            q.put(fn)
-        for x in range(nthreads):
-            q.put('STOP')
-        qret = multiprocessing.Queue()
+    nthreads = multiprocessing.cpu_count()
+    print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)

-        # need to catch sigint so that we can terminate all of the tasks if
-        # we're interrupted
-        signal.signal(signal.SIGINT, sigint)
+    # put each of the test filenames into a queue
+    q = multiprocessing.Queue()
+    for fn in files:
+        q.put(fn)
+    for x in range(nthreads):
+        q.put('STOP')
+    qret = multiprocessing.Queue()

-        # launch jobs to run tests
-        for x in range(nthreads):
-            t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,qret))
-            task_threads.append(t)
-            t.start()
+    # need to catch sigint so that we can terminate all of the tasks if
+    # we're interrupted
+    signal.signal(signal.SIGINT, sigint)

-        # wait for them to all finish and then return the number that failed
-        # (i.e. return 0 if all is ok)
-        for t in task_threads:
-            t.join()
-        sys.stdout.write("\n")
+    # launch jobs to run tests
+    for x in range(nthreads):
+        t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,qret))
+        task_threads.append(t)
+        t.start()

-        while not qret.empty():
-            (c, r) = qret.get()
-            compile_error_files += c
-            run_error_files += r
+    # wait for them to all finish and then return the number that failed
+    # (i.e. return 0 if all is ok)
+    for t in task_threads:
+        t.join()
+    print
+
+    while not qret.empty():
+        (c, r) = qret.get()
+        compile_error_files += c
+        run_error_files += r

    if len(compile_error_files) > 0:
        compile_error_files.sort()
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -787,10 +787,14 @@ packed_store_active(uniform int * uniform a, int vals) {
 ///////////////////////////////////////////////////////////////////////////
 // System information

-static inline int num_cores() {
+static inline uniform int num_cores() {
    return __num_cores();
 }

+static inline uniform int64 clock() {
+    return __clock();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // Atomics and memory barriers

@@ -808,8 +812,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) {      \
    memory_barrier();                                                   \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
-                                                            (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \
@@ -824,22 +827,80 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
            continue;                                                   \
        uniform TA * uniform p = ptrArray[i];                           \
        uniform TA v = extract(value, i);                               \
-        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v,     \
-                                                      (MASKTYPE)__mask); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \

-#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE)                \
+#define DEFINE_ATOMIC_SWAP(TA,TB)                                       \
+static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
+    memory_barrier();                                                   \
+    uniform int i = 0;                                                  \
+    TA ret[programCount]; /* ret[k]: result for program instance k */   \
+    TA memVal;                                                          \
+    uniform int lastSwap = -1; /* -1: no running instance found yet */  \
+    uniform int mask = lanemask();                                      \
+    /* First, have the first running program instance (if any) perform  \
+       the swap with memory with its value of "value"; record the       \
+       value returned. */                                               \
+    for (; i < programCount; ++i) {                                     \
+        if ((mask & (1 << i)) == 0)                                     \
+            continue;                                                   \
+        memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
+        lastSwap = i;                                                   \
+        break;                                                          \
+    }                                                                   \
+    /* Now, for all of the remaining running program instances, set the \
+       return value of the last instance that did a swap with this      \
+       instance's value of "value"; this gives the same effect as if the \
+       current instance had executed a hardware atomic swap right before \
+       the last one that did a swap. */                                 \
+    for (; i < programCount; ++i) {                                     \
+        if ((mask & (1 << i)) == 0)                                     \
+            continue;                                                   \
+        ret[lastSwap] = extract(value, i);                              \
+        lastSwap = i;                                                   \
+    }                                                                   \
+    /* And the last instance that wanted to swap gets the value we      \
+       originally got back from memory (if any instance ran). */        \
+    if (lastSwap >= 0) ret[lastSwap] = memVal;                          \
+    memory_barrier();                                                   \
+    return ret[programIndex];                                           \
+}                                                                       \
+static inline uniform TA atomic_swap_global(uniform TA * uniform ptr,   \
+                                            uniform TA value) {         \
+    memory_barrier();                                                   \
+    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value);   \
+    memory_barrier();                                                   \
+    return ret;                                                         \
+}                                                                       \
+static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
+    uniform TA * uniform ptrArray[programCount];                        \
+    ptrArray[programIndex] = ptr;                                       \
+    memory_barrier();                                                   \
+    TA ret;                                                             \
+    uniform int mask = lanemask();                                      \
+    for (uniform int i = 0; i < programCount; ++i) {                    \
+        if ((mask & (1 << i)) == 0)                                     \
+            continue;                                                   \
+        uniform TA * uniform p = ptrArray[i];                           \
+        uniform TA v = extract(value, i);                               \
+        uniform TA r = __atomic_swap_uniform_##TB##_global(p, v);       \
+        ret = insert(ret, i, r);                                        \
+    }                                                                   \
+    memory_barrier();                                                   \
+    return ret;                                                         \
+}                                                                       \
+
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB)                          \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value);                            \
    TA ret;                                                             \
    if (lanemask() != 0) {                                              \
        memory_barrier();                                               \
-        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval,       \
-                                                     (MASKTYPE)__mask); \
+        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval);      \
        memory_barrier();                                               \
    }                                                                   \
    return ret;                                                         \
@@ -847,8 +908,7 @@ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) {      \
    memory_barrier();                                                   \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
-                                                            (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \
@@ -864,8 +924,7 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
            continue;                                                   \
        uniform TA * uniform p = ptrArray[i];                           \
        uniform TA v = extract(value, i);                               \
-        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v,     \
-                                                      (MASKTYPE)__mask); \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
    memory_barrier();                                                   \
@@ -874,49 +933,51 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \

 DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
 DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
 DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
 DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
 DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(int32,int32)

 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
 DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
 DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,UIntMaskType)
+DEFINE_ATOMIC_SWAP(unsigned int32,int32)

-DEFINE_ATOMIC_OP(float,float,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(float,float)

 DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
 DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
 DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
 DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
 DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(int64,int64)

 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
 DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
 DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
 DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,UIntMaskType)
+DEFINE_ATOMIC_SWAP(unsigned int64,int64)

-DEFINE_ATOMIC_OP(double,double,swap,swap,IntMaskType)
+DEFINE_ATOMIC_SWAP(double,double)

 #undef DEFINE_ATOMIC_OP
+#undef DEFINE_ATOMIC_MINMAX_OP
+#undef DEFINE_ATOMIC_SWAP

 #define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE)                           \
 static inline TA atomic_compare_exchange_global(                           \
@@ -931,8 +992,7 @@ static inline uniform TA atomic_compare_exchange_global(               \
         uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
    memory_barrier();                                                      \
    uniform TA ret =                                                    \
-        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
-                                                        (MASKTYPE)__mask); \
+        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }
@@ -2764,114 +2824,124 @@ static inline uniform double pow(uniform double a, uniform double b) {
 // half-precision floats

 static inline uniform float half_to_float(uniform unsigned int16 h) {
-    if ((h & 0x7FFFu) == 0) 
-        // Signed zero
-        return floatbits(((unsigned int32) h) << 16);
+    if (__have_native_half) {
+        return __half_to_float_uniform(h);
+    }
    else {
-        // Though these are int16 quantities, we get much better code 
-        // with them stored as int32s...
-        uniform unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
-        uniform unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
-        uniform unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
-        if (he == 0) {  
-            // Denormal will convert to normalized
-            uniform int e = -1;
-            // The following loop figures out how much extra to adjust the exponent
-            // Shift until leading bit overflows into exponent bit
-            do {
-                e++;
-                hm <<= 1;
-            } while((hm & 0x0400u) == 0);
-
-            // Sign bit
-            uniform unsigned int32 xs = ((unsigned int32) hs) << 16; 
-            // Exponent: unbias the halfp, then bias the single
-            uniform int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
-            // Exponent
-            uniform unsigned int32 xe = (unsigned int32) (xes << 23); 
-            // Mantissa
-            uniform unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13; 
-            return floatbits(xs | xe | xm);
-        } 
+        if ((h & 0x7FFFu) == 0) 
+            // Signed zero
+            return floatbits(((unsigned int32) h) << 16);
        else {
-            if (he == 0x7C00u) {  
-                // Inf or NaN (all the exponent bits are set)
-                if (hm == 0)
-                    // Zero mantissa -> signed inf
-                    return floatbits((((unsigned int32) hs) << 16) | 
-                                     ((unsigned int32) 0x7F800000u));
-                else
-                    // NaN
-                    return floatbits(0xFFC00000u);
-            }
-            else { 
-                // Normalized number
-                // sign
+            // Though these are int16 quantities, we get much better code 
+            // with them stored as int32s...
+            uniform unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
+            uniform unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
+            uniform unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
+            if (he == 0) {  
+                // Denormal will convert to normalized
+                uniform int e = -1;
+                // The following loop figures out how much extra to adjust the exponent
+                // Shift until leading bit overflows into exponent bit
+                do {
+                    e++;
+                    hm <<= 1;
+                } while((hm & 0x0400u) == 0);
+
+                // Sign bit
                uniform unsigned int32 xs = ((unsigned int32) hs) << 16; 
                // Exponent: unbias the halfp, then bias the single
-                uniform int32 xes = ((int32) (he >> 10)) - 15 + 127; 
+                uniform int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
                // Exponent
-                uniform unsigned int32 xe = (unsigned int32) (xes << 23);
+                uniform unsigned int32 xe = (unsigned int32) (xes << 23); 
                // Mantissa
-                uniform unsigned int32 xm = ((unsigned int32) hm) << 13; 
+                uniform unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13; 
                return floatbits(xs | xe | xm);
+            } 
+            else {
+                if (he == 0x7C00u) {  
+                    // Inf or NaN (all the exponent bits are set)
+                    if (hm == 0)
+                        // Zero mantissa -> signed inf
+                        return floatbits((((unsigned int32) hs) << 16) | 
+                                         ((unsigned int32) 0x7F800000u));
+                    else
+                        // NaN
+                        return floatbits(0xFFC00000u);
+                }
+                else { 
+                    // Normalized number
+                    // sign
+                    uniform unsigned int32 xs = ((unsigned int32) hs) << 16; 
+                    // Exponent: unbias the halfp, then bias the single
+                    uniform int32 xes = ((int32) (he >> 10)) - 15 + 127; 
+                    // Exponent
+                    uniform unsigned int32 xe = (unsigned int32) (xes << 23);
+                    // Mantissa
+                    uniform unsigned int32 xm = ((unsigned int32) hm) << 13; 
+                    return floatbits(xs | xe | xm);
+                }
            }
        }
    }
 }

 static inline float half_to_float(unsigned int16 h) {
-    if ((h & 0x7FFFu) == 0) 
-        // Signed zero
-        return floatbits(((unsigned int32) h) << 16);
+    if (__have_native_half) {
+        return __half_to_float_varying(h);
+    }
    else {
-        // Though these are int16 quantities, we get much better code 
-        // with them stored as int32s...
-        unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
-        unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
-        unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
-        cif (he == 0) {  
-            // Denormal will convert to normalized
-            int e = -1;
-            // The following loop figures out how much extra to adjust the exponent
-            // Shift until leading bit overflows into exponent bit
-            do {
-                e++;
-                hm <<= 1;
-            } while((hm & 0x0400u) == 0);
-
-            // Sign bit
-            unsigned int32 xs = ((unsigned int32) hs) << 16; 
-            // Exponent: unbias the halfp, then bias the single
-            int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
-            // Exponent
-            unsigned int32 xe = (unsigned int32) (xes << 23); 
-            // Mantissa
-            unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13; 
-            return floatbits(xs | xe | xm);
-        } 
+        if ((h & 0x7FFFu) == 0) 
+            // Signed zero
+            return floatbits(((unsigned int32) h) << 16);
        else {
-            if (he == 0x7C00u) {  
-                // Inf or NaN (all the exponent bits are set)
-                if (hm == 0)
-                    // Zero mantissa -> signed inf
-                    return floatbits((((unsigned int32) hs) << 16) | 
-                                     ((unsigned int32) 0x7F800000u));
-                else
-                    // NaN
-                    return floatbits(0xFFC00000u);
-            }
-            else { 
-                // Normalized number
-                // sign
+            // Though these are int16 quantities, we get much better code 
+            // with them stored as int32s...
+            unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
+            unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
+            unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
+            cif (he == 0) {  
+                // Denormal will convert to normalized
+                int e = -1;
+                // The following loop figures out how much extra to adjust the exponent
+                // Shift until leading bit overflows into exponent bit
+                do {
+                    e++;
+                    hm <<= 1;
+                } while((hm & 0x0400u) == 0);
+
+                // Sign bit
                unsigned int32 xs = ((unsigned int32) hs) << 16; 
                // Exponent: unbias the halfp, then bias the single
-                int32 xes = ((int32) (he >> 10)) - 15 + 127; 
+                int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
                // Exponent
-                unsigned int32 xe = (unsigned int32) (xes << 23);
+                unsigned int32 xe = (unsigned int32) (xes << 23); 
                // Mantissa
-                unsigned int32 xm = ((unsigned int32) hm) << 13; 
+                unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13; 
                return floatbits(xs | xe | xm);
+            } 
+            else {
+                if (he == 0x7C00u) {  
+                    // Inf or NaN (all the exponent bits are set)
+                    if (hm == 0)
+                        // Zero mantissa -> signed inf
+                        return floatbits((((unsigned int32) hs) << 16) | 
+                                         ((unsigned int32) 0x7F800000u));
+                    else
+                        // NaN
+                        return floatbits(0xFFC00000u);
+                }
+                else { 
+                    // Normalized number
+                    // sign
+                    unsigned int32 xs = ((unsigned int32) hs) << 16; 
+                    // Exponent: unbias the halfp, then bias the single
+                    int32 xes = ((int32) (he >> 10)) - 15 + 127; 
+                    // Exponent
+                    unsigned int32 xe = (unsigned int32) (xes << 23);
+                    // Mantissa
+                    unsigned int32 xm = ((unsigned int32) hm) << 13; 
+                    return floatbits(xs | xe | xm);
+                }
            }
        }
    }
@@ -2879,209 +2949,237 @@ static inline float half_to_float(unsigned int16 h) {


 static inline uniform int16 float_to_half(uniform float f) {
-    uniform int32 x = intbits(f);
-    // Store the return value in an int32 until the very end; this ends up
-    // generating better code...
-    uniform int32 ret;
-    if ((x & 0x7FFFFFFFu) == 0)
-        // Signed zero
-        ret = (x >> 16); 
+    if (__have_native_half) {
+        return __float_to_half_uniform(f);
+    }
    else {
-        uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-        uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-        uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
-        if (xe == 0) {  
-            // Denormal will underflow, return a signed zero
-            ret = (xs >> 16);
-        } 
+        uniform int32 x = intbits(f);
+        // Store the return value in an int32 until the very end; this ends up
+        // generating better code...
+        uniform int32 ret;
+        if ((x & 0x7FFFFFFFu) == 0)
+            // Signed zero
+            ret = (x >> 16); 
        else {
-            if (xe == 0x7F800000u) {  
-                // Inf or NaN (all the exponent bits are set)
-                if (xm == 0)
-                    // Zero mantissa -> signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else
-                    // NaN, only 1st mantissa bit set
-                    ret = 0xFE00u;
-            }
-            else { 
-                // Normalized number
-                uniform unsigned int32 hs = (xs >> 16); // Sign bit
-                uniform unsigned int32 hm;
-                // Exponent unbias the single, then bias the halfp
-                uniform int32 hes = ((int)(xe >> 23)) - 127 + 15; 
-                if (hes >= 0x1F)  
-                    // Overflow: return signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else if (hes <= 0) {
-                    // Underflow
-                    if ((14 - hes) > 24) {  
-                        // Mantissa shifted all the way off & no rounding possibility
-                        hm = 0u;  // Set mantissa to zero
+            uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+            uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+            uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+            if (xe == 0) {  
+                // Denormal will underflow, return a signed zero
+                ret = (xs >> 16);
+            } 
+            else {
+                if (xe == 0x7F800000u) {  
+                    // Inf or NaN (all the exponent bits are set)
+                    if (xm == 0)
+                        // Zero mantissa -> signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else
+                        // NaN, only 1st mantissa bit set
+                        ret = 0xFE00u;
+                }
+                else { 
+                    // Normalized number
+                    uniform unsigned int32 hs = (xs >> 16); // Sign bit
+                    uniform unsigned int32 hm;
+                    // Exponent: unbias the single, then bias the halfp
+                    uniform int32 hes = ((int)(xe >> 23)) - 127 + 15; 
+                    if (hes >= 0x1F)  
+                        // Overflow: return signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else if (hes <= 0) {
+                        // Underflow
+                        if ((14 - hes) > 24) {  
+                            // Mantissa shifted all the way off & no rounding possibility
+                            hm = 0u;  // Set mantissa to zero
+                        } 
+                        else {
+                            xm |= 0x00800000u;  // Add the hidden leading bit
+                            hm = (xm >> (14 - hes)); // Mantissa
+                            if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
+                                // Round, might overflow into exp bit, but this is OK
+                                hm += 1u; 
+                        }
+                        ret = (hs | hm);
                    } 
                    else {
-                        xm |= 0x00800000u;  // Add the hidden leading bit
-                        hm = (xm >> (14 - hes)); // Mantissa
-                        if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
-                            // Round, might overflow into exp bit, but this is OK
-                            hm += 1u; 
+                        uniform unsigned int32 he = (hes << 10); // Exponent
+                        hm = (xm >> 13); // Mantissa
+                        if (xm & 0x00001000u) // Check for rounding
+                            // Round, might overflow to inf, this is OK
+                            ret = (hs | he | hm) + 1u; 
+                        else
+                            ret = (hs | he | hm);
                    }
-                    ret = (hs | hm);
-                } 
-                else {
-                    uniform unsigned int32 he = (hes << 10); // Exponent
-                    hm = (xm >> 13); // Mantissa
-                    if (xm & 0x00001000u) // Check for rounding
-                        // Round, might overflow to inf, this is OK
-                        ret = (hs | he | hm) + 1u; 
-                    else
-                        ret = (hs | he | hm);
                }
            }
        }
+        return (int16)ret;
    }
-    return (int16)ret;
 }


 static inline int16 float_to_half(float f) {
-    int32 x = intbits(f);
-    // Store the return value in an int32 until the very end; this ends up
-    // generating better code...
-    int32 ret;
-    if ((x & 0x7FFFFFFFu) == 0)
-        // Signed zero
-        ret = (x >> 16); 
+    if (__have_native_half) {
+        return __float_to_half_varying(f);
+    }
    else {
-        unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-        unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-        unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
-        if (xe == 0) {  
-            // Denormal will underflow, return a signed zero
-            ret = (xs >> 16);
-        } 
+        int32 x = intbits(f);
+        // Store the return value in an int32 until the very end; this ends up
+        // generating better code...
+        int32 ret;
+        if ((x & 0x7FFFFFFFu) == 0)
+            // Signed zero
+            ret = (x >> 16); 
        else {
-            cif (xe == 0x7F800000u) {  
-                // Inf or NaN (all the exponent bits are set)
-                if (xm == 0)
-                    // Zero mantissa -> signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else
-                    // NaN, only 1st mantissa bit set
-                    ret = 0xFE00u;
-            }
-            else { 
-                // Normalized number
-                unsigned int32 hs = (xs >> 16); // Sign bit
-                unsigned int32 hm;
-                // Exponent unbias the single, then bias the halfp
-                int32 hes = ((int)(xe >> 23)) - 127 + 15; 
-                if (hes >= 0x1F)  
-                    // Overflow: return signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else if (hes <= 0) {
-                    // Underflow
-                    if ((14 - hes) > 24) {  
-                        // Mantissa shifted all the way off & no rounding possibility
-                        hm = 0u;  // Set mantissa to zero
+            unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+            unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+            unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+            if (xe == 0) {  
+                // Denormal will underflow, return a signed zero
+                ret = (xs >> 16);
+            } 
+            else {
+                cif (xe == 0x7F800000u) {  
+                    // Inf or NaN (all the exponent bits are set)
+                    if (xm == 0)
+                        // Zero mantissa -> signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else
+                        // NaN, only 1st mantissa bit set
+                        ret = 0xFE00u;
+                }
+                else { 
+                    // Normalized number
+                    unsigned int32 hs = (xs >> 16); // Sign bit
+                    unsigned int32 hm;
+                    // Exponent: unbias the single, then bias the halfp
+                    int32 hes = ((int)(xe >> 23)) - 127 + 15; 
+                    if (hes >= 0x1F)  
+                        // Overflow: return signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else if (hes <= 0) {
+                        // Underflow
+                        if ((14 - hes) > 24) {  
+                            // Mantissa shifted all the way off & no rounding possibility
+                            hm = 0u;  // Set mantissa to zero
+                        } 
+                        else {
+                            xm |= 0x00800000u;  // Add the hidden leading bit
+                            hm = (xm >> (14 - hes)); // Mantissa
+                            if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
+                                // Round, might overflow into exp bit, but this is OK
+                                hm += 1u; 
+                        }
+                        ret = (hs | hm);
                    } 
                    else {
-                        xm |= 0x00800000u;  // Add the hidden leading bit
-                        hm = (xm >> (14 - hes)); // Mantissa
-                        if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
-                            // Round, might overflow into exp bit, but this is OK
-                            hm += 1u; 
+                        unsigned int32 he = (hes << 10); // Exponent
+                        hm = (xm >> 13); // Mantissa
+                        if (xm & 0x00001000u) // Check for rounding
+                            // Round, might overflow to inf, this is OK
+                            ret = (hs | he | hm) + 1u; 
+                        else
+                            ret = (hs | he | hm);
                    }
-                    ret = (hs | hm);
-                } 
-                else {
-                    unsigned int32 he = (hes << 10); // Exponent
-                    hm = (xm >> 13); // Mantissa
-                    if (xm & 0x00001000u) // Check for rounding
-                        // Round, might overflow to inf, this is OK
-                        ret = (hs | he | hm) + 1u; 
-                    else
-                        ret = (hs | he | hm);
                }
            }
        }
+        return (int16)ret;
    }
-    return (int16)ret;
 }


 static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
-    uniform unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
-    uniform unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
-    uniform unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
-
-    // sign
-    uniform unsigned int32 xs = ((unsigned int32) hs) << 16; 
-    // Exponent: unbias the halfp, then bias the single
-    uniform int32 xes = ((int32) (he >> 10)) - 15 + 127; 
-    // Exponent
-    uniform unsigned int32 xe = (unsigned int32) (xes << 23);
-    // Mantissa
-    uniform unsigned int32 xm = ((unsigned int32) hm) << 13; 
-    return floatbits(xs | xe | xm);
+    if (__have_native_half) {
+        return __half_to_float_uniform(h);
+    }
+    else {
+        uniform unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
+        uniform unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
+        uniform unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits

+        // sign
+        uniform unsigned int32 xs = ((unsigned int32) hs) << 16; 
+        // Exponent: unbias the halfp, then bias the single
+        uniform int32 xes = ((int32) (he >> 10)) - 15 + 127; 
+        // Exponent
+        uniform unsigned int32 xe = (unsigned int32) (xes << 23);
+        // Mantissa
+        uniform unsigned int32 xm = ((unsigned int32) hm) << 13; 
+        return floatbits(xs | xe | xm);
+    }
 }

 static inline float half_to_float_fast(unsigned int16 h) {
-    unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
-    unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
-    unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
-
-    // sign
-    unsigned int32 xs = ((unsigned int32) hs) << 16; 
-    // Exponent: unbias the halfp, then bias the single
-    int32 xes = ((int32) (he >> 10)) - 15 + 127; 
-    // Exponent
-    unsigned int32 xe = (unsigned int32) (xes << 23);
-    // Mantissa
-    unsigned int32 xm = ((unsigned int32) hm) << 13; 
-    return floatbits(xs | xe | xm);
+    if (__have_native_half) {
+        return __half_to_float_varying(h);
+    }
+    else {
+        unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
+        unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
+        unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits

+        // sign
+        unsigned int32 xs = ((unsigned int32) hs) << 16; 
+        // Exponent: unbias the halfp, then bias the single
+        int32 xes = ((int32) (he >> 10)) - 15 + 127; 
+        // Exponent
+        unsigned int32 xe = (unsigned int32) (xes << 23);
+        // Mantissa
+        unsigned int32 xm = ((unsigned int32) hm) << 13; 
+        return floatbits(xs | xe | xm);
+    }
 }

 static inline uniform int16 float_to_half_fast(uniform float f) {
-    uniform int32 x = intbits(f);
-    uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-    uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-    uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+    if (__have_native_half) {
+        return __float_to_half_uniform(f);
+    }
+    else {
+        uniform int32 x = intbits(f);
+        uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+        uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+        uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits

-    uniform unsigned int32 hs = (xs >> 16); // Sign bit
-    // Exponent unbias the single, then bias the halfp
-    uniform int32 hes = ((int)(xe >> 23)) - 127 + 15; 
-    uniform unsigned int32 he = (hes << 10); // Exponent
-    uniform int32 hm = (xm >> 13); // Mantissa
-    uniform int32 ret = (hs | he | hm);
+        uniform unsigned int32 hs = (xs >> 16); // Sign bit
+        // Exponent: unbias the single, then bias the halfp
+        uniform int32 hes = ((int)(xe >> 23)) - 127 + 15; 
+        uniform unsigned int32 he = (hes << 10); // Exponent
+        uniform int32 hm = (xm >> 13); // Mantissa
+        uniform int32 ret = (hs | he | hm);

-    if (xm & 0x00001000u) // Check for rounding
-        // Round, might overflow to inf, this is OK
-        ret += 1u; 
+        if (xm & 0x00001000u) // Check for rounding
+            // Round, might overflow to inf, this is OK
+            ret += 1u; 

-    return (int16)ret;
+        return (int16)ret;
+    }
 }

 static inline int16 float_to_half_fast(float f) {
-    int32 x = intbits(f);
-    unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-    unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-    unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+    if (__have_native_half) {
+        return __float_to_half_varying(f);
+    }
+    else {
+        int32 x = intbits(f);
+        unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+        unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+        unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits

-    unsigned int32 hs = (xs >> 16); // Sign bit
-    // Exponent unbias the single, then bias the halfp
-    int32 hes = ((int)(xe >> 23)) - 127 + 15; 
-    unsigned int32 he = (hes << 10); // Exponent
-    int32 hm = (xm >> 13); // Mantissa
-    int32 ret = (hs | he | hm);
+        unsigned int32 hs = (xs >> 16); // Sign bit
+        // Exponent: unbias the single, then bias the halfp
+        int32 hes = ((int)(xe >> 23)) - 127 + 15; 
+        unsigned int32 he = (hes << 10); // Exponent
+        int32 hm = (xm >> 13); // Mantissa
+        int32 ret = (hs | he | hm);

-    if (xm & 0x00001000u) // Check for rounding
-        // Round, might overflow to inf, this is OK
-        ret += 1u; 
+        if (xm & 0x00001000u) // Check for rounding
+            // Round, might overflow to inf, this is OK
+            ret += 1u; 

-    return (int16)ret;
+        return (int16)ret;
+    }
 }

 ///////////////////////////////////////////////////////////////////////////
@@ -3095,16 +3193,15 @@ static inline unsigned int random(RNGState * uniform state)
 {
    unsigned int b;

-    // FIXME: state->z1, etc..
-    b  = (((*state).z1 << 6) ^ (*state).z1) >> 13;
-    (*state).z1 = (((*state).z1 & 4294967294U) << 18) ^ b;
-    b  = (((*state).z2 << 2) ^ (*state).z2) >> 27; 
-    (*state).z2 = (((*state).z2 & 4294967288U) << 2) ^ b;
-    b  = (((*state).z3 << 13) ^ (*state).z3) >> 21;
-    (*state).z3 = (((*state).z3 & 4294967280U) << 7) ^ b;
-    b  = (((*state).z4 << 3) ^ (*state).z4) >> 12;
-    (*state).z4 = (((*state).z4 & 4294967168U) << 13) ^ b;
-    return ((*state).z1 ^ (*state).z2 ^ (*state).z3 ^ (*state).z4);
+    b  = ((state->z1 << 6) ^ state->z1) >> 13;
+    state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
+    b  = ((state->z2 << 2) ^ state->z2) >> 27; 
+    state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
+    b  = ((state->z3 << 13) ^ state->z3) >> 21;
+    state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
+    b  = ((state->z4 << 3) ^ state->z4) >> 12;
+    state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
+    return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
 }

 static inline float frandom(RNGState * uniform state)
@@ -3120,30 +3217,30 @@ static inline uniform unsigned int __seed4(RNGState * uniform state,
    uniform unsigned int c1 = 0xf0f0f0f0;
    uniform unsigned int c2 = 0x0f0f0f0f;

-    (*state).z1 = insert((*state).z1, start + 0, seed);
-    (*state).z1 = insert((*state).z1, start + 1, seed ^ c1);
-    (*state).z1 = insert((*state).z1, start + 2, (seed << 3) ^ c1);
-    (*state).z1 = insert((*state).z1, start + 3, (seed << 2) ^ c2);
+    state->z1 = insert(state->z1, start + 0, seed);
+    state->z1 = insert(state->z1, start + 1, seed ^ c1);
+    state->z1 = insert(state->z1, start + 2, (seed << 3) ^ c1);
+    state->z1 = insert(state->z1, start + 3, (seed << 2) ^ c2);

    seed += 131;
-    (*state).z2 = insert((*state).z2, start + 0, seed);
-    (*state).z2 = insert((*state).z2, start + 1, seed ^ c1);
-    (*state).z2 = insert((*state).z2, start + 2, (seed << 3) ^ c1);
-    (*state).z2 = insert((*state).z2, start + 3, (seed << 2) ^ c2);
+    state->z2 = insert(state->z2, start + 0, seed);
+    state->z2 = insert(state->z2, start + 1, seed ^ c1);
+    state->z2 = insert(state->z2, start + 2, (seed << 3) ^ c1);
+    state->z2 = insert(state->z2, start + 3, (seed << 2) ^ c2);

-    seed ^= extract((*state).z2, 2);
-    (*state).z3 = insert((*state).z3, start + 0, seed);
-    (*state).z3 = insert((*state).z3, start + 1, seed ^ c1);
-    (*state).z3 = insert((*state).z3, start + 2, (seed << 3) ^ c1);
-    (*state).z3 = insert((*state).z3, start + 3, (seed << 2) ^ c2);
+    seed ^= extract(state->z2, 2);
+    state->z3 = insert(state->z3, start + 0, seed);
+    state->z3 = insert(state->z3, start + 1, seed ^ c1);
+    state->z3 = insert(state->z3, start + 2, (seed << 3) ^ c1);
+    state->z3 = insert(state->z3, start + 3, (seed << 2) ^ c2);

    seed <<= 4;
    seed += 3;
-    seed ^= extract((*state).z1, 3);
-    (*state).z4 = insert((*state).z4, start + 0, seed);
-    (*state).z4 = insert((*state).z4, start + 1, seed ^ c1);
-    (*state).z4 = insert((*state).z4, start + 2, (seed << 3) ^ c1);
-    (*state).z4 = insert((*state).z4, start + 3, (seed << 2) ^ c2);
+    seed ^= extract(state->z1, 3);
+    state->z4 = insert(state->z4, start + 0, seed);
+    state->z4 = insert(state->z4, start + 1, seed ^ c1);
+    state->z4 = insert(state->z4, start + 2, (seed << 3) ^ c1);
+    state->z4 = insert(state->z4, start + 3, (seed << 2) ^ c2);

    return seed;
 }
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -494,6 +494,7 @@ lEmitIfStatements(FunctionEmitContext *ctx, Stmt *stmts, const char *trueOrFalse
        ctx->EndScope();
 }

+
 void
 IfStmt::EmitCode(FunctionEmitContext *ctx) const {
    // First check all of the things that might happen due to errors
@@ -694,6 +695,23 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
        }

        // All indices are in-bounds
+        return true;
+    }
+
+    MemberExpr *me;
+    if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
+        me->dereferenceExpr) {
+        *okPtr = false;
+        return false;
+    }
+
+    DereferenceExpr *de;
+    if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
+        const Type *exprType = de->expr->GetType();
+        if (dynamic_cast<const PointerType *>(exprType) != NULL) {
+            *okPtr = false;
+            return false;
+        }
    }

    return true;
@@ -1132,7 +1150,7 @@ DoStmt::TypeCheck() {
                            !lHasVaryingBreakOrContinue(bodyStmts));
        testExpr = new TypeCastExpr(uniformTest ? AtomicType::UniformBool :
                                                  AtomicType::VaryingBool,
-                                    testExpr, false, testExpr->pos);
+                                    testExpr, testExpr->pos);
    }

    return this;
@@ -1317,8 +1335,7 @@ ForStmt::TypeCheck() {
                            !g->opt.disableUniformControlFlow &&
                            !lHasVaryingBreakOrContinue(stmts));
        test = new TypeCastExpr(uniformTest ? AtomicType::UniformBool :
-                                AtomicType::VaryingBool,
-                                test, false, test->pos);
+                                AtomicType::VaryingBool, test, test->pos);
        test = ::TypeCheck(test);
        if (test == NULL)
            return NULL;
@@ -1558,9 +1575,8 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    if (ctx->GetCurrentBasicBlock() == NULL || stmts == NULL) 
        return;

-    llvm::BasicBlock *bbCheckExtras = ctx->CreateBasicBlock("foreach_check_extras");
-    llvm::BasicBlock *bbDoExtras = ctx->CreateBasicBlock("foreach_do_extras");
-    llvm::BasicBlock *bbBody = ctx->CreateBasicBlock("foreach_body");
+    llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body");
+    llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body");
    llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit");

    llvm::Value *oldMask = ctx->GetInternalMask();
@@ -1578,8 +1594,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    // dimension and a number of derived values.
    std::vector<llvm::BasicBlock *> bbReset, bbStep, bbTest;
    std::vector<llvm::Value *> startVals, endVals, uniformCounterPtrs;
-    std::vector<llvm::Value *> nItems, nExtras, alignedEnd;
-    std::vector<llvm::Value *> extrasMaskPtrs;
+    std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;

    std::vector<int> span(nDims, 0);
    lGetSpans(nDims-1, nDims, g->target.vectorWidth, isTiled, &span[0]);
@@ -1588,7 +1603,9 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
        // Basic blocks that we'll fill in later with the looping logic for
        // this dimension.
        bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
-        bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
+        if (i < nDims-1)
+            // stepping for the innermost dimension is handled specially
+            bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
        bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));

        // Start and end value for this loop dimension
@@ -1600,14 +1617,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
        endVals.push_back(ev);

        // nItems = endVal - startVal
-        nItems.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv,
-                                             "nitems"));
+        llvm::Value *nItems = 
+            ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems");

        // nExtras = nItems % (span for this dimension)
        // This gives us the number of extra elements we need to deal with
        // at the end of the loop for this dimension that don't fit cleanly
        // into a vector width.
-        nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems[i],
+        nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems,
                                              LLVMInt32(span[i]), "nextras"));

        // alignedEnd = endVal - nExtras
@@ -1626,8 +1643,9 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
        // There is also a varying variable that holds the set of index
        // values for each dimension in the current loop iteration; this is
        // the value that is program-visible.
-        dimVariables[i]->storagePtr = ctx->AllocaInst(LLVMTypes::Int32VectorType, 
-                                                  dimVariables[i]->name.c_str());
+        dimVariables[i]->storagePtr = 
+            ctx->AllocaInst(LLVMTypes::Int32VectorType, 
+                            dimVariables[i]->name.c_str());
        dimVariables[i]->parentFunction = ctx->GetFunction();
        ctx->EmitVariableDebugInfo(dimVariables[i]);

@@ -1639,7 +1657,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
        ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
    }

-    ctx->StartForeach(bbStep[nDims-1]);
+    ctx->StartForeach();

    // On to the outermost loop's test
    ctx->BranchInst(bbTest[0]);
@@ -1660,9 +1678,25 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    }

    ///////////////////////////////////////////////////////////////////////////
-    // foreach_test
+    // foreach_step: increment the uniform counter by the vector width.
+    // Note that we don't increment the varying counter here as well but
+    // just generate its value when we need it in the loop body.  Don't do
+    // this for the innermost dimension, which has a more complex stepping
+    // structure.
+    for (int i = 0; i < nDims-1; ++i) {
+        ctx->SetCurrentBasicBlock(bbStep[i]);
+        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
+        llvm::Value *newCounter =  
+            ctx->BinaryOperator(llvm::Instruction::Add, counter,
+                                LLVMInt32(span[i]), "new_counter");
+        ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
+        ctx->BranchInst(bbTest[i]);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // foreach_test (for all dimensions other than the innermost...)
    std::vector<llvm::Value *> inExtras;
-    for (int i = 0; i < nDims; ++i) {
+    for (int i = 0; i < nDims-1; ++i) {
        ctx->SetCurrentBasicBlock(bbTest[i]);

        llvm::Value *haveExtras = 
@@ -1700,8 +1734,6 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
        if (i == 0)
            ctx->StoreInst(emask, extrasMaskPtrs[i]);
        else {
-            // FIXME: at least specialize the innermost loop to not do all
-            // this mask stuff each time through the test...
            llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
            llvm::Value *newMask =
                ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
@@ -1712,59 +1744,267 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
        llvm::Value *notAtEnd = 
            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
                         counter, endVals[i]);
-        if (i != nDims-1)
-            ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
+        ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // foreach_test (for innermost dimension)
+    //
+    // All of the outer dimensions are handled generically--basically as a
+    // for() loop from the start value to the end value, where at each loop
+    // test, we compute the mask of active elements for the current
+    // dimension and then update an overall mask that is the AND
+    // combination of all of the outer ones.
+    //
+    // The innermost loop is handled specially, for performance purposes.
+    // When starting the innermost dimension, we start by checking once
+    // whether any of the outer dimensions has set the mask to be
+    // partially-active or not.  We follow different code paths for these
+    // two cases, taking advantage of the knowledge that the mask is all
+    // on, when this is the case.
+    //
+    // In each of these code paths, we start with a loop from the starting
+    // value to the aligned end value for the innermost dimension; we can
+    // guarantee that the innermost loop will have an "all on" mask (as far
+    // as its dimension is concerned) for the duration of this loop.  Doing
+    // so allows us to emit code that assumes the mask is all on (for the
+    // case where none of the outer dimensions has set the mask to be
+    // partially on), or allows us to emit code that just uses the mask
+    // from the outer dimensions directly (for the case where they have).
+    //
+    // After this loop, we just need to deal with one vector's worth of
+    // "ragged extra bits", where the mask used includes the effect of the
+    // mask for the innermost dimension.
+    //
+    // We start out this process by emitting the check that determines
+    // whether any of the enclosing dimensions is partially active
+    // (i.e. processing extra elements that don't exactly fit into a
+    // vector).
+    llvm::BasicBlock *bbOuterInExtras = 
+        ctx->CreateBasicBlock("outer_in_extras");
+    llvm::BasicBlock *bbOuterNotInExtras = 
+        ctx->CreateBasicBlock("outer_not_in_extras");
+
+    ctx->SetCurrentBasicBlock(bbTest[nDims-1]);
+    if (inExtras.size())
+        ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras,
+                        inExtras.back());
+    else
+        // for a 1D iteration domain, we certainly don't have any enclosing
+        // dimensions that are processing extra elements.
+        ctx->BranchInst(bbOuterNotInExtras);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // One or more outer dimensions in extras, so we need to mask for the loop
+    // body regardless.  We break this into two cases, roughly:
+    // for (counter = start; counter < alignedEnd; counter += step) {
+    //   // mask is all on for inner, so set mask to outer mask
+    //   // run loop body with mask
+    // }
+    // // counter == alignedEnd
+    // if (counter < end) {
+    //   // set mask to outermask & (counter+programCounter < end)
+    //   // run loop body with mask
+    // }
+    llvm::BasicBlock *bbAllInnerPartialOuter =
+        ctx->CreateBasicBlock("all_inner_partial_outer");
+    llvm::BasicBlock *bbPartial =
+        ctx->CreateBasicBlock("both_partial");
+    ctx->SetCurrentBasicBlock(bbOuterInExtras); {
+        // Update the varying counter value here, since all subsequent
+        // blocks along this path need it.
+        lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], 
+                              dimVariables[nDims-1]->storagePtr, span);
+
+        // here we just check to see if counter < alignedEnd
+        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
+        llvm::Value *beforeAlignedEnd = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+                         counter, alignedEnd[nDims-1], "before_aligned_end");
+        ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd);
+    }
+
+    // Below we have a basic block that runs the loop body code for the
+    // case where the mask is partially but not fully on.  This same block
+    // runs in multiple cases: both for handling any ragged extra data for
+    // the innermost dimension but also when outer dimensions have set the
+    // mask to be partially on. 
+    //
+    // The value stored in stepIndexAfterMaskedBodyPtr is used after each
+    // execution of the body code to determine whether the innermost index
+    // value should be incremented by the step (we're running the "for"
+    // loop of full vectors at the innermost dimension, with outer
+    // dimensions having set the mask to be partially on), or whether we're
+    // running once for the ragged extra bits at the end of the innermost
+    // dimension, in which case we're done with the innermost dimension and
+    // should step the loop counter for the next enclosing dimension
+    // instead.
+    llvm::Value *stepIndexAfterMaskedBodyPtr =
+        ctx->AllocaInst(LLVMTypes::BoolType, "step_index");
+
+    ///////////////////////////////////////////////////////////////////////////
+    // We're in the inner loop part where the only masking is due to outer
+    // dimensions but the innermost dimension fits fully into a vector's
+    // width.  Set the mask and jump to the masked loop body.
+    ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); {
+        llvm::Value *mask;
+        if (extrasMaskPtrs.size() == 0)
+            // 1D loop; we shouldn't ever get here anyway
+            mask = LLVMMaskAllOff;
        else
-            ctx->BranchInst(bbCheckExtras, bbReset[i], notAtEnd);
+            mask = ctx->LoadInst(extrasMaskPtrs.back());
+        ctx->SetInternalMask(mask);
+
+        ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr);
+        ctx->BranchInst(bbMaskedBody);
    }

    ///////////////////////////////////////////////////////////////////////////
-    // foreach_step: increment the uniform counter by the vector width.
-    // Note that we don't increment the varying counter here as well but
-    // just generate its value when we need it in the loop body.
-    for (int i = 0; i < nDims; ++i) {
-        ctx->SetCurrentBasicBlock(bbStep[i]);
-        if (i == nDims-1)
-            ctx->RestoreContinuedLanes();
-        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
-        llvm::Value *newCounter =  
-            ctx->BinaryOperator(llvm::Instruction::Add, counter,
-                                LLVMInt32(span[i]), "new_counter");
-        ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
-        ctx->BranchInst(bbTest[i]);
+    // We need to include the effect of the innermost dimension in the mask
+    // for the final bits here
+    ctx->SetCurrentBasicBlock(bbPartial); {
+        llvm::Value *varyingCounter = 
+            ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
+        llvm::Value *smearEnd = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
+        for (int j = 0; j < g->target.vectorWidth; ++j)
+            smearEnd = ctx->InsertInst(smearEnd, endVals[nDims-1], j, "smear_end");
+        llvm::Value *emask = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+                         varyingCounter, smearEnd);
+        emask = ctx->I1VecToBoolVec(emask);
+
+        if (nDims == 1)
+            ctx->SetInternalMask(emask);
+        else {
+            llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
+            llvm::Value *newMask =
+                ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
+                                    "extras_mask");
+            ctx->SetInternalMask(newMask);
+        }
+
+        ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
+        ctx->BranchInst(bbMaskedBody);
    }

    ///////////////////////////////////////////////////////////////////////////
-    // foreach_check_extras: see if we need to deal with any partial
-    // vector's worth of work that's left.
-    ctx->SetCurrentBasicBlock(bbCheckExtras);
-    ctx->AddInstrumentationPoint("foreach loop check extras");
-    ctx->BranchInst(bbDoExtras, bbBody, inExtras[nDims-1]);
+    // None of the outer dimensions is processing extras; along the lines
+    // of above, we can express this as:
+    // for (counter = start; counter < alignedEnd; counter += step) {
+    //   // mask is all on
+    //   // run loop body with mask all on
+    // }
+    // // counter == alignedEnd
+    // if (counter < end) {
+    //   // set mask to (counter+programCounter < end)
+    //   // run loop body with mask
+    // }
+    llvm::BasicBlock *bbPartialInnerAllOuter =
+        ctx->CreateBasicBlock("partial_inner_all_outer");
+    ctx->SetCurrentBasicBlock(bbOuterNotInExtras); {
+        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
+        llvm::Value *beforeAlignedEnd = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+                         counter, alignedEnd[nDims-1], "before_aligned_end");
+        ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter,
+                        beforeAlignedEnd);
+    }

    ///////////////////////////////////////////////////////////////////////////
-    // foreach_body: do a full vector's worth of work.  We know that all
+    // full_body: do a full vector's worth of work.  We know that all
    // lanes will be running here, so we explicitly set the mask to be 'all
    // on'.  This ends up being relatively straightforward: just update the
    // value of the varying loop counter and have the statements in the
    // loop body emit their code.
-    ctx->SetCurrentBasicBlock(bbBody);
-    ctx->SetInternalMask(LLVMMaskAllOn);
-    ctx->AddInstrumentationPoint("foreach loop body");
-    stmts->EmitCode(ctx);
-    Assert(ctx->GetCurrentBasicBlock() != NULL);
-    ctx->BranchInst(bbStep[nDims-1]);
+    llvm::BasicBlock *bbFullBodyContinue = 
+        ctx->CreateBasicBlock("foreach_full_continue");
+    ctx->SetCurrentBasicBlock(bbFullBody); {
+        ctx->SetInternalMask(LLVMMaskAllOn);
+        lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], 
+                              dimVariables[nDims-1]->storagePtr, span);
+        ctx->SetContinueTarget(bbFullBodyContinue);
+        ctx->AddInstrumentationPoint("foreach loop body (all on)");
+        stmts->EmitCode(ctx);
+        Assert(ctx->GetCurrentBasicBlock() != NULL);
+        ctx->BranchInst(bbFullBodyContinue);
+    }
+    ctx->SetCurrentBasicBlock(bbFullBodyContinue); {
+        ctx->RestoreContinuedLanes();
+        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
+        llvm::Value *newCounter =  
+            ctx->BinaryOperator(llvm::Instruction::Add, counter,
+                                LLVMInt32(span[nDims-1]), "new_counter");
+        ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
+        ctx->BranchInst(bbOuterNotInExtras);
+    }

    ///////////////////////////////////////////////////////////////////////////
-    // foreach_doextras: set the mask and have the statements emit their
+    // We're done running blocks with the mask all on; see if the counter is
+    // less than the end value, in which case we need to run the body one
+    // more time to get the extra bits.
+    llvm::BasicBlock *bbSetInnerMask = 
+        ctx->CreateBasicBlock("partial_inner_only");
+    ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); {
+        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
+        llvm::Value *beforeFullEnd = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+                         counter, endVals[nDims-1], "before_full_end");
+        ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // The outer dimensions are all on, so the mask is just given by the
+    // mask for the innermost dimension
+    ctx->SetCurrentBasicBlock(bbSetInnerMask); {
+        llvm::Value *varyingCounter = 
+            lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], 
+                                  dimVariables[nDims-1]->storagePtr, span);
+        llvm::Value *smearEnd = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
+        for (int j = 0; j < g->target.vectorWidth; ++j)
+            smearEnd = ctx->InsertInst(smearEnd, endVals[nDims-1], j, "smear_end");
+        llvm::Value *emask = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
+                         varyingCounter, smearEnd);
+        emask = ctx->I1VecToBoolVec(emask);
+        ctx->SetInternalMask(emask);
+
+        ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
+        ctx->BranchInst(bbMaskedBody);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // masked_body: set the mask and have the statements emit their
    // code again.  Note that it's generally worthwhile having two copies
    // of the statements' code, since the code above is emitted with the
    // mask known to be all-on, which in turn leads to more efficient code
    // for that case.
-    ctx->SetCurrentBasicBlock(bbDoExtras);
-    llvm::Value *mask = ctx->LoadInst(extrasMaskPtrs[nDims-1]);
-    ctx->SetInternalMask(mask);
-    stmts->EmitCode(ctx);
-    ctx->BranchInst(bbStep[nDims-1]);
+    llvm::BasicBlock *bbStepInnerIndex = 
+        ctx->CreateBasicBlock("step_inner_index");
+    llvm::BasicBlock *bbMaskedBodyContinue = 
+        ctx->CreateBasicBlock("foreach_masked_continue");
+    ctx->SetCurrentBasicBlock(bbMaskedBody); {
+        ctx->AddInstrumentationPoint("foreach loop body (masked)");
+        ctx->SetContinueTarget(bbMaskedBodyContinue);
+        stmts->EmitCode(ctx);
+        ctx->BranchInst(bbMaskedBodyContinue);
+    }
+    ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); {
+        ctx->RestoreContinuedLanes();
+        llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr);
+        ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // step the innermost index, for the case where we're doing the
+    // innermost for loop over full vectors.
+    ctx->SetCurrentBasicBlock(bbStepInnerIndex); {
+        llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
+        llvm::Value *newCounter =  
+            ctx->BinaryOperator(llvm::Instruction::Add, counter,
+                                LLVMInt32(span[nDims-1]), "new_counter");
+        ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
+        ctx->BranchInst(bbOuterInExtras);
+    }

    ///////////////////////////////////////////////////////////////////////////
    // foreach_exit: All done.  Restore the old mask and clean up
@@ -1869,6 +2109,301 @@ ForeachStmt::Print(int indent) const {
 }


+///////////////////////////////////////////////////////////////////////////
+// CaseStmt
+
+/** Decides whether the code emitted for a 'case' or 'default' label should
+    test for an "all off" execution mask right after the label, before the
+    statements that follow it run.  Returns false when there are no
+    statements (stmts == NULL).
+ */
+static bool
+lCheckMask(Stmt *stmts) {
+    if (stmts == NULL)
+        return false;
+
+    int cost = EstimateCost(stmts);
+
+    bool safeToRunWithAllLanesOff = true;
+    WalkAST(stmts, lCheckAllOffSafety, NULL, &safeToRunWithAllLanesOff);
+
+    // The mask should be checked if the code following the
+    // 'case'/'default' is relatively complex, or if it would be unsafe to
+    // run that code with the execution mask all off.
+    return (cost > PREDICATE_SAFE_IF_STATEMENT_COST ||
+            safeToRunWithAllLanesOff == false);
+}
+
+/** Stores the 'case' label value v and the statements s that follow it. */
+CaseStmt::CaseStmt(int v, Stmt *s, SourcePos pos) 
+    : Stmt(pos), value(v) {
+    stmts = s;
+}
+
+/** Emits the 'case' label, then the code for the statements after it (if any). */
+void
+CaseStmt::EmitCode(FunctionEmitContext *ctx) const {
+    ctx->EmitCaseLabel(value, lCheckMask(stmts), pos);
+    if (stmts)
+        stmts->EmitCode(ctx);
+}
+
+/** Prints the case value and source position, then the statements after the label. */
+void
+CaseStmt::Print(int indent) const {
+    printf("%*cCase [%d] label", indent, ' ', value);
+    pos.Print();
+    printf("\n");
+    if (stmts != NULL) stmts->Print(indent+4);  // EmitCode() also allows NULL stmts
+}
+
+/** Nothing is checked at the label itself; returns this unchanged. */
+Stmt *
+CaseStmt::TypeCheck() {
+    return this;
+}
+
+/** The 'case' label itself is assigned a cost of zero. */
+int
+CaseStmt::EstimateCost() const {
+    return 0;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// DefaultStmt
+
+DefaultStmt::DefaultStmt(Stmt *s, SourcePos pos) 
+    : Stmt(pos) {
+    stmts = s;  // statements that follow the "default" label
+}
+
+/** Emits the 'default' label, then the code for the statements after it (if any). */
+void
+DefaultStmt::EmitCode(FunctionEmitContext *ctx) const {
+    ctx->EmitDefaultLabel(lCheckMask(stmts), pos);
+    if (stmts)
+        stmts->EmitCode(ctx);
+}
+
+/** Prints the statement name and source position, then the statements after the label. */
+void
+DefaultStmt::Print(int indent) const {
+    printf("%*cDefault Stmt", indent, ' ');
+    pos.Print();
+    printf("\n");
+    if (stmts != NULL) stmts->Print(indent+4);  // EmitCode() also allows NULL stmts
+}
+
+/** Nothing is checked at the label itself; returns this unchanged. */
+Stmt *
+DefaultStmt::TypeCheck() {
+    return this;
+}
+
+/** The 'default' label itself is assigned a cost of zero. */
+int
+DefaultStmt::EstimateCost() const {
+    return 0;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// SwitchStmt
+
+SwitchStmt::SwitchStmt(Expr *e, Stmt *s, SourcePos pos) 
+    : Stmt(pos) {
+    expr = e;   // expression that selects which label to jump to
+    stmts = s;  // statement block after the "switch" expression
+}
+
+
+/* An instance of this structure is carried along as we traverse the AST
+   nodes for the statements after a "switch" statement.  We use this
+   structure to record all of the 'case' and 'default' statements after the
+   "switch". */
+struct SwitchVisitInfo {
+    SwitchVisitInfo(FunctionEmitContext *c) { 
+        ctx = c;
+        defaultBlock = NULL; 
+        lastBlock = NULL;
+    }
+    /* Emit context; used to create the basic block for each label. */
+    FunctionEmitContext *ctx;
+
+    /* Basic block for the code following the "default" label (if any). */
+    llvm::BasicBlock *defaultBlock;
+
+    /* (value, block) pairs: the integer after each "case" label and the
+       basic block that follows the corresponding "case" label. */
+    std::vector<std::pair<int, llvm::BasicBlock *> > caseBlocks;
+
+    /* For each basic block for a "case" label or a "default" label,
+       nextBlock[block] stores the basic block pointer for the next
+       subsequent "case" or "default" label in the program. */
+    std::map<llvm::BasicBlock *, llvm::BasicBlock *> nextBlock;
+
+    /* The last basic block created for a "case" or "default" label; when
+       we create the basic block for the next one, we'll use this to update
+       the nextBlock map<> above.  NULL until the first label is seen. */
+    llvm::BasicBlock *lastBlock;
+};
+
+/** WalkAST() pre-visit callback: records each case/default label in the SwitchVisitInfo passed as d. */
+static bool
+lSwitchASTPreVisit(ASTNode *node, void *d) {
+    if (dynamic_cast<SwitchStmt *>(node) != NULL)
+        // don't continue recursively into a nested switch--we only want
+        // our own case and default statements!
+        return false;
+
+    CaseStmt *cs = dynamic_cast<CaseStmt *>(node);
+    DefaultStmt *ds = dynamic_cast<DefaultStmt *>(node);
+
+    SwitchVisitInfo *svi = (SwitchVisitInfo *)d;
+    llvm::BasicBlock *bb = NULL;
+    if (cs != NULL) {
+        // Complain if we've seen a case statement with the same value
+        // already
+        for (int i = 0; i < (int)svi->caseBlocks.size(); ++i) {
+            if (svi->caseBlocks[i].first == cs->value) {
+                Error(cs->pos, "Duplicate case value \"%d\".", cs->value); 
+                return true;
+            }
+        }
+        
+        // Otherwise create a new basic block for the code following this
+        // 'case' statement and record the mapping between the case label
+        // value and the basic block
+        char buf[32];
+        sprintf(buf, "case_%d", cs->value);
+        bb = svi->ctx->CreateBasicBlock(buf);
+        svi->caseBlocks.push_back(std::make_pair(cs->value, bb));
+    }
+    else if (ds != NULL) {
+        // And complain if we've seen another 'default' label..
+        if (svi->defaultBlock != NULL) {
+            Error(ds->pos, "Multiple \"default\" labels in switch statement.");
+            return true;
+        }
+        else {
+            // Otherwise create a basic block for the code following the
+            // "default".
+            bb = svi->ctx->CreateBasicBlock("default");
+            svi->defaultBlock = bb;
+        }
+    }
+
+    // If we saw a "case" or "default" label, then update the map to record
+    // that the block we just created follows the block created for the
+    // previous label in the "switch".
+    if (bb != NULL) {
+        svi->nextBlock[svi->lastBlock] = bb;
+        svi->lastBlock = bb;
+    }
+
+    return true;
+}
+
+/** Emits the switch dispatch and the code for its body; execution continues in "switch_done". */
+void
+SwitchStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (ctx->GetCurrentBasicBlock() == NULL)
+        return;
+
+    const Type *type;
+    if (expr == NULL || ((type = expr->GetType()) == NULL)) {
+        Assert(m->errorCount > 0);
+        return;
+    }
+
+    // Basic block we'll end up after the switch statement
+    llvm::BasicBlock *bbDone = ctx->CreateBasicBlock("switch_done");
+
+    // Walk the AST of the statements after the 'switch' to collect a bunch
+    // of information about the structure of the 'case' and 'default'
+    // statements.
+    SwitchVisitInfo svi(ctx);
+    WalkAST(stmts, lSwitchASTPreVisit, NULL, &svi);
+    // Record that the basic block following the last one created for a
+    // case/default is the block after the end of the switch statement.
+    svi.nextBlock[svi.lastBlock] = bbDone;
+
+    llvm::Value *exprValue = expr->GetValue(ctx);
+    if (exprValue == NULL) {
+        Assert(m->errorCount > 0);
+        return;
+    }
+    // Uniform control flow needs a uniform expression and no varying break/continue in the body.
+    bool isUniformCF = (type->IsUniformType() &&
+                        lHasVaryingBreakOrContinue(stmts) == false);
+    ctx->StartSwitch(isUniformCF, bbDone);
+    ctx->SwitchInst(exprValue, svi.defaultBlock ? svi.defaultBlock : bbDone,
+                    svi.caseBlocks, svi.nextBlock);
+
+    if (stmts != NULL)
+        stmts->EmitCode(ctx);
+
+    if (ctx->GetCurrentBasicBlock() != NULL)
+        ctx->BranchInst(bbDone);
+
+    ctx->SetCurrentBasicBlock(bbDone);
+    ctx->EndSwitch();
+}
+
+/** Prints the statement name and position, the switch expression, then the body. */
+void
+SwitchStmt::Print(int indent) const {
+    printf("%*cSwitch Stmt", indent, ' ');
+    pos.Print();
+    printf("\n");
+    printf("%*cexpr = ", indent, ' ');
+    if (expr != NULL) expr->Print();    // EmitCode() also tolerates NULL expr/stmts
+    printf("\n");
+    if (stmts != NULL) stmts->Print(indent+4);
+}
+
+/** Converts expr to a 32- or 64-bit int of matching variability; returns NULL on failure. */
+Stmt *
+SwitchStmt::TypeCheck() {
+    const Type *exprType = (expr != NULL) ? expr->GetType() : NULL;
+    if (exprType == NULL)
+        return NULL;
+
+    const Type *toType = NULL;
+    exprType = exprType->GetAsConstType();
+    bool is64bit = (exprType->GetAsUniformType() == 
+                    AtomicType::UniformConstUInt64 ||
+                    exprType->GetAsUniformType() == 
+                    AtomicType::UniformConstInt64);
+
+    if (exprType->IsUniformType()) {
+        if (is64bit) toType = AtomicType::UniformInt64;
+        else         toType = AtomicType::UniformInt32;
+    }
+    else {
+        if (is64bit) toType = AtomicType::VaryingInt64;
+        else         toType = AtomicType::VaryingInt32;
+    }
+
+    expr = TypeConvertExpr(expr, toType, "switch expression");
+    if (expr == NULL)
+        return NULL;
+
+    return this;
+}
+
+/** Returns COST_VARYING_SWITCH for a varying expression, otherwise COST_UNIFORM_SWITCH. */
+int
+SwitchStmt::EstimateCost() const {
+    const Type *type = (expr != NULL) ? expr->GetType() : NULL;
+    if (type && type->IsVaryingType())
+        return COST_VARYING_SWITCH;
+    else
+        return COST_UNIFORM_SWITCH;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // ReturnStmt

@@ -1915,14 +2450,137 @@ ReturnStmt::Print(int indent) const {
 }


+///////////////////////////////////////////////////////////////////////////
+// GotoStmt
+
+GotoStmt::GotoStmt(const char *l, SourcePos gotoPos, SourcePos ip) 
+    : Stmt(gotoPos) {
+    label = l;           // name of the label to jump to
+    identifierPos = ip;  // position used when reporting an unknown label
+}
+
+/** Emits a branch to the label's basic block; reports an error if the goto is illegal here. */
+void
+GotoStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (ctx->VaryingCFDepth() > 0) {
+        Error(pos, "\"goto\" statements are only legal under \"uniform\" "
+              "control flow.");
+        return;
+    }
+    if (ctx->InForeachLoop()) {
+        Error(pos, "\"goto\" statements are currently illegal inside "
+              "\"foreach\" loops.");
+        return;
+    }
+
+    llvm::BasicBlock *bb = ctx->GetLabeledBasicBlock(label);
+    if (bb == NULL) {
+        // TODO: use the string distance stuff to suggest alternatives if
+        // there are some with names close to the label name we have here..
+        Error(identifierPos, "No label named \"%s\" found in current function.",
+              label.c_str());
+        return;
+    }
+    // Jump to the label, then leave no current basic block after the goto.
+    ctx->BranchInst(bb);
+    ctx->SetCurrentBasicBlock(NULL);
+}
+
+/** Prints the name of the label this goto targets. */
+void
+GotoStmt::Print(int indent) const {
+    printf("%*cGoto label \"%s\"\n", indent, ' ', label.c_str());
+}
+
+/** No optimization is applied to a goto; returns this unchanged. */
+Stmt *
+GotoStmt::Optimize() {
+    return this;
+}
+
+/** Nothing is checked here; legality is checked in EmitCode().  Returns this. */
+Stmt *
+GotoStmt::TypeCheck() {
+    return this;
+}
+
+/** A goto is assigned the fixed cost COST_GOTO. */
+int
+GotoStmt::EstimateCost() const {
+    return COST_GOTO;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// LabeledStmt
+
+LabeledStmt::LabeledStmt(const char *n, Stmt *s, SourcePos p) 
+    : Stmt(p) {
+    name = n;  // label name
+    stmt = s;  // statement that follows the label
+}
+
+/** Switches emission to the basic block for this label and emits the labeled statement there. */
+void
+LabeledStmt::EmitCode(FunctionEmitContext *ctx) const {
+    llvm::BasicBlock *bblock = ctx->GetLabeledBasicBlock(name);
+    Assert(bblock != NULL);
+
+    // End the current basic block with a jump to our basic block and then
+    // set things up for emission to continue there.  Note that the current
+    // basic block may validly be NULL going into this statement due to an
+    // earlier goto that NULLed it out; that doesn't stop us from
+    // re-establishing a current basic block starting at the label..
+    if (ctx->GetCurrentBasicBlock() != NULL)
+        ctx->BranchInst(bblock);
+    ctx->SetCurrentBasicBlock(bblock);
+
+    if (stmt != NULL)
+        stmt->EmitCode(ctx);
+}
+
+/** Prints the label name, then the labeled statement (if any) at the same indent. */
+void
+LabeledStmt::Print(int indent) const {
+    printf("%*cLabel \"%s\"\n", indent, ' ', name.c_str());
+    if (stmt != NULL)
+        stmt->Print(indent);
+}
+
+/** No optimization is applied at the label; returns this unchanged. */
+Stmt *
+LabeledStmt::Optimize() {
+    return this;
+}
+
+/** Checks that the label name is a valid identifier; reports an error and returns NULL if not. */
+Stmt *
+LabeledStmt::TypeCheck() {
+    if (!isalpha(name[0]) && name[0] != '_') {  // first char: a letter or '_'
+        Error(pos, "Label must start with either alphabetic character or '_'.");
+        return NULL;
+    }
+    for (unsigned int i = 1; i < name.size(); ++i) {
+        if (!isalnum(name[i]) && name[i] != '_') {
+            Error(pos, "Character \"%c\" is illegal in labels.", name[i]);
+            return NULL;
+        }
+    }
+    return this;
+}
+
+/** The label itself is assigned a cost of zero. */
+int
+LabeledStmt::EstimateCost() const {
+    return 0;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // StmtList

 void
 StmtList::EmitCode(FunctionEmitContext *ctx) const {
-    if (!ctx->GetCurrentBasicBlock()) 
-        return;
-
    ctx->StartScope();
    ctx->SetDebugPos(pos);
    for (unsigned int i = 0; i < stmts.size(); ++i)
@@ -2020,7 +2678,7 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
        baseType == AtomicType::UniformUInt16) {
        expr = new TypeCastExpr(type->IsUniformType() ? AtomicType::UniformInt32 :
                                                        AtomicType::VaryingInt32, 
-                                expr, false, expr->pos);
+                                expr, expr->pos);
        type = expr->GetType();
    }
        
@@ -2173,16 +2831,6 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
                    m->module->getFunction("__do_assert_varying");
    Assert(assertFunc != NULL);

-#ifdef ISPC_IS_WINDOWS
-    char errorString[2048];
-    if (sprintf_s(errorString, sizeof(errorString),
-                  "%s(%d): Assertion failed: %s\n", pos.name,
-                  pos.first_line, message.c_str()) == -1) {
-        Error(pos, "Fatal error in sprintf_s() call when generating assert "
-              "string.");
-        return;
-    }
-#else
    char *errorString;
    if (asprintf(&errorString, "%s:%d:%d: Assertion failed: %s\n", 
                 pos.name, pos.first_line, pos.first_column, 
@@ -2191,7 +2839,6 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
              "unable to allocate memory!");
        return;
    }
-#endif

    std::vector<llvm::Value *> args;
    args.push_back(ctx->GetStringPtr(errorString));
@@ -2199,9 +2846,7 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
    args.push_back(ctx->GetFullMask());
    ctx->CallInst(assertFunc, NULL, args, "");

-#ifndef ISPC_IS_WINDOWS
    free(errorString);
-#endif // !ISPC_IS_WINDOWS
 }


@@ -2223,7 +2868,7 @@ AssertStmt::TypeCheck() {
        }
        expr = new TypeCastExpr(isUniform ? AtomicType::UniformBool : 
                                            AtomicType::VaryingBool, 
-                                expr, false, expr->pos);
+                                expr, expr->pos);
        expr = ::TypeCheck(expr);
    }
    return this;
--- a/stmt.h
+++ b/stmt.h
@@ -282,6 +282,97 @@ public:
 };


+/** Statement corresponding to a "case" label in the program.  In addition
+    to the value associated with the "case", this statement also stores the
+    statements following it. */
+class CaseStmt : public Stmt {
+public:
+    CaseStmt(int value, Stmt *stmt, SourcePos pos);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *TypeCheck();
+    int EstimateCost() const;
+
+    /** Integer value after the "case" statement */
+    const int value;
+    Stmt *stmts;  /**< Statements following the label; EmitCode() checks it for NULL. */
+};
+
+
+/** Statement for a "default" label (as would be found inside a "switch"
+    statement). */
+class DefaultStmt : public Stmt {
+public:
+    DefaultStmt(Stmt *stmt, SourcePos pos);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *TypeCheck();
+    int EstimateCost() const;
+
+    Stmt *stmts;  /**< Statements following the label; EmitCode() checks it for NULL. */
+};
+
+
+/** A "switch" statement in the program. */
+class SwitchStmt : public Stmt {
+public:
+    SwitchStmt(Expr *expr, Stmt *stmts, SourcePos pos);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *TypeCheck();         // converts expr to a 32- or 64-bit integer type
+    int EstimateCost() const;  // cost depends on whether expr is varying
+
+    /** Expression that is used to determine which label to jump to. */
+    Expr *expr;
+    /** Statement block after the "switch" expression. */
+    Stmt *stmts;
+};
+
+
+/** A "goto" in an ispc program. */
+class GotoStmt : public Stmt {
+public:
+    GotoStmt(const char *label, SourcePos gotoPos, SourcePos idPos);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+    int EstimateCost() const;
+
+    /** Name of the label to jump to when the goto is executed. */
+    std::string label;
+    SourcePos identifierPos;  /**< Position used when reporting an unknown label. */
+};
+
+
+/** Statement corresponding to a label (as would be used as a goto target)
+    in the program. */
+class LabeledStmt : public Stmt {
+public:
+    LabeledStmt(const char *label, Stmt *stmt, SourcePos p);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();         // validates the characters of the label name
+    int EstimateCost() const;
+
+    /** Name of the label. */
+    std::string name;
+    /** Statements following the label. */
+    Stmt *stmt;
+};
+
+
 /** @brief Representation of a list of statements in the program.
 */
 class StmtList : public Stmt {
--- a/sym.cpp
+++ b/sym.cpp
@@ -72,8 +72,7 @@ SymbolTable::SymbolTable() {

 SymbolTable::~SymbolTable() {
    // Otherwise we have mismatched push/pop scopes
-    Assert(variables.size() == 1 && functions.size() == 1 &&
-           types.size() == 1);
+    Assert(variables.size() == 1 && types.size() == 1);
    PopScope();
 }

@@ -81,7 +80,6 @@ SymbolTable::~SymbolTable() {
 void
 SymbolTable::PushScope() { 
    variables.push_back(new SymbolMapType);
-    functions.push_back(new FunctionMapType);
    types.push_back(new TypeMapType);
 }

@@ -92,10 +90,6 @@ SymbolTable::PopScope() {
    delete variables.back();
    variables.pop_back();

-    Assert(functions.size() > 1);
-    delete functions.back();
-    functions.pop_back();
-
    Assert(types.size() > 1);
    delete types.back();
    types.pop_back();
@@ -160,7 +154,7 @@ SymbolTable::AddFunction(Symbol *symbol) {
        // the symbol table
        return false;

-    std::vector<Symbol *> &funOverloads = (*functions.back())[symbol->name];
+    std::vector<Symbol *> &funOverloads = functions[symbol->name];
    funOverloads.push_back(symbol);
    return true;
 }
@@ -168,17 +162,14 @@ SymbolTable::AddFunction(Symbol *symbol) {

 bool
 SymbolTable::LookupFunction(const char *name, std::vector<Symbol *> *matches) {
-    for (int i = (int)functions.size() - 1; i >= 0; --i) {
-        FunctionMapType &fm = *(functions[i]);
-        FunctionMapType::iterator iter = fm.find(name);
-        if (iter != fm.end()) {
-            if (matches == NULL)
-                return true;
-            else {
-                const std::vector<Symbol *> &funcs = iter->second;
-                for (int j = 0; j < (int)funcs.size(); ++j)
-                    matches->push_back(funcs[j]);
-            }
+    FunctionMapType::iterator iter = functions.find(name);
+    if (iter != functions.end()) {
+        if (matches == NULL)
+            return true;
+        else {
+            const std::vector<Symbol *> &funcs = iter->second;
+            for (int j = 0; j < (int)funcs.size(); ++j)
+                matches->push_back(funcs[j]);
        }
    }
    return matches ? (matches->size() > 0) : false;
@@ -187,15 +178,12 @@ SymbolTable::LookupFunction(const char *name, std::vector<Symbol *> *matches) {

 Symbol *
 SymbolTable::LookupFunction(const char *name, const FunctionType *type) {
-    for (int i = (int)functions.size() - 1; i >= 0; --i) {
-        FunctionMapType &fm = *(functions[i]);
-        FunctionMapType::iterator iter = fm.find(name);
-        if (iter != fm.end()) {
-            std::vector<Symbol *> funcs = iter->second;
-            for (int j = 0; j < (int)funcs.size(); ++j) {
-                if (Type::Equal(funcs[j]->type, type))
-                    return funcs[j];
-            }
+    FunctionMapType::iterator iter = functions.find(name);
+    if (iter != functions.end()) {
+        std::vector<Symbol *> funcs = iter->second;
+        for (int j = 0; j < (int)funcs.size(); ++j) {
+            if (Type::Equal(funcs[j]->type, type))
+                return funcs[j];
        }
    }
    return NULL;
@@ -261,14 +249,11 @@ SymbolTable::ClosestVariableOrFunctionMatch(const char *str) const {
        }
    }

-    for (int i = 0; i < (int)functions.size(); ++i) {
-        const FunctionMapType &fm = *(functions[i]);
-        FunctionMapType::const_iterator iter;
-        for (iter = fm.begin(); iter != fm.end(); ++iter) {
-            int dist = StringEditDistance(str, iter->first, maxDelta+1);
-            if (dist <= maxDelta)
-                matches[dist].push_back(iter->first);
-        }
+    FunctionMapType::const_iterator iter;
+    for (iter = functions.begin(); iter != functions.end(); ++iter) {
+        int dist = StringEditDistance(str, iter->first, maxDelta+1);
+        if (dist <= maxDelta)
+            matches[dist].push_back(iter->first);
    }

    // Now, return the first entry of matches[] that is non-empty, if any.
@@ -346,15 +331,13 @@ SymbolTable::Print() {
    }

    fprintf(stderr, "Functions:\n----------------\n");
-    for (int i = 0; i < (int)functions.size(); ++i) {
-        FunctionMapType::iterator fiter = functions[i]->begin();
-        while (fiter != functions[i]->end()) {
-            fprintf(stderr, "%s\n", fiter->first.c_str());
-            std::vector<Symbol *> &syms = fiter->second;
-            for (unsigned int j = 0; j < syms.size(); ++j)
-                fprintf(stderr, "    %s\n", syms[j]->type->GetString().c_str());
-            ++fiter;
-        }
+    FunctionMapType::iterator fiter = functions.begin();
+    while (fiter != functions.end()) {
+        fprintf(stderr, "%s\n", fiter->first.c_str());
+        std::vector<Symbol *> &syms = fiter->second;
+        for (unsigned int j = 0; j < syms.size(); ++j)
+            fprintf(stderr, "    %s\n", syms[j]->type->GetString().c_str());
+        ++fiter;
    }

    depth = 0;
--- a/sym.h
+++ b/sym.h
@@ -257,12 +257,13 @@ private:
    typedef std::map<std::string, Symbol *> SymbolMapType;
    std::vector<SymbolMapType *> variables;

-    /** Function declarations are also scoped., A STL \c vector is used to
-        store the function symbols for a given name since, due to function
-        overloading, a name can have multiple function symbols associated
-        with it. */
+    /** Function declarations are *not* scoped.  (C99, for example, allows
+        an implementation to maintain function declarations in a single
+        namespace.)  A STL \c vector is used to store the function symbols
+        for a given name since, due to function overloading, a name can
+        have multiple function symbols associated with it. */
    typedef std::map<std::string, std::vector<Symbol *> > FunctionMapType;
-    std::vector<FunctionMapType *> functions;
+    FunctionMapType functions;

    /** Type definitions can also be scoped.  A new \c TypeMapType
        is added to the back of the \c types \c vector each time a new scope
@@ -278,15 +279,12 @@ SymbolTable::GetMatchingFunctions(Predicate pred,
                                  std::vector<Symbol *> *matches) const {
    // Iterate through all function symbols and apply the given predicate.
    // If it returns true, add the Symbol * to the provided vector.
-    for (unsigned int i = 0; i < functions.size(); ++i) {
-        FunctionMapType &fm = *(functions[i]);
-        FunctionMapType::const_iterator iter;
-        for (iter = fm.begin(); iter != fm.end(); ++iter) {
-            const std::vector<Symbol *> &syms = iter->second;
-            for (unsigned int j = 0; j < syms.size(); ++j) {
-                if (pred(syms[j]))
-                    matches->push_back(syms[j]);
-            }
+    FunctionMapType::const_iterator iter;
+    for (iter = functions.begin(); iter != functions.end(); ++iter) {
+        const std::vector<Symbol *> &syms = iter->second;
+        for (unsigned int j = 0; j < syms.size(); ++j) {
+            if (pred(syms[j]))
+                matches->push_back(syms[j]);
        }
    }
 }
--- a/test_static.cpp
+++ b/test_static.cpp
@@ -46,7 +46,6 @@
 #include <assert.h>
 #include <string.h>
 #include <stdio.h>
-#include <assert.h>
 #include <stdint.h>
 #ifdef ISPC_IS_LINUX
 #include <malloc.h>
--- a/tests/atomics-swap.ispc
+++ b/tests/atomics-swap.ispc
@@ -0,0 +1,17 @@
+// Test: odd-indexed program instances swap their programIndex into the global "s" via atomic_swap_global().
+export uniform int width() { return programCount; }
+
+uniform int32 s = 1234;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    if (programIndex & 1) {
+        b = atomic_swap_global(&s, programIndex);  // NOTE(review): expected value below implies each swap returns the prior contents of s -- confirm
+    }
+    RET[programIndex] = reduce_add(b) + s;  // sum of the values swapped out plus whatever is left in s
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);  // initial s plus the sum of the odd lane indices
+}
--- a/tests/goto-1.ispc
+++ b/tests/goto-1.ispc
@@ -0,0 +1,17 @@
+// Test: an unconditional "goto" skips the statement between it and its label.
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0.; b = a; 
+    RET[programIndex] = a+b; 
+    goto skip;
+    RET[programIndex] = 0;  // expected result below requires this store to be skipped
+ skip:
+    ;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 + 2*programIndex;  // a+b with b == a; implies aFOO[i] == i+1
+}
--- a/tests/goto-2.ispc
+++ b/tests/goto-2.ispc
@@ -0,0 +1,18 @@
+// Test: "goto" guarded by all(a != 0); the expected result implies the jump is taken.
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0.; b = a; 
+    RET[programIndex] = a+b; 
+    if (all(a != 0))
+        goto skip;
+    RET[programIndex] = 0;  // expected result below requires this store to be skipped
+ skip:
+    ;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 + 2*programIndex;  // a+b with b == a; implies aFOO[i] == i+1
+}
--- a/tests/goto-3.ispc
+++ b/tests/goto-3.ispc
@@ -0,0 +1,18 @@
+// Test: "goto" guarded by all(a == 0); the expected result implies the jump is NOT taken.
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0.; b = a; 
+    RET[programIndex] = a+b; 
+    if (all(a == 0))
+        goto skip;
+    RET[programIndex] = 0;  // expected result below requires this store to execute
+ skip:
+    ;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // overwritten value, since the goto above is not taken
+}
--- a/tests/goto-4.ispc
+++ b/tests/goto-4.ispc
@@ -0,0 +1,19 @@
+// Test: a backward "goto" forms a loop that repeats while any lane still has a != 0.
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0.; b = a; 
+    RET[programIndex] = 0; 
+ encore:
+    ++RET[programIndex];  // counts how many times the labeled statement is reached
+    if (any(a != 0)) {
+        a = max(a-1, 0);  // count each lane down toward zero, clamping at zero
+        goto encore;
+    }
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programCount+1;  // first pass plus one per decrement; implies the largest aFOO value is programCount
+}
--- a/tests/half-3.ispc
+++ b/tests/half-3.ispc
@@ -0,0 +1,21 @@
+// Test: round-trips 16-bit patterns through half_to_float()/float_to_half() and counts mismatches.
+export uniform int width() { return programCount; }
+
+export void f_v(uniform float RET[]) {
+    int errors = 0;
+
+    foreach (i = 0 ... 65535) {  // NOTE(review): if foreach's upper bound is exclusive, pattern 65535 is never visited -- confirm
+        unsigned int16 h = i;
+        float f = half_to_float(i);
+        h = float_to_half(f);
+
+        int mismatches = (f == f && i != h);  // f == f is false for NaN, so NaN patterns are not counted
+        errors += reduce_add(mismatches);
+    }
+
+    RET[programIndex] = errors;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // no round-trip mismatches expected
+}
--- a/tests/kilo-mega-giga-1.ispc
+++ b/tests/kilo-mega-giga-1.ispc
@@ -0,0 +1,13 @@
+// Test: the "k" integer-literal suffix; the expected result treats 1k as 1024.
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    a *= 1k;
+    RET[programIndex] = a; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1024*(programIndex+1);  // implies aFOO[i] == i+1
+}
--- a/tests/kilo-mega-giga-2.ispc
+++ b/tests/kilo-mega-giga-2.ispc
@@ -0,0 +1,12 @@
+// Test: the "M" integer-literal suffix; the expected result treats 2M as 2*1024*1024.
+export uniform int width() { return programCount; }
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = b + 2M;
+    RET[programIndex] = a;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2*1024*1024 + 5;  // implies b == 5
+}
--- a/tests/kilo-mega-giga-3.ispc
+++ b/tests/kilo-mega-giga-3.ispc
@@ -0,0 +1,14 @@
+// Test: the "G" integer-literal suffix in unsigned 32-bit arithmetic; 3G - 2G - 1024M must be 0.
+export uniform int width() { return programCount; }
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    unsigned int32 a = 3G;
+    a -= 2G;
+    a -= 1024M;  // expected result of 0 implies 1G == 1024M
+    RET[programIndex] = a;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/ptr-assign-lhs-math-1.ispc
+++ b/tests/ptr-assign-lhs-math-1.ispc
@@ -0,0 +1,16 @@
+// Test: assignment through a dereferenced pointer-arithmetic expression, *(ptr+1), with a uniform pointer.
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float a[programCount];
+    a[programIndex] = aFOO[programIndex];
+
+    uniform float * uniform ptr = a;
+    *(ptr+1) = 0;  // expected result below shows this zeroes only a[1]
+    RET[programIndex] = a[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1+programIndex;  // untouched elements; implies aFOO[i] == i+1
+    RET[1] = 0;
+}
--- a/tests/ptr-assign-lhs-math-2.ispc
+++ b/tests/ptr-assign-lhs-math-2.ispc
@@ -0,0 +1,15 @@
+// Test: assignment through *(ptr+programIndex) with a varying pointer; every element is expected to be zeroed.
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float a[programCount];
+    a[programIndex] = aFOO[programIndex];
+
+    uniform float * varying ptr = a;
+    *(ptr+programIndex) = 0;  // each lane writes its own element a[programIndex]
+    RET[programIndex] = a[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/switch-1.ispc
+++ b/tests/switch-1.ispc
@@ -0,0 +1,18 @@
+// Test: switch on the uniform b with "default" listed before the matching "case 5".
+export uniform int width() { return programCount; }
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    switch (b) {
+    default:
+        RET[programIndex] = -1; 
+        break;
+    case 5:
+        RET[programIndex] = 0; 
+    }
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // the "case 5" value; implies b == 5
+}
--- a/tests/switch-10.ispc
+++ b/tests/switch-10.ispc
@@ -0,0 +1,44 @@
+// Test: per-lane switch on a with a nested switch on a+b inside "case 1".
+export uniform int width() { return programCount; }
+
+int switchit(int a, uniform int b) {
+    switch (a) {
+    case 3:
+        return 1;
+    case 7:
+    case 6:
+    case 4:
+    case 5:
+        if (a & 1)
+            break;  // odd a (5, 7): leaves the switch and reaches "return 3"
+        return 2;
+    case 1: {
+        switch (a+b) {
+        case 6:
+            return 42;
+        default:
+            break;
+        }
+        return -1234;
+    }
+    case 32:
+        *((int *)NULL) = 0;  // NOTE(review): presumably a tripwire for executing an untaken case -- confirm
+    default:
+        return 0;
+    }
+    return 3;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int x = switchit(a, b);
+    RET[programIndex] = x; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // expected values below imply aFOO[i] == i+1 and b == 5
+    RET[0] = 42;  // a == 1: inner switch sees a+b == 6
+    RET[2] = 1;  // a == 3
+    RET[6] = RET[4] = 3;  // a == 7, 5
+    RET[5] = RET[3] = 2;  // a == 6, 4
+}
--- a/tests/switch-11.ispc
+++ b/tests/switch-11.ispc
@@ -0,0 +1,50 @@
+// Test: nested switch whose matching inner case ("case 6") breaks out under a uniform condition.
+export uniform int width() { return programCount; }
+
+int switchit(int a, uniform int b) {
+    switch (a) {
+    case 3:
+        return 1;
+    case 7:
+    case 6:
+    case 4:
+    case 5:
+        if (a & 1)
+            break;  // odd a (5, 7): leaves the switch and reaches "return 3"
+        return 2;
+    case 1: {
+        switch (a+b) {
+        case 60:
+            return -1234;
+        default:
+            break;
+        case 6:
+            if (b == 5)
+                break;  // leaves only the inner switch, so "return 42" runs next
+            return -42;
+        case 12:
+            return -1;
+        }
+        return 42;
+    }
+    case 32:
+        *((int *)NULL) = 0;  // NOTE(review): presumably a tripwire for executing an untaken case -- confirm
+    default:
+        return 0;
+    }
+    return 3;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int x = switchit(a, b);
+    RET[programIndex] = x; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // expected values below imply aFOO[i] == i+1 and b == 5
+    RET[0] = 42;  // a == 1: inner "case 6" breaks, then "return 42"
+    RET[2] = 1;  // a == 3
+    RET[6] = RET[4] = 3;  // a == 7, 5
+    RET[5] = RET[3] = 2;  // a == 6, 4
+}
--- a/tests/switch-12.ispc
+++ b/tests/switch-12.ispc
@@ -0,0 +1,54 @@
+// Test: a "for" loop with its own "break" nested inside an inner switch case.
+export uniform int width() { return programCount; }
+
+int switchit(int a, uniform int b) {
+    switch (a) {
+    case 3:
+        return 1;
+    case 7:
+    case 6:
+    case 4:
+    case 5:
+        if (a & 1)
+            break;  // odd a (5, 7): leaves the switch and reaches "return 3"
+        return 2;
+    case 1: {
+        switch (a+b) {
+        case 60:
+            return -1234;
+        default:
+            break;
+        case 6:
+            int count = 0;
+            for (count = 0; count < 10; ++count) {
+                a += b;
+                if (a == 11)
+                    break;  // expected RET[0] == 11 implies this exits the loop, not the switch
+            }
+            return a;
+        case 12:
+            return -1;
+        }
+        return 42;
+    }
+    case 32:
+        *((int *)NULL) = 0;  // NOTE(review): presumably a tripwire for executing an untaken case -- confirm
+    default:
+        return 0;
+    }
+    return 3;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int x = switchit(a, b);
+    RET[programIndex] = x; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // expected values below imply aFOO[i] == i+1 and b == 5
+    RET[0] = 11;  // a == 1: loop adds b twice (1 -> 6 -> 11), then returns a
+    RET[2] = 1;  // a == 3
+    RET[6] = RET[4] = 3;  // a == 7, 5
+    RET[5] = RET[3] = 2;  // a == 6, 4
+}
--- a/tests/switch-13.ispc
+++ b/tests/switch-13.ispc
@@ -0,0 +1,28 @@
+// Test: switch on the uniform b containing a "break" under a per-lane condition (a & 1).
+export uniform int width() { return programCount; }
+
+int switchit(int a, uniform int b) {
+    int r = -1;
+    switch (b) {
+    case 5:
+        if (a & 1) {
+            r=3;
+            break;  // only lanes with odd a leave the switch here
+        }
+        r= 2;
+        break;
+    default:
+        r= 3;
+    }
+    return r;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int x = switchit(a, b);
+    RET[programIndex] = x; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programIndex & 1) ? 2 : 3;  // implies b == 5 and aFOO[i] == i+1 (odd index => even a => 2)
+}
--- a/tests/switch-14.ispc
+++ b/tests/switch-14.ispc
@@ -0,0 +1,24 @@
+// Test: switch on the uniform b mixing a per-lane "break" with "return" statements inside the cases.
+export uniform int width() { return programCount; }
+
+int switchit(int a, uniform int b) {
+    switch (b) {
+    case 5:
+        if (a & 1)
+            break;  // lanes with odd a leave the switch and reach "return 3"
+        return 2;
+    default:
+        return 42;
+    }
+    return 3;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int x = switchit(a, b);
+    RET[programIndex] = x; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programIndex & 1) ? 2 : 3;  // implies b == 5 and aFOO[i] == i+1 (odd index => even a => 2)
+}
--- a/tests/switch-2.ispc
+++ b/tests/switch-2.ispc
@@ -0,0 +1,17 @@
+// Test: switch on the uniform b where "default" has no "break" and falls through into "case 5".
+export uniform int width() { return programCount; }
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    switch (b) {
+    default:
+        RET[programIndex] = -1; 
+    case 5:
+        RET[programIndex] = 0;  // reached either directly or by fall-through, so RET ends up 0 on both paths
+    }
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
--- a/tests/switch-3.ispc
+++ b/tests/switch-3.ispc
@@ -0,0 +1,18 @@
+// Test: switch on the uniform b with the matching "case 5" first and a "break" before "default".
+export uniform int width() { return programCount; }
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    switch (b) {
+    case 5:
+        RET[programIndex] = 0; 
+        break;  // expected result below requires that "default" is not run afterwards
+    default:
+        RET[programIndex] = -1; 
+    }
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // the "case 5" value; implies b == 5
+}
--- a/tests/switch-4.ispc
+++ b/tests/switch-4.ispc
@@ -0,0 +1,24 @@
+// Test: switch on the per-lane value a, with each case assigning to a local that is returned afterwards.
+export uniform int width() { return programCount; }
+
+int switchit(int a, uniform int b) {
+    int r = 0;
+    switch (a) {
+    case 3:
+        r = 1;
+        break;
+    default:
+        r = 0;
+    }
+    return r;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int x = switchit(a, b);
+    RET[programIndex] = x; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programIndex == 2) ? 1 : 0;  // only the lane with a == 3 hits "case 3"; implies aFOO[i] == i+1
+}
--- a/tests/switch-5.ispc
+++ b/tests/switch-5.ispc
@@ -0,0 +1,22 @@
+// Test: switch on the per-lane value a where every case returns directly; no statement follows the switch.
+export uniform int width() { return programCount; }
+
+int switchit(int a, uniform int b) {
+    int r = 0;  // never read in this function
+    switch (a) {
+    case 3:
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int x = switchit(a, b);
+    RET[programIndex] = x; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programIndex == 2) ? 1 : 0;  // only the lane with a == 3 hits "case 3"; implies aFOO[i] == i+1
+}
--- a/tests/switch-6.ispc
+++ b/tests/switch-6.ispc
@@ -0,0 +1,27 @@
+// Test: per-lane switch where "case 7" either breaks (uniform test b == 5) or falls through into "default".
+export uniform int width() { return programCount; }
+
+int switchit(int a, uniform int b) {
+    switch (a) {
+    case 3:
+        return 1;
+    case 7:
+        if (b == 5)
+            break;  // taken when b == 5: skips "default" and reaches "return -1"
+    default:
+        return 0;
+    }
+    return -1;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int x = switchit(a, b);
+    RET[programIndex] = x; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // expected values below imply aFOO[i] == i+1 and b == 5
+    RET[2] = 1;  // a == 3
+    RET[6] = -1;  // a == 7
+}
--- a/tests/switch-7.ispc
+++ b/tests/switch-7.ispc
@@ -0,0 +1,32 @@
+// Test: per-lane switch with several case labels (7, 6, 4, 5) sharing one body that breaks or returns per lane.
+export uniform int width() { return programCount; }
+
+int switchit(int a, uniform int b) {
+    switch (a) {
+    case 3:
+        return 1;
+    case 7:
+    case 6:
+    case 4:
+    case 5:
+        if (a & 1)
+            break;  // odd a (5, 7): leaves the switch and reaches "return 3"
+        return 2;
+    default:
+        return 0;
+    }
+    return 3;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int x = switchit(a, b);
+    RET[programIndex] = x; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // expected values below imply aFOO[i] == i+1
+    RET[2] = 1;  // a == 3
+    RET[6] = RET[4] = 3;  // a == 7, 5
+    RET[5] = RET[3] = 2;  // a == 6, 4
+}
--- a/tests/switch-8.ispc
+++ b/tests/switch-8.ispc
@@ -0,0 +1,36 @@
+// Test: per-lane switch where "default" shares its body with "case 1" and "case 2", after an unmatched "case 32".
+export uniform int width() { return programCount; }
+
+int switchit(int a, uniform int b) {
+    switch (a) {
+    case 3:
+        return 1;
+    case 7:
+    case 6:
+    case 4:
+    case 5:
+        if (a & 1)
+            break;  // odd a (5, 7): leaves the switch and reaches "return 3"
+        return 2;
+    case 32:
+        *((int *)NULL) = 0;  // NOTE(review): presumably a tripwire for executing an untaken case -- confirm
+    default:
+    case 1:
+    case 2:
+        return 0;
+    }
+    return 3;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int x = switchit(a, b);
+    RET[programIndex] = x; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // expected values below imply aFOO[i] == i+1
+    RET[2] = 1;  // a == 3
+    RET[6] = RET[4] = 3;  // a == 7, 5
+    RET[5] = RET[3] = 2;  // a == 6, 4
+}
--- a/tests/switch-9.ispc
+++ b/tests/switch-9.ispc
@@ -0,0 +1,34 @@
+// Test: per-lane switch with an unmatched "case 32" (NULL write) placed directly before "default".
+export uniform int width() { return programCount; }
+
+int switchit(int a, uniform int b) {
+    switch (a) {
+    case 3:
+        return 1;
+    case 7:
+    case 6:
+    case 4:
+    case 5:
+        if (a & 1)
+            break;  // odd a (5, 7): leaves the switch and reaches "return 3"
+        return 2;
+    case 32:
+        *((int *)NULL) = 0;  // NOTE(review): presumably a tripwire for executing an untaken case -- confirm
+    default:
+        return 0;
+    }
+    return 3;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int x = switchit(a, b);
+    RET[programIndex] = x; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // expected values below imply aFOO[i] == i+1
+    RET[2] = 1;  // a == 3
+    RET[6] = RET[4] = 3;  // a == 7, 5
+    RET[5] = RET[3] = 2;  // a == 6, 4
+}
--- a/tests_errors/fptr-typecheck-2.ispc
+++ b/tests_errors/fptr-typecheck-2.ispc
@@ -1,4 +1,4 @@
-// Can't convert argument of type "void * const uniform" to type "float" for funcion call argument.
+// Can't convert argument of type "void * uniform" to type "float" for function call argument.

 float bar(float a, float b);

--- a/tests_errors/goto-1.ispc
+++ b/tests_errors/goto-1.ispc
@@ -0,0 +1,10 @@
+// Multiple labels named "label" in function
+
+void func(int x) {
+ label:
+    ;
+ label:  // second definition of the same label name within func()
+     ;
+}
+
+
--- a/tests_errors/goto-2.ispc
+++ b/tests_errors/goto-2.ispc
@@ -0,0 +1,11 @@
+// "goto" statements are only legal under "uniform" control flow
+
+void func(int x) {
+    if (x < 0)  // NOTE(review): x carries no "uniform" qualifier, so presumably this "if" is not uniform control flow -- confirm
+        goto label;
+
+ label:
+        ;
+}
+
+
--- a/tests_errors/goto-3.ispc
+++ b/tests_errors/goto-3.ispc
@@ -0,0 +1,11 @@
+// "goto" statements are only legal under "uniform" control flow
+
+void func(int x) {
+    cif (x < 0)  // same shape as an "if" on x, but using "cif"; NOTE(review): x carries no "uniform" qualifier -- confirm it is varying
+        goto label;
+
+ label:
+        ;
+}
+
+
--- a/tests_errors/goto-4.ispc
+++ b/tests_errors/goto-4.ispc
@@ -0,0 +1,10 @@
+// "goto" statements are only legal under "uniform" control flow
+
+void func(int x) {
+ label:
+
+    for(int i =0 ;i<x;)  // NOTE(review): loop test depends on x, which carries no "uniform" qualifier -- presumably non-uniform control flow
+        goto label;
+}
+
+
--- a/tests_errors/lvalue-1.ispc
+++ b/tests_errors/lvalue-1.ispc
@@ -1,4 +1,4 @@
-// Left hand side of assignment statement can't be assigned to
+// Left hand side of assignment expression can't be assigned to

 int foo() {return 2;}

--- a/tests_errors/lvalue-2.ispc
+++ b/tests_errors/lvalue-2.ispc
@@ -1,4 +1,4 @@
-// Left hand side of assignment statement can't be assigned to
+// Can't assign to type "const uniform int32" on left-hand side of expression

 int bar(){ 
    4 = 0;
--- a/tests_errors/lvalue-3.ispc
+++ b/tests_errors/lvalue-3.ispc
@@ -1,4 +1,4 @@
-// Left hand side of assignment statement can't be assigned to
+// Can't assign to type "const uniform int32" on left-hand side of expression

 int bar(){ 
    int x;
--- a/tests_errors/switch-1.ispc
+++ b/tests_errors/switch-1.ispc
@@ -0,0 +1,9 @@
+// Case statement value must be a compile-time integer constant
+
+void foo(float f) {
+    switch (f) {
+    case 1.5:  // non-integer case value
+        ++f;
+    }
+}
+
--- a/tests_errors/switch-2.ispc
+++ b/tests_errors/switch-2.ispc
@@ -0,0 +1,12 @@
+// Duplicate case value "1"
+
+void foo(float f) {
+    switch (f) {
+    case 1:
+        ++f;
+    case 2:
+    case 1:  // repeats the value of the first case label in this switch
+        f = 0;
+    }
+}
+
--- a/tests_errors/switch-3.ispc
+++ b/tests_errors/switch-3.ispc
@@ -0,0 +1,13 @@
+// "case" label illegal outside of "switch" statement
+
+void foo(float f) {
+    switch (f) {
+    case 1:
+        ++f;
+    case 2:
+        f = 0;
+    }
+ case 3:  // appears after the switch's closing brace, i.e. outside any switch
+     --f;
+}
+
--- a/tests_errors/switch-4.ispc
+++ b/tests_errors/switch-4.ispc
@@ -0,0 +1,13 @@
+// "default" label illegal outside of "switch" statement
+
+void foo(float f) {
+  default:  // appears before the switch begins, i.e. outside any switch
+     ++f;
+    switch (f) {
+    case 1:
+        ++f;
+    case 2:
+        f = 0;
+    }
+}
+
--- a/tests_errors/switch-5.ispc
+++ b/tests_errors/switch-5.ispc
@@ -0,0 +1,14 @@
+// "default" label illegal outside of "switch" statement
+
+void foo(float f) {
+  default:  // appears before the switch begins, i.e. outside any switch
+     ++f;
+    switch (f) {
+    case 1:
+        ++f;
+        continue;  // NOTE(review): also not inside any loop; the first-line message names only the "default" problem -- confirm intent
+    case 2:
+        f = 0;
+    }
+}
+
--- a/tests_errors/switch-6.ispc
+++ b/tests_errors/switch-6.ispc
@@ -0,0 +1,12 @@
+// "continue" statement illegal outside of for/while/do/foreach loops
+
+void foo(float f) {
+    switch (f) {
+    case 1:
+        ++f;
+        continue;  // inside a switch but not inside any loop in this function
+    case 2:
+        f = 0;
+    }
+}
+
--- a/type.cpp
+++ b/type.cpp
--- a/type.h
+++ b/type.h
@@ -78,20 +78,44 @@ public:
    /** Returns true if the underlying type is a float or integer type. */
    bool IsNumericType() const { return IsFloatType() || IsIntType(); }

+    /** Types may have uniform, varying, or not-yet-determined variability;
+        this enumerant is used by Type implementations to record their
+        variability. */
+    enum Variability {
+        Uniform,  ///< Reported by IsUniformType().
+        Varying,  ///< Reported by IsVaryingType().
+        Unbound   ///< Not yet determined; reported by HasUnboundVariability().
+    };
+
+    /** Returns the variability of the type. */
+    virtual Variability GetVariability() const = 0;
+
    /** Returns true if the underlying type is uniform */
-    virtual bool IsUniformType() const = 0;
+    bool IsUniformType() const { return GetVariability() == Uniform; }

    /** Returns true if the underlying type is varying */
-    bool IsVaryingType() const { return !IsUniformType(); }
+    bool IsVaryingType() const { return GetVariability() == Varying; }
+
+    /** Returns true if the underlying type's uniform/varying-ness is
+        unbound. */
+    bool HasUnboundVariability() const { return GetVariability() == Unbound; }
+
+    /** Returns a type wherein any elements of the original type and
+        contained types that have unbound variability have their variability
+        set to the given variability. */
+    virtual const Type *ResolveUnboundVariability(Variability v) const = 0;

    /** Return a "uniform" instance of this type.  If the type is already
        uniform, its "this" pointer will be returned. */
    virtual const Type *GetAsUniformType() const = 0;

    /** Return a "varying" instance of this type.  If the type is already
-        uniform, its "this" pointer will be returned. */
+        varying, its "this" pointer will be returned. */
    virtual const Type *GetAsVaryingType() const = 0;

+    /** Get an instance of the type with unbound variability. */
+    virtual const Type *GetAsUnboundVariabilityType() const = 0;
+
    /** If this is a signed integer type, return the unsigned version of
        the type.  Otherwise, return the original type. */
    virtual const Type *GetAsUnsignedType() const;
@@ -185,7 +209,8 @@ public:
 */
 class AtomicType : public Type {
 public:
-    bool IsUniformType() const;
+    Variability GetVariability() const;
+
    bool IsBoolType() const;
    bool IsFloatType() const;
    bool IsIntType() const;
@@ -195,8 +220,10 @@ public:
    /** For AtomicTypes, the base type is just the same as the AtomicType
        itself. */
    const AtomicType *GetBaseType() const;
-    const AtomicType *GetAsVaryingType() const;
    const AtomicType *GetAsUniformType() const;
+    const AtomicType *GetAsVaryingType() const;
+    const AtomicType *GetAsUnboundVariabilityType() const;
+    const AtomicType *ResolveUnboundVariability(Variability v) const;
    const AtomicType *GetAsUnsignedType() const;
    const Type *GetSOAType(int width) const;
    const AtomicType *GetAsConstType() const;
@@ -224,38 +251,45 @@ public:
        TYPE_INT64,
        TYPE_UINT64,
        TYPE_DOUBLE,
+        NUM_BASIC_TYPES
    };

    const BasicType basicType;

-    static const AtomicType *UniformBool, *VaryingBool;
-    static const AtomicType *UniformInt8, *VaryingInt8;
-    static const AtomicType *UniformInt16, *VaryingInt16;
-    static const AtomicType *UniformInt32, *VaryingInt32;
-    static const AtomicType *UniformUInt8, *VaryingUInt8;
-    static const AtomicType *UniformUInt16, *VaryingUInt16;
-    static const AtomicType *UniformUInt32, *VaryingUInt32;
-    static const AtomicType *UniformFloat, *VaryingFloat;
-    static const AtomicType *UniformInt64, *VaryingInt64;
-    static const AtomicType *UniformUInt64, *VaryingUInt64;
-    static const AtomicType *UniformDouble, *VaryingDouble;
-    static const AtomicType *UniformConstBool, *VaryingConstBool;
-    static const AtomicType *UniformConstInt8, *VaryingConstInt8;
-    static const AtomicType *UniformConstInt16, *VaryingConstInt16;
-    static const AtomicType *UniformConstInt32, *VaryingConstInt32;
-    static const AtomicType *UniformConstUInt8, *VaryingConstUInt8;
-    static const AtomicType *UniformConstUInt16, *VaryingConstUInt16;
-    static const AtomicType *UniformConstUInt32, *VaryingConstUInt32;
-    static const AtomicType *UniformConstFloat, *VaryingConstFloat;
-    static const AtomicType *UniformConstInt64, *VaryingConstInt64;
-    static const AtomicType *UniformConstUInt64, *VaryingConstUInt64;
-    static const AtomicType *UniformConstDouble, *VaryingConstDouble;
+    static const AtomicType *UniformBool, *VaryingBool, *UnboundBool;
+    static const AtomicType *UniformInt8, *VaryingInt8, *UnboundInt8;
+    static const AtomicType *UniformInt16, *VaryingInt16, *UnboundInt16;
+    static const AtomicType *UniformInt32, *VaryingInt32, *UnboundInt32;
+    static const AtomicType *UniformUInt8, *VaryingUInt8, *UnboundUInt8;
+    static const AtomicType *UniformUInt16, *VaryingUInt16, *UnboundUInt16;
+    static const AtomicType *UniformUInt32, *VaryingUInt32, *UnboundUInt32;
+    static const AtomicType *UniformFloat, *VaryingFloat, *UnboundFloat;
+    static const AtomicType *UniformInt64, *VaryingInt64, *UnboundInt64;
+    static const AtomicType *UniformUInt64, *VaryingUInt64, *UnboundUInt64;
+    static const AtomicType *UniformDouble, *VaryingDouble, *UnboundDouble;
+    static const AtomicType *UniformConstBool, *VaryingConstBool, *UnboundConstBool;
+    static const AtomicType *UniformConstInt8, *VaryingConstInt8, *UnboundConstInt8;
+    static const AtomicType *UniformConstInt16, *VaryingConstInt16, *UnboundConstInt16;
+    static const AtomicType *UniformConstInt32, *VaryingConstInt32, *UnboundConstInt32;
+    static const AtomicType *UniformConstUInt8, *VaryingConstUInt8, *UnboundConstUInt8;
+    static const AtomicType *UniformConstUInt16, *VaryingConstUInt16, *UnboundConstUInt16;
+    static const AtomicType *UniformConstUInt32, *VaryingConstUInt32, *UnboundConstUInt32;
+    static const AtomicType *UniformConstFloat, *VaryingConstFloat, *UnboundConstFloat;
+    static const AtomicType *UniformConstInt64, *VaryingConstInt64, *UnboundConstInt64;
+    static const AtomicType *UniformConstUInt64, *VaryingConstUInt64, *UnboundConstUInt64;
+    static const AtomicType *UniformConstDouble, *VaryingConstDouble, *UnboundConstDouble;
    static const AtomicType *Void;

+    /** This function must be called before any of the above static const
+        AtomicType values is used; in practice, we do it early in
+        main(). */
+    static void Init();
+
 private:
-    const bool isUniform;
+    static const AtomicType *typeTable[NUM_BASIC_TYPES][3][2];
+    const Variability variability;
    const bool isConst;
-    AtomicType(BasicType basicType, bool isUniform, bool isConst);
+    AtomicType(BasicType basicType, Variability v, bool isConst);
 };


@@ -268,7 +302,8 @@ public:
    /** Constructor for named enumerated types */
    EnumType(const char *name, SourcePos pos);

-    bool IsUniformType() const;
+    Variability GetVariability() const;
+
    bool IsBoolType() const;
    bool IsFloatType() const;
    bool IsIntType() const;
@@ -278,6 +313,8 @@ public:
    const EnumType *GetBaseType() const;
    const EnumType *GetAsVaryingType() const;
    const EnumType *GetAsUniformType() const;
+    const EnumType *GetAsUnboundVariabilityType() const;
+    const EnumType *ResolveUnboundVariability(Variability v) const;
    const Type *GetSOAType(int width) const;
    const EnumType *GetAsConstType() const;
    const EnumType *GetAsNonConstType() const;
@@ -300,15 +337,17 @@ public:

 private:
    const std::string name;
-    bool isUniform, isConst;
+    Variability variability;
+    bool isConst;
    std::vector<Symbol *> enumerators;
 };

+
 /** @brief Type implementation for pointers to other types
 */
 class PointerType : public Type {
 public:
-    PointerType(const Type *t, bool isUniform, bool isConst);
+    PointerType(const Type *t, Variability v, bool isConst);

    /** Helper method to return a uniform pointer to the given type. */
    static PointerType *GetUniform(const Type *t);
@@ -318,7 +357,8 @@ public:
    /** Returns true if the given type is a void * type. */
    static bool IsVoidPointer(const Type *t);

-    bool IsUniformType() const;
+    Variability GetVariability() const;
+
    bool IsBoolType() const;
    bool IsFloatType() const;
    bool IsIntType() const;
@@ -328,6 +368,8 @@ public:
    const Type *GetBaseType() const;
    const PointerType *GetAsVaryingType() const;
    const PointerType *GetAsUniformType() const;
+    const PointerType *GetAsUnboundVariabilityType() const;
+    const PointerType *ResolveUnboundVariability(Variability v) const;
    const Type *GetSOAType(int width) const;
    const PointerType *GetAsConstType() const;
    const PointerType *GetAsNonConstType() const;
@@ -342,7 +384,8 @@ public:
    static PointerType *Void;

 private:
-    const bool isUniform, isConst;
+    const Variability variability;
+    const bool isConst;
    const Type *baseType;
 };

@@ -408,7 +451,8 @@ public:
     */
    ArrayType(const Type *elementType, int numElements);

-    bool IsUniformType() const;
+    Variability GetVariability() const;
+
    bool IsBoolType() const;
    bool IsFloatType() const;
    bool IsIntType() const;
@@ -418,6 +462,9 @@ public:
    const Type *GetBaseType() const;
    const ArrayType *GetAsVaryingType() const;
    const ArrayType *GetAsUniformType() const;
+    const ArrayType *GetAsUnboundVariabilityType() const;
+    const ArrayType *ResolveUnboundVariability(Variability v) const;
+
    const ArrayType *GetAsUnsignedType() const;
    const Type *GetSOAType(int width) const;
    const ArrayType *GetAsConstType() const;
@@ -495,6 +542,9 @@ public:

    const SOAArrayType *GetAsVaryingType() const;
    const SOAArrayType *GetAsUniformType() const;
+    const SOAArrayType *GetAsUnboundVariabilityType() const;
+    const SOAArrayType *ResolveUnboundVariability(Variability v) const;
+
    const Type *GetSOAType(int width) const;
    const SOAArrayType *GetAsConstType() const;
    const SOAArrayType *GetAsNonConstType() const;
@@ -536,7 +586,8 @@ class VectorType : public SequentialType {
 public:
    VectorType(const AtomicType *base, int size);

-    bool IsUniformType() const;
+    Variability GetVariability() const;
+
    bool IsBoolType() const;
    bool IsFloatType() const;
    bool IsIntType() const;
@@ -546,6 +597,9 @@ public:
    const Type *GetBaseType() const;
    const VectorType *GetAsVaryingType() const;
    const VectorType *GetAsUniformType() const;
+    const VectorType *GetAsUnboundVariabilityType() const;
+    const VectorType *ResolveUnboundVariability(Variability v) const;
+
    const Type *GetSOAType(int width) const;
    const VectorType *GetAsConstType() const;
    const VectorType *GetAsNonConstType() const;
@@ -580,9 +634,10 @@ public:
    StructType(const std::string &name, const std::vector<const Type *> &elts, 
               const std::vector<std::string> &eltNames, 
               const std::vector<SourcePos> &eltPositions, bool isConst, 
-               bool isUniform, SourcePos pos);
+               Variability variability, SourcePos pos);
+
+    Variability GetVariability() const;

-    bool IsUniformType() const;
    bool IsBoolType() const;
    bool IsFloatType() const;
    bool IsIntType() const;
@@ -592,6 +647,9 @@ public:
    const Type *GetBaseType() const;
    const StructType *GetAsVaryingType() const;
    const StructType *GetAsUniformType() const;
+    const StructType *GetAsUnboundVariabilityType() const;
+    const StructType *ResolveUnboundVariability(Variability v) const;
+
    const Type *GetSOAType(int width) const;
    const StructType *GetAsConstType() const;
    const StructType *GetAsNonConstType() const;
@@ -641,7 +699,7 @@ private:
    /** Source file position at which each structure element declaration
        appeared. */
    const std::vector<SourcePos> elementPositions;
-    const bool isUniform;
+    const Variability variability;
    const bool isConst;
    const SourcePos pos;
 };
@@ -653,7 +711,8 @@ class ReferenceType : public Type {
 public:
    ReferenceType(const Type *targetType);

-    bool IsUniformType() const;
+    Variability GetVariability() const;
+
    bool IsBoolType() const;
    bool IsFloatType() const;
    bool IsIntType() const;
@@ -664,6 +723,9 @@ public:
    const Type *GetReferenceTarget() const;
    const ReferenceType *GetAsVaryingType() const;
    const ReferenceType *GetAsUniformType() const;
+    const ReferenceType *GetAsUnboundVariabilityType() const;
+    const ReferenceType *ResolveUnboundVariability(Variability v) const;
+
    const Type *GetSOAType(int width) const;
    const ReferenceType *GetAsConstType() const;
    const ReferenceType *GetAsNonConstType() const;
@@ -696,13 +758,14 @@ public:
    FunctionType(const Type *returnType, 
                 const std::vector<const Type *> &argTypes, SourcePos pos);
    FunctionType(const Type *returnType, 
-                 const std::vector<const Type *> &argTypes, SourcePos pos,
+                 const std::vector<const Type *> &argTypes,
                 const std::vector<std::string> &argNames,
                 const std::vector<ConstExpr *> &argDefaults,
                 const std::vector<SourcePos> &argPos,
                 bool isTask, bool isExported, bool isExternC);

-    bool IsUniformType() const;
+    Variability GetVariability() const;
+
    bool IsBoolType() const;
    bool IsFloatType() const;
    bool IsIntType() const;
@@ -712,6 +775,9 @@ public:
    const Type *GetBaseType() const;
    const Type *GetAsVaryingType() const;
    const Type *GetAsUniformType() const;
+    const Type *GetAsUnboundVariabilityType() const;
+    const FunctionType *ResolveUnboundVariability(Variability v) const;
+
    const Type *GetSOAType(int width) const;
    const Type *GetAsConstType() const;
    const Type *GetAsNonConstType() const;
@@ -752,6 +818,7 @@ public:

 private:
    const Type * const returnType;
+
    // The following four vectors should all have the same length (which is
    // in turn the length returned by GetNumParameters()).
    const std::vector<const Type *> paramTypes;
--- a/util.cpp
+++ b/util.cpp
@@ -39,6 +39,9 @@
 #include "module.h"
 #ifdef ISPC_IS_WINDOWS
 #include <shlwapi.h>
+#ifdef __MINGW32__
+#include <malloc.h> // for alloca()
+#endif
 #else
 #include <alloca.h>
 #endif
@@ -75,7 +78,7 @@ lTerminalWidth() {
    HANDLE h = GetStdHandle(STD_OUTPUT_HANDLE);
    if (h == INVALID_HANDLE_VALUE || h == NULL)
        return 80;
-    CONSOLE_SCREEN_BUFFER_INFO bufferInfo = { 0 };
+    CONSOLE_SCREEN_BUFFER_INFO bufferInfo = { {0} };
    GetConsoleScreenBufferInfo(h, &bufferInfo);
    return bufferInfo.dwSize.X;
 #else
@@ -187,6 +190,32 @@ lPrintWithWordBreaks(const char *buf, int columnWidth, FILE *out) {
 }


+#ifdef ISPC_IS_WINDOWS
+// We cover for the lack of vasprintf and asprintf on Windows (also covers MinGW).
+int
+vasprintf(char **sptr, const char *fmt, va_list argv)
+{
+    int wanted = vsnprintf(*sptr = NULL, 0, fmt, argv); // sizing pass: NULL buffer, size 0
+    if((wanted < 0) || ((*sptr = (char*)malloc( 1 + wanted )) == NULL)) // one extra byte for the terminator
+        return -1; // *sptr is NULL on this path
+    // NOTE(review): argv is reused below without va_copy(); presumably fine on the Windows ABIs targeted here -- confirm.
+    return vsprintf(*sptr, fmt, argv);
+}
+
+
+int // passes through vasprintf()'s return value (-1 when it fails)
+asprintf(char **sptr, const char *fmt, ...) // printf-style front end for vasprintf()
+{
+    int retval;
+    va_list argv;
+    va_start(argv, fmt); // pick up the arguments that follow fmt
+    retval = vasprintf(sptr, fmt, argv); // *sptr receives the malloc()ed string
+    va_end(argv);
+    return retval;
+}
+#endif
+
+
 /** Helper function for Error(), Warning(), etc.

    @param type   The type of message being printed (e.g. "Warning")
@@ -197,30 +226,6 @@ lPrintWithWordBreaks(const char *buf, int columnWidth, FILE *out) {
 */
 static void
 lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
-#ifdef ISPC_IS_WINDOWS
-    char errorBuf[2048], formattedBuf[2048];
-    if (vsnprintf_s(errorBuf, sizeof(errorBuf), _TRUNCATE, fmt, args) == -1) {
-        fprintf(stderr, "vsnprintf_s() error!\n");
-        return;
-    }
-
-    if (p.first_line == 0) {
-        // We don't have a valid SourcePos, so create a message without it
-        if (sprintf_s(formattedBuf, sizeof(formattedBuf), "%s: %s\n", 
-                       type, errorBuf) == -1) {
-            fprintf(stderr, "vsnprintf_s() error!\n");
-            exit(1);
-        }
-    }
-    else {
-        // Create an error message that includes the file and line number
-        if (sprintf_s(formattedBuf, sizeof(formattedBuf), "%s(%d): %s: %s\n", 
-                      p.name, p.first_line, type, errorBuf) == -1) {
-            fprintf(stderr, "vsnprintf_s() error!\n");
-            exit(1);
-        }
-    }
-#else
    char *errorBuf, *formattedBuf;
    if (vasprintf(&errorBuf, fmt, args) == -1) {
        fprintf(stderr, "vasprintf() unable to allocate memory!\n");
@@ -241,7 +246,6 @@ lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
            exit(1);
        }
    }
-#endif

    // Now that we've done all that work, see if we've already printed the
    // exact same error message.  If so, return, so we don't redundantly
@@ -254,10 +258,8 @@ lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
    lPrintWithWordBreaks(formattedBuf, lTerminalWidth(), stderr);
    lPrintFileLineContext(p);

-#ifndef ISPC_IS_WINDOWS
    free(errorBuf);
    free(formattedBuf);
-#endif // !ISPC_IS_WINDOWS
 }


--- a/util.h
+++ b/util.h
@@ -40,6 +40,9 @@
 #define ISPC_UTIL_H

 #include "ispc.h"
+#ifdef ISPC_IS_WINDOWS
+#include <stdarg.h>
+#endif

 struct SourcePos;

@@ -62,6 +65,12 @@ inline uint32_t RoundUpPow2(uint32_t v) {
 #define PRINTF_FUNC
 #endif // __GNUG__

+// For cross-platform compatibility: Windows builds get their own vasprintf()/asprintf() (defined in util.cpp).
+#ifdef ISPC_IS_WINDOWS
+int vasprintf(char **sptr, const char *fmt, va_list argv);
+int  asprintf(char **sptr, const char *fmt, ...);
+#endif
+
 /** Prints a debugging message.  These messages are only printed if
    g->debugPrint is \c true.  In addition to a program source code
    position to associate with the message, a printf()-style format string