Compare commits

82 commits:

a9ec745275, c2ecc15b93, 83c8650b36, 89cb809922, fdb4eaf437, 0432f97555,
8d1631b714, dac091552d, ea027a95a8, f73abb05a7, d71c49494f, 25665f0841,
1eec27f890, 950f86200b, e19f4931d1, 0575b1f38d, f6cd01f7cf, f2fbc168af,
b50f6f1730, f8a7120d9c, 20dbf59420, c67a286aa6, c96fef6bc8, bba02f87ea,
12dc3f5c28, 0f01a5dcbe, 664dc3bdda, bdba3cd97d, d9c0f9315a, b7f17d435f,
37cdc18639, 5893a9c49d, 24f58fa16a, 56ffc78fa4, 061e68bc77, 177e6312b4,
1acf4032c2, 9c5444698e, 65f3252760, e612abe4ba, 34352e4e0e, 1867b5b317,
a5b7fca7e0, 7be2c399b1, d6337b3b22, d2f8b0ace5, d805e8b183, 1f0f2ec05f,
91ac3b9d7c, d65bf2eb2f, 1bba9d4307, 4388338dad, 2fb59c90cf, 68f6ea8def,
3f89295d10, 748b292e77, 6451c3d99d, d14a2de168, 642150095d, 3bf3ac7922,
c6d1cebad4, 08189ce08c, 7013d7d52f, 7045b76f84, 58a0b4a20d, 0f8eee9809,
0740299860, 652215861e, 602209e5a8, b60f8b4f70, b67446d998, 9670ab0887,
0223bb85ee, fd81255db1, 8a8e1a7f73, ef05fbf424, fa01b63fa5, 63d3d25030,
a8db866228, 0519eea951, 5d67252ed0, 59f4c9985e
**Makefile** (9 changes)

```diff
@@ -58,11 +58,7 @@ LDFLAGS=
 ifeq ($(ARCH_OS),Linux)
   # try to link everything statically under Linux (including libstdc++) so
   # that the binaries we generate will be portable across distributions...
-  ifeq ($(ARCH_TYPE),x86_64)
-    LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
-  else
-    LDFLAGS=-L/usr/lib/gcc/i686-redhat-linux/4.6.0
-  endif
+  LDFLAGS=-static
 endif
 
 LEX=flex
@@ -75,7 +71,8 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16
+TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
+	generic-16 generic-1
 BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
 	builtins/dispatch.ll
 BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
```
```diff
@@ -47,7 +47,7 @@ remarkable `LLVM Compiler Infrastructure <http://llvm.org>`_ for back-end
 code generation and optimization and is `hosted on
 github <http://github.com/ispc/ispc/>`_.  It supports Windows, Mac, and
 Linux, with both x86 and x86-64 targets.  It currently supports the SSE2,
-SSE4, and AVX instruction sets.
+SSE4, AVX1, and AVX2 instruction sets.
 
 Features
 --------
```
**ast.cpp** (134 changes)

```diff
@@ -90,11 +90,15 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
     DoStmt *dos;
     ForStmt *fs;
     ForeachStmt *fes;
+    CaseStmt *cs;
+    DefaultStmt *defs;
+    SwitchStmt *ss;
     ReturnStmt *rs;
     LabeledStmt *ls;
     StmtList *sl;
     PrintStmt *ps;
     AssertStmt *as;
+    DeleteStmt *dels;
 
     if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
         es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
@@ -131,6 +135,14 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
                                    postFunc, data);
         fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
     }
+    else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
+        cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
+    else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
+        defs->stmts = (Stmt *)WalkAST(defs->stmts, preFunc, postFunc, data);
+    else if ((ss = dynamic_cast<SwitchStmt *>(node)) != NULL) {
+        ss->expr = (Expr *)WalkAST(ss->expr, preFunc, postFunc, data);
+        ss->stmts = (Stmt *)WalkAST(ss->stmts, preFunc, postFunc, data);
+    }
     else if (dynamic_cast<BreakStmt *>(node) != NULL ||
              dynamic_cast<ContinueStmt *>(node) != NULL ||
              dynamic_cast<GotoStmt *>(node) != NULL) {
@@ -149,6 +161,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         ps->values = (Expr *)WalkAST(ps->values, preFunc, postFunc, data);
     else if ((as = dynamic_cast<AssertStmt *>(node)) != NULL)
         as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
+    else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
+        dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
     else
         FATAL("Unhandled statement type in WalkAST()");
 }
@@ -169,6 +183,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
     DereferenceExpr *dre;
     SizeOfExpr *soe;
     AddressOfExpr *aoe;
+    NewExpr *newe;
 
     if ((ue = dynamic_cast<UnaryExpr *>(node)) != NULL)
         ue->expr = (Expr *)WalkAST(ue->expr, preFunc, postFunc, data);
@@ -212,6 +227,12 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
     else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
         aoe->expr = (Expr *)WalkAST(aoe->expr, preFunc, postFunc, data);
+    else if ((newe = dynamic_cast<NewExpr *>(node)) != NULL) {
+        newe->countExpr = (Expr *)WalkAST(newe->countExpr, preFunc,
+                                          postFunc, data);
+        newe->initExpr = (Expr *)WalkAST(newe->initExpr, preFunc,
+                                         postFunc, data);
+    }
     else if (dynamic_cast<SymbolExpr *>(node) != NULL ||
              dynamic_cast<ConstExpr *>(node) != NULL ||
              dynamic_cast<FunctionSymbolExpr *>(node) != NULL ||
@@ -294,3 +315,116 @@ EstimateCost(ASTNode *root) {
     return cost;
 }
 
+
+/** Given an AST node, check to see if it's safe if we happen to run the
+    code for that node with the execution mask all off.
+ */
+static bool
+lCheckAllOffSafety(ASTNode *node, void *data) {
+    bool *okPtr = (bool *)data;
+
+    if (dynamic_cast<FunctionCallExpr *>(node) != NULL) {
+        // FIXME: If we could somehow determine that the function being
+        // called was safe (and all of the args Exprs were safe), then it'd
+        // be nice to be able to return true here.  (Consider a call to
+        // e.g. floatbits() in the stdlib.)  Unfortunately for now we just
+        // have to be conservative.
+        *okPtr = false;
+        return false;
+    }
+
+    if (dynamic_cast<AssertStmt *>(node) != NULL) {
+        // While it's fine to run the assert for varying tests, it's not
+        // desirable to check an assert on a uniform variable if all of the
+        // lanes are off.
+        *okPtr = false;
+        return false;
+    }
+
+    if (dynamic_cast<NewExpr *>(node) != NULL ||
+        dynamic_cast<DeleteStmt *>(node) != NULL) {
+        // We definitely don't want to run the uniform variants of these if
+        // the mask is all off.  It's also worth skipping the overhead of
+        // executing the varying versions of them in the all-off mask case.
+        *okPtr = false;
+        return false;
+    }
+
+    if (g->target.allOffMaskIsSafe == true)
+        // Don't worry about memory accesses if we have a target that can
+        // safely run them with the mask all off
+        return true;
+
+    IndexExpr *ie;
+    if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
+        const Type *type = ie->baseExpr->GetType();
+        if (type == NULL)
+            return true;
+        if (dynamic_cast<const ReferenceType *>(type) != NULL)
+            type = type->GetReferenceTarget();
+
+        ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
+        if (ce == NULL) {
+            // indexing with a variable... -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        const PointerType *pointerType =
+            dynamic_cast<const PointerType *>(type);
+        if (pointerType != NULL) {
+            // pointer[index] -> can't be sure -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        const SequentialType *seqType =
+            dynamic_cast<const SequentialType *>(type);
+        Assert(seqType != NULL);
+        int nElements = seqType->GetElementCount();
+        if (nElements == 0) {
+            // Unsized array, so we can't be sure -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        int32_t indices[ISPC_MAX_NVEC];
+        int count = ce->AsInt32(indices);
+        for (int i = 0; i < count; ++i) {
+            if (indices[i] < 0 || indices[i] >= nElements) {
+                // Index is out of bounds -> not safe
+                *okPtr = false;
+                return false;
+            }
+        }
+
+        // All indices are in-bounds
+        return true;
+    }
+
+    MemberExpr *me;
+    if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
+        me->dereferenceExpr) {
+        *okPtr = false;
+        return false;
+    }
+
+    DereferenceExpr *de;
+    if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
+        const Type *exprType = de->expr->GetType();
+        if (dynamic_cast<const PointerType *>(exprType) != NULL) {
+            *okPtr = false;
+            return false;
+        }
+    }
+
+    return true;
+}
+
+
+bool
+SafeToRunWithMaskAllOff(ASTNode *root) {
+    bool safe = true;
+    WalkAST(root, lCheckAllOffSafety, NULL, &safe);
+    return safe;
+}
```
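`SafeToRunWithMaskAllOff()` is a thin wrapper over the `WalkAST()` traversal: the pre-callback runs on every node, writes its verdict through the opaque `data` pointer, and returns `false` to stop descending once an unsafe node is found. A minimal sketch of the same pattern with a hypothetical callback (`lCountNodes` and `CountASTNodes` are illustrations, not part of the ispc sources):

```cpp
// Hypothetical example of the WalkAST() callback pattern used above: the
// pre-callback receives each node plus an opaque data pointer and returns
// true to keep descending into that node's children.
static bool
lCountNodes(ASTNode *node, void *data) {
    int *count = (int *)data;
    ++(*count);
    return true;  // unlike lCheckAllOffSafety, never cut the walk short
}

int
CountASTNodes(ASTNode *root) {
    int count = 0;
    WalkAST(root, lCountNodes, NULL, &count);  // NULL: no post-callback needed
    return count;
}
```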
**ast.h** (4 changes)

```diff
@@ -144,4 +144,8 @@ extern Stmt *TypeCheck(Stmt *);
     the given root. */
 extern int EstimateCost(ASTNode *root);
 
+/** Returns true if it would be safe to run the given code with an "all
+    off" mask. */
+extern bool SafeToRunWithMaskAllOff(ASTNode *root);
+
 #endif // ISPC_AST_H
```
**bitcode2cpp.py**

```diff
@@ -26,17 +26,21 @@ if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT")
 try:
     as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
 except IOError:
-    print >> sys.stderr, "Couldn't open " + src
+    sys.stderr.write("Couldn't open " + src)
     sys.exit(1)
 
-print "unsigned char builtins_bitcode_" + target + "[] = {"
-for line in as_out.stdout.readlines():
-    length = length + len(line)
-    for c in line:
-        print ord(c)
-        print ", "
-print " 0 };\n\n"
-print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"
+width = 16;
+sys.stdout.write("unsigned char builtins_bitcode_" + target + "[] = {\n")
+
+data = as_out.stdout.read()
+for i in range(0, len(data), 1):
+    sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))
+
+    if i%width == (width-1):
+        sys.stdout.write("\n")
+
+sys.stdout.write("0x00 };\n\n")
+sys.stdout.write("int builtins_bitcode_" + target + "_length = " + str(i+1) + ";\n")
 
 as_out.wait()
```
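For reference, this is the shape of the C source the script emits: a byte array of the target's LLVM bitcode plus its length, which `DefineStdlib()` in builtins.cpp later hands to `AddBitcodeToModule()`. The snippet below is an illustration only, for a hypothetical "sse4" target, with placeholder byte values and length:

```cpp
/* Illustrative output only (byte values and length are placeholders).
   The first four bytes shown are the LLVM bitcode magic 'B' 'C' 0xC0 0xDE. */
unsigned char builtins_bitcode_sse4[] = {
0x42, 0x43, 0xC0, 0xDE, 0x21, 0x0C, 0x00, 0x00, 0xE5, 0x01, 0x00, 0x00, 0x0B, 0x82, 0x20, 0x00,
/* ...sixteen bytes per line, as the new width = 16 loop writes... */
0x00 };
int builtins_bitcode_sse4_length = 4096;  /* placeholder byte count */
```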
**builtins.cpp** (59 changes)

```diff
@@ -386,10 +386,13 @@ lSetInternalFunctions(llvm::Module *module) {
         "__ceil_uniform_float",
         "__ceil_varying_double",
         "__ceil_varying_float",
+        "__clock",
         "__count_trailing_zeros_i32",
         "__count_trailing_zeros_i64",
         "__count_leading_zeros_i32",
         "__count_leading_zeros_i64",
+        "__delete_uniform",
+        "__delete_varying",
         "__do_assert_uniform",
         "__do_assert_varying",
         "__do_print",
@@ -448,6 +451,9 @@ lSetInternalFunctions(llvm::Module *module) {
         "__min_varying_uint32",
         "__min_varying_uint64",
         "__movmsk",
+        "__new_uniform",
+        "__new_varying32",
+        "__new_varying64",
         "__num_cores",
         "__packed_load_active",
         "__packed_store_active",
@@ -717,11 +723,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         extern int builtins_bitcode_sse4_x2_length;
         switch (g->target.vectorWidth) {
         case 4:
-            AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length,
+            AddBitcodeToModule(builtins_bitcode_sse4,
+                               builtins_bitcode_sse4_length,
                                module, symbolTable);
             break;
         case 8:
-            AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length,
+            AddBitcodeToModule(builtins_bitcode_sse4_x2,
+                               builtins_bitcode_sse4_x2_length,
                                module, symbolTable);
             break;
         default:
@@ -729,18 +737,39 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         }
         break;
     case Target::AVX:
-    case Target::AVX2:
         switch (g->target.vectorWidth) {
         case 8:
-            extern unsigned char builtins_bitcode_avx[];
-            extern int builtins_bitcode_avx_length;
-            AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length,
+            extern unsigned char builtins_bitcode_avx1[];
+            extern int builtins_bitcode_avx1_length;
+            AddBitcodeToModule(builtins_bitcode_avx1,
+                               builtins_bitcode_avx1_length,
                                module, symbolTable);
             break;
         case 16:
-            extern unsigned char builtins_bitcode_avx_x2[];
-            extern int builtins_bitcode_avx_x2_length;
-            AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
+            extern unsigned char builtins_bitcode_avx1_x2[];
+            extern int builtins_bitcode_avx1_x2_length;
+            AddBitcodeToModule(builtins_bitcode_avx1_x2,
+                               builtins_bitcode_avx1_x2_length,
                                module, symbolTable);
             break;
         default:
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    case Target::AVX2:
+        switch (g->target.vectorWidth) {
+        case 8:
+            extern unsigned char builtins_bitcode_avx2[];
+            extern int builtins_bitcode_avx2_length;
+            AddBitcodeToModule(builtins_bitcode_avx2,
+                               builtins_bitcode_avx2_length,
+                               module, symbolTable);
+            break;
+        case 16:
+            extern unsigned char builtins_bitcode_avx2_x2[];
+            extern int builtins_bitcode_avx2_x2_length;
+            AddBitcodeToModule(builtins_bitcode_avx2_x2,
+                               builtins_bitcode_avx2_x2_length,
+                               module, symbolTable);
+            break;
+        default:
@@ -770,6 +799,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                                builtins_bitcode_generic_16_length,
                                module, symbolTable);
             break;
+        case 1:
+            extern unsigned char builtins_bitcode_generic_1[];
+            extern int builtins_bitcode_generic_1_length;
+            AddBitcodeToModule(builtins_bitcode_generic_1,
+                               builtins_bitcode_generic_1_length,
+                               module, symbolTable);
+            break;
         default:
             FATAL("logic error in DefineStdlib");
         }
@@ -798,11 +834,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
                            symbolTable);
 
+    lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
+                       module, symbolTable);
+
     if (includeStdlibISPC) {
         // If the user wants the standard library to be included, parse the
         // serialized version of the stdlib.ispc file to get its
         // definitions added.
-        if (g->target.isa == Target::GENERIC) {
+        if (g->target.isa == Target::GENERIC&&g->target.vectorWidth!=1) { // 1 wide uses x86 stdlib
            extern char stdlib_generic_code[];
            yy_scan_string(stdlib_generic_code);
            yyparse();
```
**builtins/dispatch.ll**

```diff
@@ -48,23 +48,42 @@ declare void @abort() noreturn
 ;; corresponding to one of the Target::ISA enumerant values that gives the
 ;; most capable ISA that the curremt system can run.
 ;;
-;; #ifdef _MSC_VER
-;; extern void __stdcall __cpuid(int info[4], int infoType);
-;; #else
+;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum
+;; backwards compatibility for anyone building ispc with LLVM 2.9.
+;;
+;; #include <stdint.h>
+;; #include <stdlib.h>
+;;
 ;; static void __cpuid(int info[4], int infoType) {
 ;;     __asm__ __volatile__ ("cpuid"
 ;;                           : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
 ;;                           : "0" (infoType));
 ;; }
-;; #endif
+;;
+;; /* Save %ebx in case it's the PIC register */
+;; static void __cpuid_count(int info[4], int level, int count) {
+;;     __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
+;;                           "cpuid\n\t"
+;;                           "xchg{l}\t{%%}ebx, %1\n\t"
+;;                           : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
+;;                           : "0" (level), "2" (count));
+;; }
 ;;
 ;; int32_t __get_system_isa() {
 ;;     int info[4];
 ;;     __cpuid(info, 1);
 ;;
 ;;     /* NOTE: the values returned below must be the same as the
 ;;        corresponding enumerant values in Target::ISA. */
-;;     if ((info[2] & (1 << 28)) != 0)
-;;         return 2; // AVX
+;;     if ((info[2] & (1 << 28)) != 0) {
+;;         // AVX1 for sure.  Do we have AVX2?
+;;         // Call cpuid with eax=7, ecx=0
+;;         __cpuid_count(info, 7, 0);
+;;         if ((info[1] & (1 << 5)) != 0)
+;;             return 3; // AVX2
+;;         else
+;;             return 2; // AVX1
+;;     }
 ;;     else if ((info[2] & (1 << 19)) != 0)
 ;;         return 1; // SSE4
 ;;     else if ((info[3] & (1 << 26)) != 0)
@@ -76,33 +95,42 @@ declare void @abort() noreturn
 %0 = type { i32, i32, i32, i32 }
 
 define i32 @__get_system_isa() nounwind ssp {
-  %1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
-  %2 = extractvalue %0 %1, 2
-  %3 = extractvalue %0 %1, 3
-  %4 = and i32 %2, 268435456
-  %5 = icmp eq i32 %4, 0
-  br i1 %5, label %6, label %13
+entry:
+  %0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+  %asmresult9.i = extractvalue %0 %0, 2
+  %asmresult10.i = extractvalue %0 %0, 3
+  %and = and i32 %asmresult9.i, 268435456
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %if.else7, label %if.then
 
-; <label>:6                                       ; preds = %0
-  %7 = and i32 %2, 524288
-  %8 = icmp eq i32 %7, 0
-  br i1 %8, label %9, label %13
+if.then:                                          ; preds = %entry
+  %1 = tail call %0 asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
+  %asmresult9.i24 = extractvalue %0 %1, 1
+  %and4 = lshr i32 %asmresult9.i24, 5
+  %2 = and i32 %and4, 1
+  %3 = or i32 %2, 2
+  br label %return
 
-; <label>:9                                       ; preds = %6
-  %10 = and i32 %3, 67108864
-  %11 = icmp eq i32 %10, 0
-  br i1 %11, label %12, label %13
+if.else7:                                         ; preds = %entry
+  %and10 = and i32 %asmresult9.i, 524288
+  %cmp11 = icmp eq i32 %and10, 0
+  br i1 %cmp11, label %if.else13, label %return
 
-; <label>:12                                      ; preds = %9
+if.else13:                                        ; preds = %if.else7
+  %and16 = and i32 %asmresult10.i, 67108864
+  %cmp17 = icmp eq i32 %and16, 0
+  br i1 %cmp17, label %if.else19, label %return
+
+if.else19:                                        ; preds = %if.else13
   tail call void @abort() noreturn nounwind
   unreachable
 
-; <label>:13                                      ; preds = %9, %6, %0
-  %.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
-  ret i32 %.0
+return:                                           ; preds = %if.else13, %if.else7, %if.then
+  %retval.0 = phi i32 [ %3, %if.then ], [ 1, %if.else7 ], [ 0, %if.else13 ]
+  ret i32 %retval.0
 }
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; This function is called by each of the dispatch functions we generate;
 ;; it sets @__system_best_isa if it is unset.
```
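The per-function dispatch trampolines that call `__get_system_isa()` are generated by the compiler and do not appear in this diff. The following C++ sketch shows the control flow the comment above describes, under the assumption that the generated dispatchers follow this shape; the `foo_*` names are hypothetical, and the case values match the 0–3 enumerants returned above:

```cpp
// Assumed shape of a generated dispatcher; foo and foo_* are hypothetical
// per-ISA variants, not symbols from the ispc sources.
extern "C" int __get_system_isa();

static int __system_best_isa = -1;

void foo_sse2(float *d, int n);
void foo_sse4(float *d, int n);
void foo_avx1(float *d, int n);
void foo_avx2(float *d, int n);

void foo(float *d, int n) {
    if (__system_best_isa == -1)              // probe cpuid once, then cache
        __system_best_isa = __get_system_isa();
    switch (__system_best_isa) {
    case 3:  foo_avx2(d, n); break;           // AVX2
    case 2:  foo_avx1(d, n); break;           // AVX1
    case 1:  foo_sse4(d, n); break;           // SSE4
    default: foo_sse2(d, n); break;           // SSE2
    }
}
```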
**builtins/target-avx-x2.ll**

```diff
@@ -170,33 +170,6 @@ define <16 x float> @__min_varying_float(<16 x float>,
 }
 
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
-
-define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
-  ret <16 x i32> %ret
-}
-
-define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-  ret <16 x i32> %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unsigned int min/max
-
-define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
-  ret <16 x i32> %ret
-}
-
-define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-  ret <16 x i32> %ret
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops
 
@@ -622,12 +595,7 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
-
-gen_gather(16, i8)
-gen_gather(16, i16)
-gen_gather(16, i32)
-gen_gather(16, i64)
+;; scatter
 
 gen_scatter(16, i8)
 gen_scatter(16, i16)
```
**builtins/target-avx.ll**

```diff
@@ -170,33 +170,6 @@ define <8 x float> @__min_varying_float(<8 x float>,
 }
 
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
-
-define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
-  ret <8 x i32> %ret
-}
-
-define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-  ret <8 x i32> %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unsigned int min/max
-
-define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
-  ret <8 x i32> %ret
-}
-
-define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-  ret <8 x i32> %ret
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops
 
@@ -403,9 +376,6 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 
-; FIXME: there is no AVX instruction for these, but we could be clever
-; by packing the bits down and setting the last 3/4 or half, respectively,
-; of the mask to zero... Not sure if this would be a win in the end
 gen_masked_store(8, i8, 8)
 gen_masked_store(8, i16, 16)
@@ -520,12 +490,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
-
-gen_gather(8, i8)
-gen_gather(8, i16)
-gen_gather(8, i32)
-gen_gather(8, i64)
+;; scatter
 
 gen_scatter(8, i8)
 gen_scatter(8, i16)
```
**builtins/target-avx1-x2.ll** (new file, 77 lines)

```llvm
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;;   * Redistributions of source code must retain the above copyright
;;     notice, this list of conditions and the following disclaimer.
;;
;;   * Redistributions in binary form must reproduce the above copyright
;;     notice, this list of conditions and the following disclaimer in the
;;     documentation and/or other materials provided with the distribution.
;;
;;   * Neither the name of Intel Corporation nor the names of its
;;     contributors may be used to endorse or promote products derived from
;;     this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

include(`target-avx-x2.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <16 x i32> %ret
}

define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <16 x i32> %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <16 x i32> %ret
}

define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <16 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)
```
**builtins/target-avx1.ll** (new file, 75 lines)

```llvm
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; (same three-clause BSD license header as builtins/target-avx1-x2.ll)

include(`target-avx.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <8 x i32> %ret
}

define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <8 x i32> %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <8 x i32> %ret
}

define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <8 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
```
**builtins/target-avx2-x2.ll** (new file, 129 lines)

```llvm
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; (same three-clause BSD license header as builtins/target-avx1-x2.ll)

include(`target-avx-x2.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly

define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
  ret <16 x i32> %m
}

define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
  ret <16 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly

define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
  ret <16 x i32> %m
}

define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
  ret <16 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
  %r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
  %r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
  %r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %r
}

define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
  %r_0 = shufflevector <16 x float> %v, <16 x float> undef,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
  %r_1 = shufflevector <16 x float> %v, <16 x float> undef,
           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
  %r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %r
}

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %v1 = bitcast i16 %v to <1 x i16>
  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
  %r = extractelement <8 x float> %rv, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %v1 = bitcast float %v to <1 x float>
  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  ; round to nearest even
  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)
```
**builtins/target-avx2.ll** (new file, 110 lines)

```llvm
;; Copyright (c) 2010-2011, Intel Corporation
;; All rights reserved.
;;
;; (same three-clause BSD license header as builtins/target-avx1-x2.ll)

include(`target-avx.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  %m = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %0, <8 x i32> %1)
  ret <8 x i32> %m
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone

define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
  ret <8 x float> %r
}

define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
  ret <8 x i16> %r
}

define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  %v1 = bitcast i16 %v to <1 x i16>
  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
  %r = extractelement <8 x float> %rv, i32 0
  ret float %r
}

define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  %v1 = bitcast float %v to <1 x float>
  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
          <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                     i32 undef, i32 undef, i32 undef, i32 undef>
  ; round to nearest even
  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
  %r = extractelement <8 x i16> %rv, i32 0
  ret i16 %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
```
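The uniform (scalar) half conversions above piggyback on the 8-wide F16C instructions by converting in lane 0 and extracting the result. The same round trip written with compiler intrinsics, as a sketch under the assumption of an F16C-capable toolchain (e.g. compiled with `-mf16c`; not taken from the ispc sources):

```cpp
#include <immintrin.h>

// vcvtph2ps: widen one half-precision value through lane 0 of an 8-wide op.
float half_to_float_uniform(unsigned short h) {
    __m128i v = _mm_cvtsi32_si128(h);   // half value in lane 0, rest zero
    __m256  f = _mm256_cvtph_ps(v);     // 8 halves -> 8 floats
    return _mm256_cvtss_f32(f);         // take lane 0 back out
}

// vcvtps2ph with rounding-mode immediate 0 = round to nearest even,
// matching the i32 0 operand in the IR above.
unsigned short float_to_half_uniform(float f) {
    __m256  v = _mm256_set1_ps(f);      // broadcast, so lane 0 holds f
    __m128i h = _mm256_cvtps_ph(v, 0);  // 8 floats -> 8 halves
    return (unsigned short)_mm_extract_epi16(h, 0);
}
```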
935
builtins/target-generic-1.ll
Executable file
935
builtins/target-generic-1.ll
Executable file
@@ -0,0 +1,935 @@
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Define the standard library builtins for the NOVEC target
|
||||
define(`MASK',`i32')
|
||||
define(`WIDTH',`1')
|
||||
include(`util.m4')
|
||||
; Define some basics for a 1-wide target
|
||||
stdlib_core()
|
||||
packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
aossoa()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
gen_masked_store(1, i8, 8)
|
||||
gen_masked_store(1, i16, 16)
|
||||
gen_masked_store(1, i32, 32)
|
||||
gen_masked_store(1, i64, 64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(1, i8, 8)
|
||||
load_and_broadcast(1, i16, 16)
|
||||
load_and_broadcast(1, i32, 32)
|
||||
load_and_broadcast(1, i64, 64)
|
||||
|
||||
masked_load(1, i8, 8, 1)
|
||||
masked_load(1, i16, 16, 2)
|
||||
masked_load(1, i32, 32, 4)
|
||||
masked_load(1, i64, 64, 8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
; define these with the macros from stdlib.m4
|
||||
|
||||
gen_gather(1, i8)
|
||||
gen_gather(1, i16)
|
||||
gen_gather(1, i32)
|
||||
gen_gather(1, i64)
|
||||
|
||||
gen_scatter(1, i8)
|
||||
gen_scatter(1, i16)
|
||||
gen_scatter(1, i32)
|
||||
gen_scatter(1, i64)
|
||||
|
||||
|
||||
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %mv = trunc <1 x i32> %mask to <1 x i8>
|
||||
; %notmask = xor <1 x i8> %mv, <i8 -1>
|
||||
; %cleared_old = and <1 x i8> %0, %notmask
|
||||
; %masked_new = and <1 x i8> %1, %mv
|
||||
; %new = or <1 x i8> %cleared_old, %masked_new
|
||||
; ret <1 x i8> %new
|
||||
|
||||
; not doing this the easy way because of problems with LLVM's scalarizer
|
||||
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
|
||||
; %sel = select <1 x i1> %cmp, <1 x i8> %0, <1 x i8> %1
|
||||
%m = extractelement <1 x i32> %mask, i32 0
|
||||
%cmp = icmp eq i32 %m, 0
|
||||
%d0 = extractelement <1 x i8> %0, i32 0
|
||||
%d1 = extractelement <1 x i8> %1, i32 0
|
||||
%sel = select i1 %cmp, i8 %d0, i8 %d1
|
||||
%r = insertelement <1 x i8> undef, i8 %sel, i32 0
|
||||
ret <1 x i8> %r
|
||||
}
|
||||
|
||||
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %mv = trunc <1 x i32> %mask to <1 x i16>
|
||||
; %notmask = xor <1 x i16> %mv, <i16 -1>
|
||||
; %cleared_old = and <1 x i16> %0, %notmask
|
||||
; %masked_new = and <1 x i16> %1, %mv
|
||||
; %new = or <1 x i16> %cleared_old, %masked_new
|
||||
; ret <1 x i16> %new
|
||||
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
|
||||
; %sel = select <1 x i1> %cmp, <1 x i16> %0, <1 x i16> %1
|
||||
%m = extractelement <1 x i32> %mask, i32 0
|
||||
%cmp = icmp eq i32 %m, 0
|
||||
%d0 = extractelement <1 x i16> %0, i32 0
|
||||
%d1 = extractelement <1 x i16> %1, i32 0
|
||||
%sel = select i1 %cmp, i16 %d0, i16 %d1
|
||||
%r = insertelement <1 x i16> undef, i16 %sel, i32 0
|
||||
ret <1 x i16> %r
|
||||
|
||||
; ret <1 x i16> %sel
|
||||
}
|
||||
|
||||
|
||||
define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %notmask = xor <1 x i32> %mask, <i32 -1>
|
||||
; %cleared_old = and <1 x i32> %0, %notmask
|
||||
; %masked_new = and <1 x i32> %1, %mask
|
||||
; %new = or <1 x i32> %cleared_old, %masked_new
|
||||
; ret <1 x i32> %new
|
||||
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
|
||||
; %sel = select <1 x i1> %cmp, <1 x i32> %0, <1 x i32> %1
|
||||
; ret <1 x i32> %sel
|
||||
%m = extractelement <1 x i32> %mask, i32 0
|
||||
%cmp = icmp eq i32 %m, 0
|
||||
%d0 = extractelement <1 x i32> %0, i32 0
|
||||
%d1 = extractelement <1 x i32> %1, i32 0
|
||||
%sel = select i1 %cmp, i32 %d0, i32 %d1
|
||||
%r = insertelement <1 x i32> undef, i32 %sel, i32 0
|
||||
ret <1 x i32> %r
|
||||
|
||||
}
|
||||
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %newmask = zext <1 x i32> %mask to <1 x i64>
|
||||
; %notmask = xor <1 x i64> %newmask, <i64 -1>
|
||||
; %cleared_old = and <1 x i64> %0, %notmask
|
||||
; %masked_new = and <1 x i64> %1, %newmask
|
||||
; %new = or <1 x i64> %cleared_old, %masked_new
|
||||
; ret <1 x i64> %new
|
||||
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
|
||||
; %sel = select <1 x i1> %cmp, <1 x i64> %0, <1 x i64> %1
|
||||
; ret <1 x i64> %sel
|
||||
%m = extractelement <1 x i32> %mask, i32 0
|
||||
%cmp = icmp eq i32 %m, 0
|
||||
%d0 = extractelement <1 x i64> %0, i32 0
|
||||
%d1 = extractelement <1 x i64> %1, i32 0
|
||||
%sel = select i1 %cmp, i64 %d0, i64 %d1
|
||||
%r = insertelement <1 x i64> undef, i64 %sel, i32 0
|
||||
ret <1 x i64> %r
|
||||
|
||||
}
|
||||
|
||||
define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
|
||||
<1 x i32> %mask) nounwind readnone alwaysinline {
|
||||
; %v0 = bitcast <1 x float> %0 to <1 x i32>
|
||||
; %v1 = bitcast <1 x float> %1 to <1 x i32>
|
||||
; %r = call <1 x i32> @__vselect_i32(<1 x i32> %v0, <1 x i32> %v1, <1 x i32> %mask)
|
||||
; %rf = bitcast <1 x i32> %r to <1 x float>
|
||||
; ret <1 x float> %rf
|
||||
; %cmp = icmp eq <1 x i32> %mask, <i32 0>
|
||||
; %sel = select <1 x i1> %cmp, <1 x float> %0, <1 x float> %1
|
||||
; ret <1 x float> %sel
|
||||
%m = extractelement <1 x i32> %mask, i32 0
|
||||
%cmp = icmp eq i32 %m, 0
|
||||
%d0 = extractelement <1 x float> %0, i32 0
|
||||
%d1 = extractelement <1 x float> %1, i32 0
|
||||
%sel = select i1 %cmp, float %d0, float %d1
|
||||
%r = insertelement <1 x float> undef, float %sel, i32 0
|
||||
ret <1 x float> %r
|
||||
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i8> * %0, align 4
|
||||
%newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask)
|
||||
store <1 x i8> %newval, <1 x i8> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i16> * %0, align 4
|
||||
%newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
|
||||
store <1 x i16> %newval, <1 x i16> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i32> * %0, align 4
|
||||
%newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask)
|
||||
store <1 x i32> %newval, <1 x i32> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
|
||||
<1 x i32> %mask) nounwind alwaysinline {
|
||||
%val = load <1 x i64> * %0, align 4
|
||||
%newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask)
|
||||
store <1 x i64> %newval, <1 x i64> * %0, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
|
||||
%item = extractelement <1 x i32> %0, i32 0
|
||||
%v = lshr i32 %item, 31
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding
|
||||
;;
|
||||
;; There are not any rounding instructions in SSE2, so we have to emulate
|
||||
;; the functionality with multiple instructions...
|
||||
|
||||
; The code for __round_* is the result of compiling the following source
|
||||
; code.
|
||||
;
|
||||
; export float Round(float x) {
|
||||
; unsigned int sign = signbits(x);
|
||||
; unsigned int ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; x += 0x1.0p23f;
|
||||
; x -= 0x1.0p23f;
|
||||
; ix = intbits(x);
|
||||
; ix ^= sign;
|
||||
; x = floatbits(ix);
|
||||
; return x;
|
||||
;}
|
||||
|
||||
define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
|
||||
%float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
|
||||
%bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
|
||||
%bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
|
||||
%binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
|
||||
%binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
|
||||
%float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
|
||||
%bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
|
||||
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
|
||||
ret <1 x float> %int_to_float_bitcast.i.i.i
|
||||
}
|
||||
|
||||
;; Similarly, for implementations of the __floor* functions below, we have the
|
||||
;; bitcode from compiling the following source code...
|
||||
|
||||
;export float Floor(float x) {
|
||||
; float y = Round(x);
|
||||
; unsigned int cmp = y > x ? 0xffffffff : 0;
|
||||
; float delta = -1.f;
|
||||
; unsigned int idelta = intbits(delta);
|
||||
; idelta &= cmp;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
|
||||
%bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
|
||||
%bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 -1082130432>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
|
||||
%binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <1 x float> %binop.i
|
||||
}
|
||||
|
||||
;; And here is the code we compiled to get the __ceil* functions below
|
||||
;
|
||||
;export uniform float Ceil(uniform float x) {
|
||||
; uniform float y = Round(x);
|
||||
; uniform int yltx = y < x ? 0xffffffff : 0;
|
||||
; uniform float delta = 1.f;
|
||||
; uniform int idelta = intbits(delta);
|
||||
; idelta &= yltx;
|
||||
; delta = floatbits(idelta);
|
||||
; return y + delta;
|
||||
;}
|
||||
|
||||
define <1 x float> @__ceil_varying_float(<1 x float>) nounwind readonly alwaysinline {
|
||||
%calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
|
||||
%bincmp.i = fcmp olt <1 x float> %calltmp.i, %0
|
||||
%val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
|
||||
%bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 1065353216>
|
||||
%int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
|
||||
%binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
|
||||
ret <1 x float> %binop.i
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
; expecting math lib to provide this
|
||||
declare double @ceil (double) nounwind readnone
|
||||
declare double @floor (double) nounwind readnone
|
||||
declare double @round (double) nounwind readnone
|
||||
;declare float @llvm.sqrt.f32(float %Val)
|
||||
declare double @llvm.sqrt.f64(double %Val)
|
||||
declare float @llvm.sin.f32(float %Val)
|
||||
declare float @llvm.cos.f32(float %Val)
|
||||
declare float @llvm.sqrt.f32(float %Val)
|
||||
declare float @llvm.exp.f32(float %Val)
|
||||
declare float @llvm.log.f32(float %Val)
|
||||
declare float @llvm.pow.f32(float %f, float %e)
|
||||
|
||||
|
||||
|
||||
|
||||
;; stuff that could be in builtins ...
|
||||
|
||||
define(`unary1to1', `
|
||||
%v_0 = extractelement <1 x $1> %0, i32 0
|
||||
%r_0 = call $1 $2($1 %v_0)
|
||||
%ret_0 = insertelement <1 x $1> undef, $1 %r_0, i32 0
|
||||
ret <1 x $1> %ret_0
|
||||
')


;; dummy 1 wide vector ops

define void
@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
<1 x float> %v3, <1 x float> * noalias %out0,
<1 x float> * noalias %out1, <1 x float> * noalias %out2,
<1 x float> * noalias %out3) nounwind alwaysinline {

store <1 x float> %v0, <1 x float > * %out0
store <1 x float> %v1, <1 x float > * %out1
store <1 x float> %v2, <1 x float > * %out2
store <1 x float> %v3, <1 x float > * %out3

ret void
}

define void
@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
<1 x float> %v3, <1 x float> * noalias %out0,
<1 x float> * noalias %out1, <1 x float> * noalias %out2,
<1 x float> * noalias %out3) nounwind alwaysinline {
call void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> %v3, <1 x float> * %out0,
<1 x float> * %out1, <1 x float> * %out2, <1 x float> * %out3)
ret void
}

define void
@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2) {
store <1 x float> %v0, <1 x float > * %out0
store <1 x float> %v1, <1 x float > * %out1
store <1 x float> %v2, <1 x float > * %out2

ret void
}

define void
@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2) {
call void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
<1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
<1 x float> * %out2)
ret void
}


;; end builtins


define <1 x double> @__round_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @round)
}

define <1 x double> @__floor_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @floor)
}


define <1 x double> @__ceil_varying_double(<1 x double>) nounwind readonly alwaysinline {
unary1to1(double, @ceil)
}

; To do vector integer min and max, we do the vector compare and then sign
; extend the i1 vector result to an i32 mask. The __vselect does the
; rest...

define <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp slt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}

define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp slt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}

define <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp sgt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}

define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
%c = icmp sgt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}

; The functions for unsigned ints are similar, just with unsigned
; comparison functions...

define <1 x i32> @__min_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp ult <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}

define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ult i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}

define <1 x i32> @__max_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
%c = icmp ugt <1 x i32> %0, %1
%mask = sext <1 x i1> %c to <1 x i32>
%v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
ret <1 x i32> %v
}

define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
%c = icmp ugt i32 %0, %1
%r = select i1 %c, i32 %0, i32 %1
ret i32 %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
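; (With a 1-wide vector there is nothing to reduce across program instances,
; so each of the reductions below just returns element 0 of its input.)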

declare i32 @llvm.ctpop.i32(i32) nounwind readnone

define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
}

declare i64 @llvm.ctpop.i64(i64) nounwind readnone

define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}


define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x float> %v, i32 0
ret float %r
}

define float @__reduce_min_float(<1 x float>) nounwind readnone {
%r = extractelement <1 x float> %0, i32 0
ret float %r
}

define float @__reduce_max_float(<1 x float>) nounwind readnone {
%r = extractelement <1 x float> %0, i32 0
ret float %r
}

define i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone {
%r = extractelement <1 x i32> %v, i32 0
ret i32 %r
}

define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}

define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}

define i32 @__reduce_add_uint32(<1 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<1 x i32> %v)
ret i32 %r
}

define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}

define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
}


define double @__reduce_add_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}

define double @__reduce_min_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}

define double @__reduce_max_double(<1 x double>) nounwind readnone {
%m = extractelement <1 x double> %0, i32 0
ret double %m
}

define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}

define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}

define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}

define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}

define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone {
%m = extractelement <1 x i64> %0, i32 0
ret i64 %m
}

define i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x i32> %vv, i32 0
store i32 %v, i32 * %samevalue
ret i1 true
}

define i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x float> %vv, i32 0
store float %v, float * %samevalue
ret i1 true
}

define i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x i64> %vv, i32 0
store i64 %v, i64 * %samevalue
ret i1 true
}

define i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue,
<1 x i32> %mask) nounwind alwaysinline {
%v = extractelement <1 x double> %vv, i32 0
store double %v, double * %samevalue
ret i1 true
}

; extracting/reinserting elements because I want to be able to remove vectors later on

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

define <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysinline {
;%call = call <1 x float> @llvm.x86.sse.rcp.ps(<1 x float> %0)
; do one N-R iteration to improve precision
; float iv = __rcp_v(v);
; return iv * (2. - v * iv);
;%v_iv = fmul <1 x float> %0, %call
;%two_minus = fsub <1 x float> <float 2., float 2., float 2., float 2.>, %v_iv
;%iv_mul = fmul <1 x float> %call, %two_minus
;ret <1 x float> %iv_mul
%d = extractelement <1 x float> %0, i32 0
%r = fdiv float 1., %d
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}
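; (For this 1-wide target an exact scalar divide replaces the rcp estimate
; plus Newton-Raphson refinement sketched in the commented-out code above.)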


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt

define <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline {
;%call = call <1 x float> @llvm.x86.sse.sqrt.ps(<1 x float> %0)
;ret <1 x float> %call
%d = extractelement <1 x float> %0, i32 0
%r = call float @llvm.sqrt.f32(float %d)
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt

define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
;%is = call <1 x float> @llvm.x86.sse.rsqrt.ps(<1 x float> %v)
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is);
;%v_is = fmul <1 x float> %v, %is
;%v_is_is = fmul <1 x float> %v_is, %is
;%three_sub = fsub <1 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
;%is_mul = fmul <1 x float> %is, %three_sub
;%half_scale = fmul <1 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
;ret <1 x float> %half_scale
%s = call <1 x float> @__sqrt_varying_float(<1 x float> %v)
%r = call <1 x float> @__rcp_varying_float(<1 x float> %s)
ret <1 x float> %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff

define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.sin.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float, @llvm.sin.f32)
}

define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm.cos.f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
unary1to1(float, @llvm.cos.f32)
}

define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
; store <1 x float> %s, <1 x float> * %1
; ret void
%sin = call <1 x float> @__svml_sin (<1 x float> %0)
%cos = call <1 x float> @__svml_cos (<1 x float> %0)
store <1 x float> %sin, <1 x float> * %1
store <1 x float> %cos, <1 x float> * %2
ret void
}

define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
;ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_tan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
;unary1to1(float, @llvm.tan.f32)
; UNSUPPORTED!
ret <1 x float > %0
}

define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
; ret <1 x float> %ret
;%r = extractelement <1 x float> %0, i32 0
;%s = call float @llvm_atan_f32(float %r)
;%rv = insertelement <1 x float> undef, float %r, i32 0
;ret <1 x float> %rv
;unary1to1(float, @llvm.atan.f32)
;UNSUPPORTED!
ret <1 x float > %0
}

define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
;%y = extractelement <1 x float> %0, i32 0
;%x = extractelement <1 x float> %1, i32 0
;%q = fdiv float %y, %x
;%a = call float @llvm.atan.f32 (float %q)
;%rv = insertelement <1 x float> undef, float %a, i32 0
;ret <1 x float> %rv
; UNSUPPORTED!
ret <1 x float > %0
}

define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.exp.f32)
}

define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
;ret <1 x float> %ret
unary1to1(float, @llvm.log.f32)
}

define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
;ret <1 x float> %ret
%r = extractelement <1 x float> %0, i32 0
%e = extractelement <1 x float> %1, i32 0
%s = call float @llvm.pow.f32(float %r, float %e)
%rv = insertelement <1 x float> undef, float %s, i32 0
ret <1 x float> %rv
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

define <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
; %call = call <1 x float> @llvm.x86.sse.max.ps(<1 x float> %0, <1 x float> %1)
; ret <1 x float> %call
%a = extractelement <1 x float> %0, i32 0
%b = extractelement <1 x float> %1, i32 0
%d = fcmp ogt float %a, %b
%r = select i1 %d, float %a, float %b
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}

define <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
; %call = call <1 x float> @llvm.x86.sse.min.ps(<1 x float> %0, <1 x float> %1)
; ret <1 x float> %call
%a = extractelement <1 x float> %0, i32 0
%b = extractelement <1 x float> %1, i32 0
%d = fcmp olt float %a, %b
%r = select i1 %d, float %a, float %b
%rv = insertelement <1 x float> undef, float %r, i32 0
ret <1 x float> %rv
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

;declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone

define <1 x double> @__sqrt_varying_double(<1 x double>) nounwind alwaysinline {
;unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
;ret <1 x double> %ret
unary1to1(double, @llvm.sqrt.f64)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

;declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
;declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <1 x double> @__min_varying_double(<1 x double>, <1 x double>) nounwind readnone {
;binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
;ret <1 x double> %ret
%a = extractelement <1 x double> %0, i32 0
%b = extractelement <1 x double> %1, i32 0
%d = fcmp olt double %a, %b
%r = select i1 %d, double %a, double %b
%rv = insertelement <1 x double> undef, double %r, i32 0
ret <1 x double> %rv
}

define <1 x double> @__max_varying_double(<1 x double>, <1 x double>) nounwind readnone {
;binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
;ret <1 x double> %ret
%a = extractelement <1 x double> %0, i32 0
%b = extractelement <1 x double> %1, i32 0
%d = fcmp ogt double %a, %b
%r = select i1 %d, double %a, double %b
%rv = insertelement <1 x double> undef, double %r, i32 0
ret <1 x double> %rv
}


define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%r = fdiv float 1., %0
ret float %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

define float @__round_uniform_float(float) nounwind readonly alwaysinline {
; (There is no 1-wide hardware rounding instruction to dispatch to here,
; so wrap the value in a <1 x float> and reuse the varying rounding
; implementation above.)
%v = insertelement <1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__round_varying_float(<1 x float> %v)
%r = extractelement <1 x float> %rv, i32 0
ret float %r
}

define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
%v = insertelement <1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__floor_varying_float(<1 x float> %v)
%r = extractelement <1 x float> %rv, i32 0
ret float %r
}

define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
%v = insertelement <1 x float> undef, float %0, i32 0
%rv = call <1 x float> @__ceil_varying_float(<1 x float> %v)
%r = extractelement <1 x float> %rv, i32 0
ret float %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

define double @__round_uniform_double(double) nounwind readonly alwaysinline {
%rs = call double @round(double %0)
ret double %rs
}

define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
%rs = call double @floor(double %0)
ret double %rs
}

define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
%rs = call double @ceil(double %0)
ret double %rs
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt

define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
%ret = call float @llvm.sqrt.f32(float %0)
ret float %ret
}

define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
%ret = call double @llvm.sqrt.f64(double %0)
ret double %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt

define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
%s = call float @__sqrt_uniform_float(float %0)
%r = call float @__rcp_uniform_float(float %s)
ret float %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fastmath

define void @__fastmath() nounwind alwaysinline {
; no-op
ret void
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
%d = fcmp ogt float %0, %1
%r = select i1 %d, float %0, float %1
ret float %r
}

define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
%d = fcmp olt float %0, %1
%r = select i1 %d, float %0, float %1
ret float %r
}

define double @__max_uniform_double(double, double) nounwind readonly alwaysinline {
%d = fcmp ogt double %0, %1
%r = select i1 %d, double %0, double %1
ret double %r
}

define double @__min_uniform_double(double, double) nounwind readonly alwaysinline {
%d = fcmp olt double %0, %1
%r = select i1 %d, double %0, double %1
ret double %r
}

define_shuffles()

ctlztz()

define_prefetches()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

@@ -98,6 +98,14 @@ declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias
<WIDTH x float> * noalias %out2,
<WIDTH x float> * noalias %out3) nounwind

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math

@@ -241,8 +249,9 @@ declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind

ifelse(LLVM_VERSION, `LLVM_3_1svn',`
define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind {
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i8> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
store <WIDTH x i8> %v1, <WIDTH x i8> * %0
@@ -250,7 +259,7 @@ define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
}

define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind {
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i16> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
@@ -258,7 +267,7 @@ define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
}

define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind {
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i32> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
@@ -266,30 +275,40 @@ define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
}

define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
<WIDTH x i64>, <WIDTH x i1>) nounwind {
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i64> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
ret void
}
',`
declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i1>) nounwind readonly
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i1>) nounwind readonly
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly

declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x $1>, <WIDTH x i1>) nounwind
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,

@@ -47,6 +47,14 @@ int64minmax()

include(`target-sse2-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

@@ -44,6 +44,14 @@ int64minmax()

include(`target-sse2-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;

@@ -47,6 +47,14 @@ int64minmax()

include(`target-sse4-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

@@ -44,6 +44,14 @@ int64minmax()

include(`target-sse4-common.ll')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

317
builtins/util.m4
@@ -760,14 +760,12 @@ define(`global_atomic_uniform', `
ifelse(LLVM_VERSION, `LLVM_2_9',`
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)

define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
<$1 x MASK> %mask) nounwind alwaysinline {
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
%r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
ret $3 %r
}
', `
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
<$1 x MASK> %mask) nounwind alwaysinline {
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline {
%r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst
ret $3 %r
}
@@ -786,26 +784,7 @@ declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)')

define(`global_swap', `

define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
<$1 x MASK> %mask) nounwind alwaysinline {
%rptr = alloca <$1 x $2>
%rptr32 = bitcast <$1 x $2> * %rptr to $2 *

per_lane($1, <$1 x MASK> %mask, `
%val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', `
%r_LANE_ID = atomicrmw xchg $2 * %ptr, $2 %val_LANE_ID seq_cst')
%rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
store $2 %r_LANE_ID, $2 * %rp_LANE_ID')

%r = load <$1 x $2> * %rptr
ret <$1 x $2> %r
}

define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
<$1 x MASK> %mask) nounwind alwaysinline {
define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val) nounwind alwaysinline {
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', `
%r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst')
@@ -845,7 +824,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',`
}

define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
$2 %val, <$1 x MASK> %mask) nounwind alwaysinline {
$2 %val) nounwind alwaysinline {
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', `
%r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst')
@@ -1586,17 +1565,15 @@ declare void @__pseudo_masked_store_64(<WIDTH x i64> * nocapture, <WIDTH x i64>,
; these represent gathers from a common base pointer with offsets. The
; offset_scale factor scales the offsets before they are added to the base
; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.)
; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8 scaling
; available in x86 addressing calculations...
; Then, the offset delta_value (guaranteed to be a compile-time constant value),
; is added to the final address. The 2, 4, 8 scales are used to match LLVM patterns
; that use the free 2/4/8 scaling available in x86 addressing calculations, and
; offset_delta feeds into the free offset calculation.
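; (In other words, the per-lane effective address works out to
;   base + offsets[lane] * offset_scale + offset_delta[lane].)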
;
; varying int8 __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base,
; int{32,64} offsets, int32 offset_scale, mask)
; varying int{8,16,32,64}
; __pseudo_gather_base_offsets{32,64}_{8,16,32,64}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask)
;
; Then, the GSImprovementsPass optimization finds these and either
; converts them to native gather functions or converts them to vector
@@ -1612,22 +1589,22 @@ declare <WIDTH x i16> @__pseudo_gather64_16(<WIDTH x i64>, <WIDTH x MASK>) nounw
declare <WIDTH x i32> @__pseudo_gather64_32(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather64_64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly

declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32,
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly

declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32,
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly

; Similarly to the pseudo-gathers defined above, we also declare undefined
@@ -1642,13 +1619,9 @@ declare <WIDTH x i64> @__pseudo_gather_base_offsets64_64(i8 *, <WIDTH x i64>, i3
; transforms them to scatters like:
;
; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base,
; varying int32 offsets, int32 offset_scale, varying int8 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base,
; varying int32 offsets, int32 offset_scale, varying int16 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base,
; varying int32 offsets, int32 offset_scale, varying int32 values, mask)
; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base,
; varying int32 offsets, int32 offset_scale, varying int64 values, mask)
; varying int32 offsets, uniform int32 offset_scale,
; varying int{32,64} offset_delta, varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
;
; And the GSImprovementsPass in turn converts these to actual native
; scatters or masked stores.
@@ -1663,22 +1636,22 @@ declare void @__pseudo_scatter64_16(<WIDTH x i64>, <WIDTH x i16>, <WIDTH x MASK>
declare void @__pseudo_scatter64_32(<WIDTH x i64>, <WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind

declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32,
declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind

declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32,
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1832,6 +1805,81 @@ ok:
ret void
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; new/delete

declare i8 * @malloc(i64)
declare void @free(i8 *)
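;; (Uniform new/delete map directly onto malloc()/free(); the varying
;; variants below call malloc()/free() once per active program instance and
;; pass the resulting pointers around as a vector of i64 values.)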

define i8 * @__new_uniform(i64 %size) {
%a = call i8 * @malloc(i64 %size)
ret i8 * %a
}

define <WIDTH x i64> @__new_varying32(<WIDTH x i32> %size, <WIDTH x MASK> %mask) {
%ret = alloca <WIDTH x i64>
store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %ret
%ret64 = bitcast <WIDTH x i64> * %ret to i64 *

per_lane(WIDTH, <WIDTH x MASK> %mask, `
%sz_LANE_ID = extractelement <WIDTH x i32> %size, i32 LANE
%sz64_LANE_ID = zext i32 %sz_LANE_ID to i64
%ptr_LANE_ID = call i8 * @malloc(i64 %sz64_LANE_ID)
%ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64
%store_LANE_ID = getelementptr i64 * %ret64, i32 LANE
store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID')

%r = load <WIDTH x i64> * %ret
ret <WIDTH x i64> %r
}

define <WIDTH x i64> @__new_varying64(<WIDTH x i64> %size, <WIDTH x MASK> %mask) {
%ret = alloca <WIDTH x i64>
store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %ret
%ret64 = bitcast <WIDTH x i64> * %ret to i64 *

per_lane(WIDTH, <WIDTH x MASK> %mask, `
%sz_LANE_ID = extractelement <WIDTH x i64> %size, i32 LANE
%ptr_LANE_ID = call i8 * @malloc(i64 %sz_LANE_ID)
%ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64
%store_LANE_ID = getelementptr i64 * %ret64, i32 LANE
store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID')

%r = load <WIDTH x i64> * %ret
ret <WIDTH x i64> %r
}

define void @__delete_uniform(i8 * %ptr) {
call void @free(i8 * %ptr)
ret void
}

define void @__delete_varying(<WIDTH x i64> %ptr, <WIDTH x MASK> %mask) {
per_lane(WIDTH, <WIDTH x MASK> %mask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptr, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 *
call void @free(i8 * %ptr_LANE_ID)
')
ret void
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; read hw clock

define i64 @__clock() nounwind uwtable ssp {
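; (cpuid is issued first to serialize the instruction stream so that rdtsc
; isn't reordered; rdtsc then leaves the 64-bit timestamp counter in edx:eax,
; which the code below reassembles into a single i64.)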
entry:
tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
%0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
%asmresult = extractvalue { i32, i32 } %0, 0
%asmresult1 = extractvalue { i32, i32 } %0, 1
%conv = zext i32 %asmresult1 to i64
%shl = shl nuw i64 %conv, 32
%conv2 = zext i32 %asmresult to i64
%or = or i64 %shl, %conv2
ret i64 %or
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; stdlib transcendentals
;;
@@ -1997,38 +2045,18 @@ global_atomic_uniform(WIDTH, umax, i64, uint64)
global_swap(WIDTH, i32, int32)
global_swap(WIDTH, i64, int64)

define <WIDTH x float> @__atomic_swap_float_global(float * %ptr, <WIDTH x float> %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%iptr = bitcast float * %ptr to i32 *
%ival = bitcast <WIDTH x float> %val to <WIDTH x i32>
%iret = call <WIDTH x i32> @__atomic_swap_int32_global(i32 * %iptr, <WIDTH x i32> %ival, <WIDTH x MASK> %mask)
%ret = bitcast <WIDTH x i32> %iret to <WIDTH x float>
ret <WIDTH x float> %ret
}

define <WIDTH x double> @__atomic_swap_double_global(double * %ptr, <WIDTH x double> %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
%iptr = bitcast double * %ptr to i64 *
%ival = bitcast <WIDTH x double> %val to <WIDTH x i64>
%iret = call <WIDTH x i64> @__atomic_swap_int64_global(i64 * %iptr, <WIDTH x i64> %ival, <WIDTH x MASK> %mask)
%ret = bitcast <WIDTH x i64> %iret to <WIDTH x double>
ret <WIDTH x double> %ret
}

define float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
define float @__atomic_swap_uniform_float_global(float * %ptr, float %val) nounwind alwaysinline {
%iptr = bitcast float * %ptr to i32 *
%ival = bitcast float %val to i32
%iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <WIDTH x MASK> %mask)
%iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival)
%ret = bitcast i32 %iret to float
ret float %ret
}

define double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
define double @__atomic_swap_uniform_double_global(double * %ptr, double %val) nounwind alwaysinline {
%iptr = bitcast double * %ptr to i64 *
%ival = bitcast double %val to i64
%iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <WIDTH x MASK> %mask)
%iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival)
%ret = bitcast i64 %iret to double
ret double %ret
}
@@ -2058,24 +2086,23 @@ define <WIDTH x double> @__atomic_compare_exchange_double_global(double * %ptr,
ret <WIDTH x double> %ret
}

define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
<WIDTH x MASK> %mask) nounwind alwaysinline {
define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp,
float %val) nounwind alwaysinline {
%iptr = bitcast float * %ptr to i32 *
%icmp = bitcast float %cmp to i32
%ival = bitcast float %val to i32
%iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
i32 %ival, <WIDTH x MASK> %mask)
i32 %ival)
%ret = bitcast i32 %iret to float
ret float %ret
}

define double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
double %val, <WIDTH x MASK> %mask) nounwind alwaysinline {
double %val) nounwind alwaysinline {
%iptr = bitcast double * %ptr to i64 *
%icmp = bitcast double %cmp to i64
%ival = bitcast double %val to i64
%iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
i64 %ival, <WIDTH x MASK> %mask)
%iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, i64 %ival)
%ret = bitcast i64 %iret to double
ret double %ret
}
@@ -2219,9 +2246,9 @@ return:
define(`gen_masked_store', `
define void @__masked_store_$3(<$1 x $2>* nocapture, <$1 x $2>, <$1 x i32>) nounwind alwaysinline {
per_lane($1, <$1 x i32> %2, `
%ptr_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
%storeval_ID = extractelement <$1 x $2> %1, i32 LANE
store $2 %storeval_ID, $2 * %ptr_ID')
%ptr_LANE_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
%storeval_LANE_ID = extractelement <$1 x $2> %1, i32 LANE
store $2 %storeval_LANE_ID, $2 * %ptr_LANE_ID')
ret void
}
')
@@ -2676,7 +2703,7 @@ pl_known_mask:
pl_all_on:
;; the mask is all on--just expand the code for each lane sequentially
forloop(i, 0, eval($1-1),
`patsubst(`$3', `ID\|LANE', i)')
`patsubst(`$3', `LANE', i)')
br label %pl_done

pl_unknown_mask:
@@ -2727,7 +2754,8 @@ define(`gen_gather', `
;; Define the utility function to do the gather operation for a single element
;; of the type
define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
@@ -2737,15 +2765,20 @@ define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset

%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
%delta64 = sext i32 %delta to i64
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64

; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%ptrcast = bitcast i8 * %finalptr to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
}

define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %ret, i32 %lane) nounwind readonly alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %ret,
i32 %lane) nounwind readonly alwaysinline {
; compute address for this one from the base
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
@@ -2754,8 +2787,11 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_
%offset = mul i64 %offset64, %offset_scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset

%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64

; load value and insert into returned value
%ptrcast = bitcast i8 * %ptroffset to $2 *
%ptrcast = bitcast i8 * %finalptr to $2 *
%val = load $2 *%ptrcast
%updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
ret <$1 x $2> %updatedret
@@ -2763,6 +2799,7 @@ define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_


define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x i32> %offset_delta,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2775,16 +2812,25 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i32> * %offsetsPtr

%deltaPtr = alloca <$1 x i32>
store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr
call void @__masked_store_blend_32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i32> * %deltaPtr

%ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
i32 %offset_scale, <$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x i32> %offset_delta,
<$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr,
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
<$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %offset_delta,
<$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}

define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x i64> %offset_delta,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
@@ -2797,11 +2843,19 @@ define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32
<$1 x i32> %vecmask)
%newOffsets = load <$1 x i64> * %offsetsPtr

%deltaPtr = alloca <$1 x i64>
store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr
call void @__masked_store_blend_64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta,
<$1 x i32> %vecmask)
%newDelta = load <$1 x i64> * %deltaPtr

%ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
i32 %offset_scale, <$1 x $2> undef, i32 0)
i32 %offset_scale, <$1 x i64> %newDelta,
<$1 x $2> undef, i32 0)
forloop(lane, 1, eval($1-1),
`patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr,
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x $2> %retPREV, i32 LANE)
<$1 x i64> %newOffsets, i32 %offset_scale, <$1 x i64> %newDelta,
<$1 x $2> %retPREV, i32 LANE)
', `LANE', lane), `PREV', eval(lane-1))')
ret <$1 x $2> %ret`'eval($1-1)
}
@@ -2811,11 +2865,11 @@ define <$1 x $2> @__gather32_$2(<$1 x i32> %ptrs,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <$1 x $2>
per_lane($1, <$1 x i32> %vecmask, `
%iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
%ptr_ID = inttoptr i32 %iptr_ID to $2 *
%val_ID = load $2 * %ptr_ID
%store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
store $2 %val_ID, $2 * %store_ptr_ID
%iptr_LANE_ID = extractelement <$1 x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $2 *
%val_LANE_ID = load $2 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
store $2 %val_LANE_ID, $2 * %store_ptr_LANE_ID
')

%ret = load <$1 x $2> * %ret_ptr
@@ -2827,11 +2881,11 @@ define <$1 x $2> @__gather64_$2(<$1 x i64> %ptrs,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <$1 x $2>
per_lane($1, <$1 x i32> %vecmask, `
%iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
%ptr_ID = inttoptr i64 %iptr_ID to $2 *
%val_ID = load $2 * %ptr_ID
%store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
store $2 %val_ID, $2 * %store_ptr_ID
%iptr_LANE_ID = extractelement <$1 x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $2 *
%val_LANE_ID = load $2 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
store $2 %val_LANE_ID, $2 * %store_ptr_LANE_ID
')

%ret = load <$1 x $2> * %ret_ptr
@@ -2852,7 +2906,8 @@ define(`gen_scatter', `
;; Define the function that describes the work to do to scatter a single
;; value
define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
%offset32 = extractelement <$1 x i32> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
@@ -2861,42 +2916,52 @@ define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scal
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset

%ptrcast = bitcast i8 * %ptroffset to $2 *
%delta = extractelement <$1 x i32> %offset_delta, i32 %lane
%delta64 = sext i32 %delta to i64
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64

%ptrcast = bitcast i8 * %finalptr to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptrcast
ret void
}

define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 %lane) nounwind alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %values,
i32 %lane) nounwind alwaysinline {
%offset64 = extractelement <$1 x i64> %offsets, i32 %lane
; the order and details of the next 4 lines are important--they match LLVM's
; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations
%scale64 = sext i32 %offset_scale to i64
%offset = mul i64 %offset64, %scale64
%ptroffset = getelementptr i8 * %ptr, i64 %offset
%ptrcast = bitcast i8 * %ptroffset to $2 *

%delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane
%finalptr = getelementptr i8 * %ptroffset, i64 %delta64

%ptrcast = bitcast i8 * %finalptr to $2 *
%storeval = extractelement <$1 x $2> %values, i32 %lane
store $2 %storeval, $2 * %ptrcast
ret void
}

define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
<$1 x i32> %offset_delta, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
<$1 x i32> %offset_delta, <$1 x $2> %values, i32 LANE)')
ret void
}

define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, <$1 x i32> %mask) nounwind alwaysinline {
<$1 x i64> %offset_delta, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
per_lane($1, <$1 x i32> %mask, `
call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale,
<$1 x $2> %values, i32 LANE)')
<$1 x i64> %offset_delta, <$1 x $2> %values, i32 LANE)')
ret void
}

@@ -2904,10 +2969,10 @@ define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %of
define void @__scatter32_$2(<$1 x i32> %ptrs, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
per_lane($1, <$1 x i32> %mask, `
%iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
%ptr_ID = inttoptr i32 %iptr_ID to $2 *
%val_ID = extractelement <$1 x $2> %values, i32 LANE
store $2 %val_ID, $2 * %ptr_ID
%iptr_LANE_ID = extractelement <$1 x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $2 *
%val_LANE_ID = extractelement <$1 x $2> %values, i32 LANE
store $2 %val_LANE_ID, $2 * %ptr_LANE_ID
')
ret void
}
@@ -2916,10 +2981,10 @@ define void @__scatter32_$2(<$1 x i32> %ptrs, <$1 x $2> %values,
define void @__scatter64_$2(<$1 x i64> %ptrs, <$1 x $2> %values,
<$1 x i32> %mask) nounwind alwaysinline {
per_lane($1, <$1 x i32> %mask, `
%iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
%ptr_ID = inttoptr i64 %iptr_ID to $2 *
%val_ID = extractelement <$1 x $2> %values, i32 LANE
store $2 %val_ID, $2 * %ptr_ID
%iptr_LANE_ID = extractelement <$1 x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $2 *
%val_LANE_ID = extractelement <$1 x $2> %values, i32 LANE
store $2 %val_LANE_ID, $2 * %ptr_LANE_ID
')
ret void
}

55
cbackend.cpp
55
cbackend.cpp
@@ -24,6 +24,8 @@
#define PRIx64 "llx"
#endif

#include "llvmutil.h"

#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
@@ -232,6 +234,7 @@ namespace {
    unsigned NextAnonValueNumber;

    std::string includeName;
    int vectorWidth;

    /// UnnamedStructIDs - This contains a unique ID for each struct that is
    /// either anonymous or has no name.
@@ -240,11 +243,13 @@ namespace {

  public:
    static char ID;
    explicit CWriter(formatted_raw_ostream &o, const char *incname)
    explicit CWriter(formatted_raw_ostream &o, const char *incname,
                     int vecwidth)
      : FunctionPass(ID), Out(o), IL(0), Mang(0), LI(0),
        TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0),
        OpaqueCounter(0), NextAnonValueNumber(0),
        includeName(incname ? incname : "generic_defs.h") {
        includeName(incname ? incname : "generic_defs.h"),
        vectorWidth(vecwidth) {
      initializeLoopInfoPass(*PassRegistry::getPassRegistry());
      FPCounter = 0;
    }
@@ -773,6 +778,16 @@ raw_ostream &CWriter::printType(raw_ostream &Out, Type *Ty,
    Out << " return ret;\n";
    Out << " }\n ";

    // if it's an array of i8s, also provide a version that takes a const
    // char *
    if (ATy->getElementType() == LLVMTypes::Int8Type) {
        Out << " static " << NameSoFar << " init(const char *p) {\n";
        Out << " " << NameSoFar << " ret;\n";
        Out << " strncpy((char *)ret.array, p, " << NumElements << ");\n";
        Out << " return ret;\n";
        Out << " }\n";
    }

    printType(Out, ATy->getElementType(), false,
              "array[" + utostr(NumElements) + "]");
    return Out << ";\n} ";
@@ -842,6 +857,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
    }
    Out << '\"';
  } else {
    if (Static)
      Out << '{';
    if (CPA->getNumOperands()) {
      Out << ' ';
@@ -851,6 +867,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
        printConstant(cast<Constant>(CPA->getOperand(i)), Static);
      }
    }
    if (Static)
      Out << " }";
  }
}
@@ -1321,7 +1338,8 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
    break;
  }

  case Type::ArrayTyID:
  case Type::ArrayTyID: {
    ArrayType *AT = cast<ArrayType>(CPV->getType());
    if (Static)
      // arrays are wrapped in structs...
      Out << "{ ";
@@ -1334,7 +1352,6 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
      printConstantArray(CA, Static);
    } else {
      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
      ArrayType *AT = cast<ArrayType>(CPV->getType());
      if (AT->getNumElements()) {
        Out << ' ';
        Constant *CZ = Constant::getNullValue(AT->getElementType());
@@ -1350,7 +1367,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
    else
      Out << ")";
    break;

  }
  case Type::VectorTyID:
    printType(Out, CPV->getType());
    Out << "(";
@@ -2097,7 +2114,8 @@ bool CWriter::doInitialization(Module &M) {
        I->getName() == "memset" || I->getName() == "memset_pattern16" ||
        I->getName() == "puts" ||
        I->getName() == "printf" || I->getName() == "putchar" ||
        I->getName() == "fflush")
        I->getName() == "fflush" || I->getName() == "malloc" ||
        I->getName() == "free")
      continue;

    // Don't redeclare ispc's own intrinsics
@@ -2203,7 +2221,7 @@ bool CWriter::doInitialization(Module &M) {
      // FIXME common linkage should avoid this problem.
      if (!I->getInitializer()->isNullValue()) {
        Out << " = " ;
        writeOperand(I->getInitializer(), true);
        writeOperand(I->getInitializer(), false);
      } else if (I->hasWeakLinkage()) {
        // We have to specify an initializer, but it doesn't have to be
        // complete. If the value is an aggregate, print out { 0 }, and let
@@ -2218,7 +2236,7 @@ bool CWriter::doInitialization(Module &M) {
          Out << "{ { 0 } }";
        } else {
          // Just print it out normally.
          writeOperand(I->getInitializer(), true);
          writeOperand(I->getInitializer(), false);
        }
      }
      Out << ";\n";
@@ -2892,6 +2910,20 @@ void CWriter::visitBinaryOperator(Instruction &I) {
    Out << "(";
    writeOperand(I.getOperand(0));
    Out << ", ";
    if ((I.getOpcode() == Instruction::Shl ||
         I.getOpcode() == Instruction::LShr ||
         I.getOpcode() == Instruction::AShr)) {
        std::vector<PHINode *> phis;
        if (LLVMVectorValuesAllEqual(I.getOperand(1),
                                     vectorWidth, phis)) {
            Out << "__extract_element(";
            writeOperand(I.getOperand(1));
            Out << ", 0) ";
        }
        else
            writeOperand(I.getOperand(1));
    }
    else
        writeOperand(I.getOperand(1));
    Out << ")";
    return;
@@ -3406,6 +3438,9 @@ void CWriter::visitCallInst(CallInst &I) {
      Callee = RF;
    }

    if (Callee->getName() == "malloc")
      Out << "(uint8_t *)";

    if (NeedsCast) {
      // Ok, just cast the pointer type.
      Out << "((";
@@ -3633,7 +3668,7 @@ std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
#endif

  std::string E;
  if (const Target *Match = TargetRegistry::lookupTarget(Triple, E))
  if (const llvm::Target *Match = TargetRegistry::lookupTarget(Triple, E))
    TargetAsm = Match->createMCAsmInfo(Triple);
  else
    return c.Codes[0];
@@ -4335,7 +4370,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
  pm.add(new BitcastCleanupPass);
  pm.add(createDeadCodeEliminationPass()); // clean up after smear pass
  //CO pm.add(createPrintModulePass(&fos));
  pm.add(new CWriter(fos, includeName));
  pm.add(new CWriter(fos, includeName, vectorWidth));
  pm.add(createGCInfoDeleter());
  //CO pm.add(createVerifierPass());
515
ctx.cpp
515
ctx.cpp
@@ -74,18 +74,35 @@ struct CFInfo {
                            llvm::Value *savedContinueLanesPtr,
                            llvm::Value *savedMask, llvm::Value *savedLoopMask);

    static CFInfo *GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
                             llvm::BasicBlock *continueTarget,
                             llvm::Value *savedBreakLanesPtr,
                             llvm::Value *savedContinueLanesPtr,
                             llvm::Value *savedMask, llvm::Value *savedLoopMask,
                             llvm::Value *switchExpr,
                             llvm::BasicBlock *bbDefault,
                             const std::vector<std::pair<int, llvm::BasicBlock *> > *bbCases,
                             const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbNext,
                             bool scUniform);

    bool IsIf() { return type == If; }
    bool IsLoop() { return type == Loop; }
    bool IsForeach() { return type == Foreach; }
    bool IsSwitch() { return type == Switch; }
    bool IsVarying() { return !isUniform; }
    bool IsUniform() { return isUniform; }

    enum CFType { If, Loop, Foreach };
    enum CFType { If, Loop, Foreach, Switch };
    CFType type;
    bool isUniform;
    llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
    llvm::Value *savedBreakLanesPtr, *savedContinueLanesPtr;
    llvm::Value *savedMask, *savedLoopMask;
    llvm::Value *savedSwitchExpr;
    llvm::BasicBlock *savedDefaultBlock;
    const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCaseBlocks;
    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNextBlocks;
    bool savedSwitchConditionWasUniform;

private:
    CFInfo(CFType t, bool uniformIf, llvm::Value *sm) {
@@ -95,11 +112,18 @@ private:
        savedBreakTarget = savedContinueTarget = NULL;
        savedBreakLanesPtr = savedContinueLanesPtr = NULL;
        savedMask = savedLoopMask = sm;
        savedSwitchExpr = NULL;
        savedDefaultBlock = NULL;
        savedCaseBlocks = NULL;
        savedNextBlocks = NULL;
    }
    CFInfo(CFType t, bool iu, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
           llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
           llvm::Value *lm) {
        Assert(t == Loop);
           llvm::Value *lm, llvm::Value *sse = NULL, llvm::BasicBlock *bbd = NULL,
           const std::vector<std::pair<int, llvm::BasicBlock *> > *bbc = NULL,
           const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbn = NULL,
           bool scu = false) {
        Assert(t == Loop || t == Switch);
        type = t;
        isUniform = iu;
        savedBreakTarget = bt;
@@ -108,6 +132,11 @@ private:
        savedContinueLanesPtr = sc;
        savedMask = sm;
        savedLoopMask = lm;
        savedSwitchExpr = sse;
        savedDefaultBlock = bbd;
        savedCaseBlocks = bbc;
        savedNextBlocks = bbn;
        savedSwitchConditionWasUniform = scu;
    }
    CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
           llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
@@ -121,6 +150,10 @@ private:
        savedContinueLanesPtr = sc;
        savedMask = sm;
        savedLoopMask = lm;
        savedSwitchExpr = NULL;
        savedDefaultBlock = NULL;
        savedCaseBlocks = NULL;
        savedNextBlocks = NULL;
    }
};

@@ -154,6 +187,23 @@ CFInfo::GetForeach(llvm::BasicBlock *breakTarget,
                       savedMask, savedForeachMask);
}


CFInfo *
CFInfo::GetSwitch(bool isUniform, llvm::BasicBlock *breakTarget,
                  llvm::BasicBlock *continueTarget,
                  llvm::Value *savedBreakLanesPtr,
                  llvm::Value *savedContinueLanesPtr, llvm::Value *savedMask,
                  llvm::Value *savedLoopMask, llvm::Value *savedSwitchExpr,
                  llvm::BasicBlock *savedDefaultBlock,
                  const std::vector<std::pair<int, llvm::BasicBlock *> > *savedCases,
                  const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNext,
                  bool savedSwitchConditionUniform) {
    return new CFInfo(Switch, isUniform, breakTarget, continueTarget,
                      savedBreakLanesPtr, savedContinueLanesPtr,
                      savedMask, savedLoopMask, savedSwitchExpr, savedDefaultBlock,
                      savedCases, savedNext, savedSwitchConditionUniform);
}

///////////////////////////////////////////////////////////////////////////

FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
@@ -182,6 +232,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
    breakLanesPtr = continueLanesPtr = NULL;
    breakTarget = continueTarget = NULL;

    switchExpr = NULL;
    caseBlocks = NULL;
    defaultBlock = NULL;
    nextBlocks = NULL;

    returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
    StoreInst(LLVMMaskAllOff, returnedLanesPtr);

@@ -422,14 +477,15 @@ FunctionEmitContext::StartVaryingIf(llvm::Value *oldMask) {

void
FunctionEmitContext::EndIf() {
    CFInfo *ci = popCFState();
    // Make sure we match up with a Start{Uniform,Varying}If().
    Assert(controlFlowInfo.size() > 0 && controlFlowInfo.back()->IsIf());
    CFInfo *ci = controlFlowInfo.back();
    controlFlowInfo.pop_back();
    Assert(ci->IsIf());

    // 'uniform' ifs don't change the mask so we only need to restore the
    // mask going into the if for 'varying' if statements
    if (!ci->IsUniform() && bblock != NULL) {
    if (ci->IsUniform() || bblock == NULL)
        return;

    // We can't just restore the mask as it was going into the 'if'
    // statement. First we have to take into account any program
    // instances that have executed 'return' statements; the restored
@@ -437,7 +493,7 @@ FunctionEmitContext::EndIf() {
    restoreMaskGivenReturns(ci->savedMask);

    // If the 'if' statement is inside a loop with a 'varying'
    // consdition, we also need to account for any break or continue
    // condition, we also need to account for any break or continue
    // statements that executed inside the 'if' statement; we also must
    // leave the lane masks for the program instances that ran those
    // off after we restore the mask after the 'if'. The code below
@@ -445,30 +501,39 @@ FunctionEmitContext::EndIf() {
    // or continue statements (and breakLanesPtr and continueLanesPtr
    // have their initial 'all off' values), so we don't need to check
    // for that here.
    if (continueLanesPtr != NULL) {
    //
    // There are three general cases to deal with here:
    // - Loops: both break and continue are allowed, and thus the corresponding
    //   lane mask pointers are non-NULL
    // - Foreach: only continueLanesPtr may be non-NULL
    // - Switch: only breakLanesPtr may be non-NULL
    if (continueLanesPtr != NULL || breakLanesPtr != NULL) {
        // We want to compute:
        // newMask = (oldMask & ~(breakLanes | continueLanes))
        llvm::Value *oldMask = GetInternalMask();
        llvm::Value *continueLanes = LoadInst(continueLanesPtr,
                                              "continue_lanes");
        llvm::Value *bcLanes = continueLanes;
        // newMask = (oldMask & ~(breakLanes | continueLanes)),
        // treating breakLanes or continueLanes as "all off" if the
        // corresponding pointer is NULL.
        llvm::Value *bcLanes = NULL;

        if (continueLanesPtr != NULL)
            bcLanes = LoadInst(continueLanesPtr, "continue_lanes");
        else
            bcLanes = LLVMMaskAllOff;

        if (breakLanesPtr != NULL) {
            // breakLanesPtr will be NULL if we're inside a 'foreach' loop
            llvm::Value *breakLanes = LoadInst(breakLanesPtr, "break_lanes");
            bcLanes = BinaryOperator(llvm::Instruction::Or, breakLanes,
                                     continueLanes, "break|continue_lanes");
            bcLanes = BinaryOperator(llvm::Instruction::Or, bcLanes,
                                     breakLanes, "|break_lanes");
        }

        llvm::Value *notBreakOrContinue =
            NotOperator(bcLanes, "!(break|continue)_lanes");
        llvm::Value *oldMask = GetInternalMask();
        llvm::Value *newMask =
            BinaryOperator(llvm::Instruction::And, oldMask,
                           notBreakOrContinue, "new_mask");
        SetInternalMask(newMask);
    }
    }
}


void
@@ -502,17 +567,8 @@ FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct,

void
FunctionEmitContext::EndLoop() {
    Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsLoop());
    CFInfo *ci = controlFlowInfo.back();
    controlFlowInfo.pop_back();

    // Restore the break/continue state information to what it was before
    // we went into this loop.
    breakTarget = ci->savedBreakTarget;
    continueTarget = ci->savedContinueTarget;
    breakLanesPtr = ci->savedBreakLanesPtr;
    continueLanesPtr = ci->savedContinueLanesPtr;
    loopMask = ci->savedLoopMask;
    CFInfo *ci = popCFState();
    Assert(ci->IsLoop());

    if (!ci->IsUniform())
        // If the loop had a 'uniform' test, then it didn't make any
@@ -525,7 +581,7 @@


void
FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
FunctionEmitContext::StartForeach() {
    // Store the current values of various loop-related state so that we
    // can restore it when we exit this loop.
    llvm::Value *oldMask = GetInternalMask();
@@ -537,7 +593,7 @@ FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {

    continueLanesPtr = AllocaInst(LLVMTypes::MaskType, "foreach_continue_lanes");
    StoreInst(LLVMMaskAllOff, continueLanesPtr);
    continueTarget = ct;
    continueTarget = NULL; // should be set by SetContinueTarget()

    loopMask = NULL;
}
@@ -545,17 +601,8 @@

void
FunctionEmitContext::EndForeach() {
    Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsForeach());
    CFInfo *ci = controlFlowInfo.back();
    controlFlowInfo.pop_back();

    // Restore the break/continue state information to what it was before
    // we went into this loop.
    breakTarget = ci->savedBreakTarget;
    continueTarget = ci->savedContinueTarget;
    breakLanesPtr = ci->savedBreakLanesPtr;
    continueLanesPtr = ci->savedContinueLanesPtr;
    loopMask = ci->savedLoopMask;
    CFInfo *ci = popCFState();
    Assert(ci->IsForeach());
}


@@ -576,28 +623,64 @@ FunctionEmitContext::restoreMaskGivenReturns(llvm::Value *oldMask) {
}


/** Returns "true" if the first enclosing non-if control flow expression is
    a "switch" statement.
*/
bool
FunctionEmitContext::inSwitchStatement() const {
    // Go backwards through controlFlowInfo, since we add new nested scopes
    // to the back.
    int i = controlFlowInfo.size() - 1;
    while (i >= 0 && controlFlowInfo[i]->IsIf())
        --i;
    // Got to the first non-if (or end of CF info)
    if (i == -1)
        return false;
    return controlFlowInfo[i]->IsSwitch();
}


void
FunctionEmitContext::Break(bool doCoherenceCheck) {
    Assert(controlFlowInfo.size() > 0);
    if (breakTarget == NULL) {
        Error(currentPos, "\"break\" statement is illegal outside of "
              "for/while/do loops.");
              "for/while/do loops and \"switch\" statements.");
        return;
    }

    if (bblock == NULL)
        return;

    if (inSwitchStatement() == true &&
        switchConditionWasUniform == true &&
        ifsInCFAllUniform(CFInfo::Switch)) {
        // We know that all program instances are executing the break, so
        // just jump to the block immediately after the switch.
        Assert(breakTarget != NULL);
        BranchInst(breakTarget);
        bblock = NULL;
        return;
    }

    // If all of the enclosing 'if' tests in the loop have uniform control
    // flow or if we can tell that the mask is all on, then we can just
    // jump to the break location.
    if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
    if (inSwitchStatement() == false &&
        (ifsInCFAllUniform(CFInfo::Loop) ||
         GetInternalMask() == LLVMMaskAllOn)) {
        BranchInst(breakTarget);
        if (ifsInLoopAllUniform() && doCoherenceCheck)
            Warning(currentPos, "Coherent break statement not necessary in fully uniform "
                    "control flow.");
        if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
            Warning(currentPos, "Coherent break statement not necessary in "
                    "fully uniform control flow.");
        // Set bblock to NULL since the jump has terminated the basic block
        bblock = NULL;
    }
    else {
        // Otherwise we need to update the mask of the lanes that have
        // executed a 'break' statement:
        // Varying switch, uniform switch where the 'break' is under
        // varying control flow, or a loop with varying 'if's above the
        // break. In these cases, we need to update the mask of the lanes
        // that have executed a 'break' statement:
        // breakLanes = breakLanes | mask
        Assert(breakLanesPtr != NULL);
        llvm::Value *mask = GetInternalMask();
@@ -613,16 +696,20 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
        // an 'if' statement and restore the mask then.
        SetInternalMask(LLVMMaskAllOff);

        if (doCoherenceCheck)
            // If the user has indicated that this is a 'coherent' break
            // statement, then check to see if the mask is all off. If so,
            // we have to conservatively jump to the continueTarget, not
            // the breakTarget, since part of the reason the mask is all
            // off may be due to 'continue' statements that executed in the
            // current loop iteration.
            // FIXME: if the loop only has break statements and no
            // continues, we can jump to breakTarget in that case.
        if (doCoherenceCheck) {
            if (continueTarget != NULL)
                // If the user has indicated that this is a 'coherent'
                // break statement, then check to see if the mask is all
                // off. If so, we have to conservatively jump to the
                // continueTarget, not the breakTarget, since part of the
                // reason the mask is all off may be due to 'continue'
                // statements that executed in the current loop iteration.
                jumpIfAllLoopLanesAreDone(continueTarget);
            else if (breakTarget != NULL)
                // Similarly handle these for switch statements, where we
                // only have a break target.
                jumpIfAllLoopLanesAreDone(breakTarget);
        }
    }
}

@@ -635,12 +722,12 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
        return;
    }

    if (ifsInLoopAllUniform() || GetInternalMask() == LLVMMaskAllOn) {
    if (ifsInCFAllUniform(CFInfo::Loop) || GetInternalMask() == LLVMMaskAllOn) {
        // Similarly to 'break' statements, we can immediately jump to the
        // continue target if we're only in 'uniform' control flow within
        // loop or if we can tell that the mask is all on.
        AddInstrumentationPoint("continue: uniform CF, jumped");
        if (ifsInLoopAllUniform() && doCoherenceCheck)
        if (ifsInCFAllUniform(CFInfo::Loop) && doCoherenceCheck)
            Warning(currentPos, "Coherent continue statement not necessary in "
                    "fully uniform control flow.");
        BranchInst(continueTarget);
@@ -653,8 +740,9 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
        llvm::Value *mask = GetInternalMask();
        llvm::Value *continueMask =
            LoadInst(continueLanesPtr, "continue_mask");
        llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or,
                                              mask, continueMask, "mask|continueMask");
        llvm::Value *newMask =
            BinaryOperator(llvm::Instruction::Or, mask, continueMask,
                           "mask|continueMask");
        StoreInst(newMask, continueLanesPtr);

        // And set the current mask to be all off in case there are any
@@ -671,22 +759,23 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {


/** This function checks to see if all of the 'if' statements (if any)
    between the current scope and the first enclosing loop have 'uniform'
    tests.
    between the current scope and the first enclosing loop/switch of given
    control flow type have 'uniform' tests.
*/
bool
FunctionEmitContext::ifsInLoopAllUniform() const {
FunctionEmitContext::ifsInCFAllUniform(int type) const {
    Assert(controlFlowInfo.size() > 0);
    // Go backwards through controlFlowInfo, since we add new nested scopes
    // to the back. Stop once we come to the first enclosing loop.
    // to the back. Stop once we come to the first enclosing control flow
    // structure of the desired type.
    int i = controlFlowInfo.size() - 1;
    while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Loop) {
    while (i >= 0 && controlFlowInfo[i]->type != type) {
        if (controlFlowInfo[i]->isUniform == false)
            // Found a scope due to an 'if' statement with a varying test
            return false;
        --i;
    }
    Assert(i >= 0); // else we didn't find a loop!
    Assert(i >= 0); // else we didn't find the expected control flow type!
    return true;
}

@@ -759,6 +848,244 @@ FunctionEmitContext::RestoreContinuedLanes() {
}


void
FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) {
    llvm::Value *oldMask = GetInternalMask();
    controlFlowInfo.push_back(CFInfo::GetSwitch(cfIsUniform, breakTarget,
                                                continueTarget, breakLanesPtr,
                                                continueLanesPtr, oldMask,
                                                loopMask, switchExpr, defaultBlock,
                                                caseBlocks, nextBlocks,
                                                switchConditionWasUniform));

    breakLanesPtr = AllocaInst(LLVMTypes::MaskType, "break_lanes_memory");
    StoreInst(LLVMMaskAllOff, breakLanesPtr);
    breakTarget = bbBreak;

    continueLanesPtr = NULL;
    continueTarget = NULL;
    loopMask = NULL;

    // These will be set by the SwitchInst() method
    switchExpr = NULL;
    defaultBlock = NULL;
    caseBlocks = NULL;
    nextBlocks = NULL;
}


void
FunctionEmitContext::EndSwitch() {
    Assert(bblock != NULL);

    CFInfo *ci = popCFState();
    if (ci->IsVarying() && bblock != NULL)
        restoreMaskGivenReturns(ci->savedMask);
}


/** Emit code to check for an "all off" mask before the code for a
    case or default label in a "switch" statement.
*/
void
FunctionEmitContext::addSwitchMaskCheck(llvm::Value *mask) {
    llvm::Value *allOff = None(mask);
    llvm::BasicBlock *bbSome = CreateBasicBlock("case_default_on");

    // Find the basic block for the case or default label immediately after
    // the current one in the switch statement--that's where we want to
    // jump if the mask is all off at this label.
    Assert(nextBlocks->find(bblock) != nextBlocks->end());
    llvm::BasicBlock *bbNext = nextBlocks->find(bblock)->second;

    // Jump to the next one if the mask is all off; otherwise jump to the
    // newly created block that will hold the actual code for this label.
    BranchInst(bbNext, bbSome, allOff);
    SetCurrentBasicBlock(bbSome);
}


/** Returns the execution mask at entry to the first enclosing "switch"
    statement. */
llvm::Value *
FunctionEmitContext::getMaskAtSwitchEntry() {
    Assert(controlFlowInfo.size() > 0);
    int i = controlFlowInfo.size() - 1;
    while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Switch)
        --i;
    Assert(i != -1);
    return controlFlowInfo[i]->savedMask;
}


void
FunctionEmitContext::EmitDefaultLabel(bool checkMask, SourcePos pos) {
    if (inSwitchStatement() == false) {
        Error(pos, "\"default\" label illegal outside of \"switch\" "
              "statement.");
        return;
    }

    // If there's a default label in the switch, a basic block for it
    // should have been provided in the previous call to SwitchInst().
    Assert(defaultBlock != NULL);

    if (bblock != NULL)
        // The previous case in the switch fell through, or we're in a
        // varying switch; terminate the current block with a jump to the
        // block for the code for the default label.
        BranchInst(defaultBlock);
    SetCurrentBasicBlock(defaultBlock);

    if (switchConditionWasUniform)
        // Nothing more to do for this case; return back to the caller,
        // which will then emit the code for the default case.
        return;

    // For a varying switch, we need to update the execution mask.
    //
    // First, compute the mask that corresponds to which program instances
    // should execute the "default" code; this corresponds to the set of
    // program instances that don't match any of the case statements.
    // Therefore, we generate code that compares the value of the switch
    // expression to the value associated with each of the "case"
    // statements such that the surviving lanes didn't match any of them.
    llvm::Value *matchesDefault = getMaskAtSwitchEntry();
    for (int i = 0; i < (int)caseBlocks->size(); ++i) {
        int value = (*caseBlocks)[i].first;
        llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
            LLVMInt32Vector(value) : LLVMInt64Vector(value);
        // TODO: for AVX2 at least, the following generates better code
        // than doing ICMP_NE and skipping the NotOperator() below; file an
        // LLVM bug?
        llvm::Value *matchesCaseValue =
            CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
                    valueVec, "cmp_case_value");
        matchesCaseValue = I1VecToBoolVec(matchesCaseValue);

        llvm::Value *notMatchesCaseValue = NotOperator(matchesCaseValue);
        matchesDefault = BinaryOperator(llvm::Instruction::And, matchesDefault,
                                        notMatchesCaseValue, "default&~case_match");
    }

    // The mask may have some lanes on, which corresponds to the previous
    // label falling through; compute the updated mask by ORing with the
    // current mask.
    llvm::Value *oldMask = GetInternalMask();
    llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask,
                                          matchesDefault, "old_mask|matches_default");
    SetInternalMask(newMask);

    if (checkMask)
        addSwitchMaskCheck(newMask);
}


void
FunctionEmitContext::EmitCaseLabel(int value, bool checkMask, SourcePos pos) {
    if (inSwitchStatement() == false) {
        Error(pos, "\"case\" label illegal outside of \"switch\" statement.");
        return;
    }

    // Find the basic block for this case statement.
    llvm::BasicBlock *bbCase = NULL;
    Assert(caseBlocks != NULL);
    for (int i = 0; i < (int)caseBlocks->size(); ++i)
        if ((*caseBlocks)[i].first == value) {
            bbCase = (*caseBlocks)[i].second;
            break;
        }
    Assert(bbCase != NULL);

    if (bblock != NULL)
        // fall through from the previous case
        BranchInst(bbCase);
    SetCurrentBasicBlock(bbCase);

    if (switchConditionWasUniform)
        return;

    // update the mask: first, get a mask that indicates which program
    // instances have a value for the switch expression that matches this
    // case statement.
    llvm::Value *valueVec = (switchExpr->getType() == LLVMTypes::Int32VectorType) ?
        LLVMInt32Vector(value) : LLVMInt64Vector(value);
    llvm::Value *matchesCaseValue =
        CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr,
                valueVec, "cmp_case_value");
    matchesCaseValue = I1VecToBoolVec(matchesCaseValue);

    // If a lane was off going into the switch, we don't care if it has a
    // value in the switch expression that happens to match this case.
    llvm::Value *entryMask = getMaskAtSwitchEntry();
    matchesCaseValue = BinaryOperator(llvm::Instruction::And, entryMask,
                                      matchesCaseValue, "entry_mask&case_match");

    // Take the surviving lanes and turn on the mask for them.
    llvm::Value *oldMask = GetInternalMask();
    llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, oldMask,
                                          matchesCaseValue, "mask|case_match");
    SetInternalMask(newMask);

    if (checkMask)
        addSwitchMaskCheck(newMask);
}


void
FunctionEmitContext::SwitchInst(llvm::Value *expr, llvm::BasicBlock *bbDefault,
                                const std::vector<std::pair<int, llvm::BasicBlock *> > &bbCases,
                                const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &bbNext) {
    // The calling code should have called StartSwitch() before calling
    // SwitchInst().
    Assert(controlFlowInfo.size() &&
           controlFlowInfo.back()->IsSwitch());

    switchExpr = expr;
    defaultBlock = bbDefault;
    caseBlocks = new std::vector<std::pair<int, llvm::BasicBlock *> >(bbCases);
    nextBlocks = new std::map<llvm::BasicBlock *, llvm::BasicBlock *>(bbNext);
    switchConditionWasUniform =
        (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(expr->getType()) == false);

    if (switchConditionWasUniform == true) {
        // For a uniform switch condition, just wire things up to the LLVM
        // switch instruction.
        llvm::SwitchInst *s = llvm::SwitchInst::Create(expr, bbDefault,
                                                       bbCases.size(), bblock);
        for (int i = 0; i < (int)bbCases.size(); ++i) {
            if (expr->getType() == LLVMTypes::Int32Type)
                s->addCase(LLVMInt32(bbCases[i].first), bbCases[i].second);
            else {
                Assert(expr->getType() == LLVMTypes::Int64Type);
                s->addCase(LLVMInt64(bbCases[i].first), bbCases[i].second);
            }
        }

        AddDebugPos(s);
        // switch is a terminator
        bblock = NULL;
    }
    else {
        // For a varying switch, we first turn off all lanes of the mask
        SetInternalMask(LLVMMaskAllOff);

        if (nextBlocks->size() > 0) {
            // If there are any labels inside the switch, jump to the first
            // one; any code before the first label won't be executed by
            // anyone.
            std::map<llvm::BasicBlock *, llvm::BasicBlock *>::const_iterator iter;
            iter = nextBlocks->find(NULL);
            Assert(iter != nextBlocks->end());
            llvm::BasicBlock *bbFirst = iter->second;
            BranchInst(bbFirst);
            bblock = NULL;
        }
    }
}


int
FunctionEmitContext::VaryingCFDepth() const {
    int sum = 0;
@@ -905,6 +1232,14 @@ FunctionEmitContext::All(llvm::Value *mask) {
}


llvm::Value *
FunctionEmitContext::None(llvm::Value *mask) {
    llvm::Value *mmval = LaneMask(mask);
    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
                   LLVMInt32(0), "none_mm_cmp");
}


llvm::Value *
FunctionEmitContext::LaneMask(llvm::Value *v) {
    // Call the target-dependent movmsk function to turn the vector mask
@@ -944,7 +1279,11 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {

llvm::Value *
FunctionEmitContext::GetStringPtr(const std::string &str) {
#ifdef LLVM_3_1svn
    llvm::Constant *lstr = llvm::ConstantDataArray::getString(*g->ctx, str);
#else
    llvm::Constant *lstr = llvm::ConstantArray::get(*g->ctx, str);
#endif
    llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::InternalLinkage;
    llvm::Value *lstrPtr = new llvm::GlobalVariable(*m->module, lstr->getType(),
                                                    true /*isConst*/,
@@ -994,7 +1333,11 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {

static llvm::Value *
lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) {
#ifdef LLVM_3_1svn
    llvm::Constant *sConstant = llvm::ConstantDataArray::getString(*g->ctx, s);
#else
    llvm::Constant *sConstant = llvm::ConstantArray::get(*g->ctx, s);
#endif
    llvm::Value *sPtr = new llvm::GlobalVariable(*m->module, sConstant->getType(),
                                                 true /* const */,
                                                 llvm::GlobalValue::InternalLinkage,
@@ -2588,7 +2931,7 @@ FunctionEmitContext::SyncInst() {


/** When we're gathering from or scattering to a varying atomic type, we need
    to add an appropraite offset to the final address for each lane right
    to add an appropriate offset to the final address for each lane right
    before we use it. Given a varying pointer we're about to use and its
    type, this function determines whether these offsets are needed and
    returns an updated pointer that incorporates these offsets if needed.
@@ -2632,3 +2975,37 @@ FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,

    return BinaryOperator(llvm::Instruction::Add, ptr, offset);
}


CFInfo *
FunctionEmitContext::popCFState() {
    Assert(controlFlowInfo.size() > 0);
    CFInfo *ci = controlFlowInfo.back();
    controlFlowInfo.pop_back();

    if (ci->IsSwitch()) {
        breakTarget = ci->savedBreakTarget;
        continueTarget = ci->savedContinueTarget;
        breakLanesPtr = ci->savedBreakLanesPtr;
        continueLanesPtr = ci->savedContinueLanesPtr;
        loopMask = ci->savedLoopMask;
        switchExpr = ci->savedSwitchExpr;
        defaultBlock = ci->savedDefaultBlock;
        caseBlocks = ci->savedCaseBlocks;
        nextBlocks = ci->savedNextBlocks;
        switchConditionWasUniform = ci->savedSwitchConditionWasUniform;
    }
    else if (ci->IsLoop() || ci->IsForeach()) {
        breakTarget = ci->savedBreakTarget;
        continueTarget = ci->savedContinueTarget;
        breakLanesPtr = ci->savedBreakLanesPtr;
        continueLanesPtr = ci->savedContinueLanesPtr;
        loopMask = ci->savedLoopMask;
    }
    else {
        Assert(ci->IsIf());
        // nothing to do
    }

    return ci;
}
107
ctx.h
107
ctx.h
@@ -161,10 +161,8 @@ public:
    void EndLoop();

    /** Indicates that code generation for a 'foreach' or 'foreach_tiled'
        loop is about to start. The provided basic block pointer indicates
        where control flow should go if a 'continue' statement is executed
        in the loop. */
    void StartForeach(llvm::BasicBlock *continueTarget);
        loop is about to start. */
    void StartForeach();
    void EndForeach();

    /** Emit code for a 'break' statement in a loop. If doCoherenceCheck
@@ -187,12 +185,53 @@ public:
        previous iteration. */
    void RestoreContinuedLanes();

    /** Indicates that code generation for a "switch" statement is about to
        start. isUniform indicates whether the "switch" value is uniform,
        and bbAfterSwitch gives the basic block immediately following the
        "switch" statement. (For example, if the switch condition is
        uniform, we jump here upon executing a "break" statement.) */
    void StartSwitch(bool isUniform, llvm::BasicBlock *bbAfterSwitch);
    /** Indicates the end of code generation for a "switch" statement. */
    void EndSwitch();

    /** Emits code for a "switch" statement in the program.
        @param expr         Gives the value of the expression after the "switch"
        @param defaultBlock Basic block to execute for the "default" case. This
                            should be NULL if there is no "default" label inside
                            the switch.
        @param caseBlocks   vector that stores the mapping from label values
                            after "case" statements to basic blocks corresponding
                            to the "case" labels.
        @param nextBlocks   For each basic block for a "case" or "default"
                            label, this gives the basic block for the
                            immediately-following "case" or "default" label (or
                            the basic block after the "switch" statement for the
                            last label.)
    */
    void SwitchInst(llvm::Value *expr, llvm::BasicBlock *defaultBlock,
                    const std::vector<std::pair<int, llvm::BasicBlock *> > &caseBlocks,
                    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &nextBlocks);

    /** Generates code for a "default" label after a "switch" statement.
        The checkMask parameter indicates whether additional code should be
        generated to check to see if the execution mask is all off after
        the default label (in which case a jump to the following label will
        be issued). */
    void EmitDefaultLabel(bool checkMask, SourcePos pos);

    /** Generates code for a "case" label after a "switch" statement. See
        the documentation for EmitDefaultLabel() for discussion of the
        checkMask parameter. */
    void EmitCaseLabel(int value, bool checkMask, SourcePos pos);

    /** Returns the current number of nested levels of 'varying' control
        flow */
    int VaryingCFDepth() const;

    bool InForeachLoop() const;

    void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }

    /** Step through the code and find label statements; create a basic
        block for each one, so that subsequent calls to
        GetLabeledBasicBlock() return the corresponding basic block. */
@@ -221,6 +260,10 @@ public:
        i1 value that indicates if all of the mask lanes are on. */
    llvm::Value *All(llvm::Value *mask);

    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
        i1 value that indicates if all of the mask lanes are off. */
    llvm::Value *None(llvm::Value *mask);

    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
        i32 value wherein the i'th bit is on if and only if the i'th lane
        of the mask is on. */
@@ -492,10 +535,10 @@ private:
        the loop. */
    llvm::Value *loopMask;

    /** If currently in a loop body, this is a pointer to memory to store a
        mask value that represents which of the lanes have executed a
        'break' statement. If we're not in a loop body, this should be
        NULL. */
    /** If currently in a loop body or switch statement, this is a pointer
        to memory to store a mask value that represents which of the lanes
        have executed a 'break' statement. If we're not in a loop body or
        switch, this should be NULL. */
    llvm::Value *breakLanesPtr;

    /** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
@@ -503,16 +546,49 @@ private:
        'continue' statement. */
    llvm::Value *continueLanesPtr;

    /** If we're inside a loop, this gives the basic block immediately
        after the current loop, which we will jump to if all of the lanes
        have executed a break statement or are otherwise done with the
        loop. */
    /** If we're inside a loop or switch statement, this gives the basic
        block immediately after the current loop or switch, which we will
        jump to if all of the lanes have executed a break statement or are
        otherwise done with it. */
    llvm::BasicBlock *breakTarget;

    /** If we're inside a loop, this gives the block to jump to if all of
        the running lanes have executed a 'continue' statement. */
    llvm::BasicBlock *continueTarget;

    /** @name Switch statement state

        These variables store various state that's active when we're
        generating code for a switch statement. They should all be NULL
        outside of a switch.
        @{
    */

    /** The value of the expression used to determine which case in the
        statements after the switch to execute. */
    llvm::Value *switchExpr;

    /** Map from case label numbers to the basic block that will hold code
        for that case. */
    const std::vector<std::pair<int, llvm::BasicBlock *> > *caseBlocks;

    /** The basic block of code to run for the "default" label in the
        switch statement. */
    llvm::BasicBlock *defaultBlock;

    /** For each basic block for the code for cases (and the default label,
        if present), this map gives the basic block for the immediately
        following case/default label. */
    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *nextBlocks;

    /** Records whether the switch condition was uniform; this is a
        distinct notion from whether the switch represents uniform or
        varying control flow; we may have varying control flow from a
        uniform switch condition if there is a 'break' inside the switch
        that's under varying control flow. */
    bool switchConditionWasUniform;
    /** @} */

    /** A pointer to memory that records which of the program instances
        have executed a 'return' statement (and are thus really truly done
        running any more instructions in this function). */
@@ -556,7 +632,7 @@ private:

    llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
    static void addGSMetadata(llvm::Value *inst, SourcePos pos);
    bool ifsInLoopAllUniform() const;
    bool ifsInCFAllUniform(int cfType) const;
    void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
    llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);

@@ -564,6 +640,11 @@ private:
                 const Type *ptrType);

    void restoreMaskGivenReturns(llvm::Value *oldMask);
    void addSwitchMaskCheck(llvm::Value *mask);
    bool inSwitchStatement() const;
    llvm::Value *getMaskAtSwitchEntry();

    CFInfo *popCFState();

    void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
                 llvm::Value *mask);
1
decl.cpp
1
decl.cpp
@@ -266,6 +266,7 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
        funArgs->push_back(sym);
    }

    if (funSym != NULL)
        funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);

    return funSym;

@@ -1,3 +1,61 @@
=== v1.1.4 === (4 February 2012)

There are two major bugfixes for Windows in this release. First, a number
of failures in AVX code generation on Windows have been fixed; AVX on
Windows now has no known issues. Second, a longstanding bug in parsing 64-bit
integer constants on Windows has been fixed.

This release features a new experimental scalar target, contributed by Gabe
Weisz <gweisz@cs.cmu.edu>. This target ("--target=generic-1") compiles
gangs of single program instances (i.e. programCount == 1); it can be
useful for debugging ispc programs.

The compiler now supports dynamic memory allocation in ispc programs (with
"new" and "delete" operators based on C++). See
http://ispc.github.com/ispc.html#dynamic-memory-allocation in the
documentation for more information.
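
For example, the following sketch (the element type and count here are
arbitrary illustrative choices, not taken from the release notes) has each
program instance allocate, use, and then free its own array:

    int count = 16;
    // each program instance gets its own "count"-element array
    int *ptr = new uniform int[count];
    // ... use ptr ...
    delete[] ptr;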

ispc now performs "short circuit" evaluation of the || and && logical
operators and the ? : selection operator. (This represents the correction
of a major incompatibility with C.) Code like "(index < arraySize &&
array[index] == 1)" thus now executes as in C, where "array[index]" won't
be evaluated unless "index" is less than "arraySize".

The standard library now provides "local" atomic operations, which are
atomic across the gang of program instances (but not across other gangs or
other hardware threads). See the updated documentation on atomics for more
information:
http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences.
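
As a rough illustration (a sketch assuming the atomic_add_local() variant
described at the link above; the counter here is purely illustrative):

    uniform int32 counter = 0;
    // each executing program instance adds 1; the update is atomic with
    // respect to the other program instances in the gang, but not with
    // respect to other gangs or hardware threads
    atomic_add_local(&counter, 1);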

The standard library now offers a clock() function, which returns a uniform
int64 value that counts processor cycles; it can be used for
fine-resolution timing measurements.
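
For instance (a minimal sketch; compute() stands in for an arbitrary
workload):

    uniform int64 start = clock();
    compute();
    // clock() counts processor cycles, so the difference gives the
    // cycle count for compute()
    print("compute() took % cycles\n", clock() - start);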

Finally (of limited interest now): ispc now supports the forthcoming AVX2
instruction set, due with Haswell-generation CPUs. All tests and examples
compile and execute correctly with AVX2. (Thanks specifically to Craig
Topper and Nadav Rotem for work on AVX2 support in LLVM, which made this
possible.)

=== v1.1.3 === (20 January 2012)

With this release, the language now supports "switch" statements, with the
same semantics and syntax as in C.

This release includes fixes for two important performance related issues:
the quality of code generated for "foreach" statements has been
substantially improved (https://github.com/ispc/ispc/issues/151), and a
performance regression with code for "gathers" that was introduced in
v1.1.2 has been fixed in this release.

A number of other small bugs were fixed in this release as well, including
one where invalid memory would sometimes be incorrectly accessed
(https://github.com/ispc/ispc/issues/160).

Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.

=== v1.1.2 === (9 January 2012)

The major new feature in this release is support for "generic" C++

@@ -1,10 +1,10 @@
=============================================================
Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
=============================================================
=====================================
Frequently Asked Questions About ispc
=====================================

This document includes a number of frequently (and not frequently) asked
questions about ispc, the Intel® SPMD Program Compiler. The source to this
document is in the file ``docs/faq.txt`` in the ``ispc`` source
document is in the file ``docs/faq.rst`` in the ``ispc`` source
distribution.

* Understanding ispc's Output
350
docs/ispc.rst
350
docs/ispc.rst
@@ -96,9 +96,13 @@ Contents:
|
||||
|
||||
+ `Declarations and Initializers`_
|
||||
+ `Expressions`_
|
||||
|
||||
* `Dynamic Memory Allocation`_
|
||||
|
||||
+ `Control Flow`_
|
||||
|
||||
* `Conditional Statements: "if"`_
|
||||
* `Conditional Statements: "switch"`_
|
||||
* `Basic Iteration Statements: "for", "while", and "do"`_
|
||||
* `Unstructured Control Flow: "goto"`_
|
||||
* `"Coherent" Control Flow Statements: "cif" and Friends`_
|
||||
@@ -1141,12 +1145,13 @@ in C:
|
||||
|
||||
* Expression syntax and basic types
|
||||
* Syntax for variable declarations
|
||||
* Control flow structures: if, for, while, do
|
||||
* Control flow structures: ``if``, ``for``, ``while``, ``do``, and ``switch``.
|
||||
* Pointers, including function pointers, ``void *``, and C's array/pointer
|
||||
duality (arrays are converted to pointers when passed to functions, etc.)
|
||||
* Structs and arrays
|
||||
* Support for recursive function calls
|
||||
* Support for separate compilation of source files
|
||||
* "Short-circuit" evaluation of ``||``, ``&&`` and ``? :`` operators
|
||||
* The preprocessor
|
||||
|
||||
``ispc`` adds a number of features from C++ and C99 to this base:
|
||||
@@ -1161,6 +1166,7 @@ in C:
|
||||
* The ``inline`` qualifier to indicate that a function should be inlined
|
||||
* Function overloading by parameter type
|
||||
* Hexadecimal floating-point constants
|
||||
* Dynamic memory allocation with ``new`` and ``delete``.
|
||||
|
||||
``ispc`` also adds a number of new features that aren't in C89, C99, or
|
||||
C++:
|
||||
@@ -1179,13 +1185,11 @@ C++:
|
||||
There are a number of features of C89 that are not supported in ``ispc``
|
||||
but are likely to be supported in future releases:
|
||||
|
||||
* Short circuiting of logical operations
|
||||
* There are no types named ``char``, ``short``, or ``long`` (or ``long
|
||||
double``). However, there are built-in ``int8``, ``int16``, and
|
||||
``int64`` types
|
||||
* Character constants
|
||||
* String constants and arrays of characters as strings
|
||||
* ``switch`` statements
|
||||
* ``goto`` statements are partially supported (see `Unstructured Control Flow: "goto"`_)
|
||||
* ``union`` types
|
||||
* Bitfield members of ``struct`` types
|
||||
@@ -1965,19 +1969,137 @@ operator also work as expected.
|
||||
(*fp).a = 0;
|
||||
fp->b = 1;
|
||||
|
||||
As in C and C++, evaluation of the ``||`` and ``&&`` logical operators as
|
||||
well as the selection operator ``? :`` is "short-circuited"; the right hand
|
||||
side won't be evaluated if the value from the left-hand side determines the
|
||||
logical operator's value. For example, in the following code,
|
||||
``array[index]`` won't be evaluated for values of ``index`` that are
|
||||
greater than or equal to ``NUM_ITEMS``.
|
||||
|
||||
::
|
||||
|
||||
if (index < NUM_ITEMS && array[index] > 0) {
|
||||
// ...
|
||||
}
|
||||
|
||||
|
||||
Dynamic Memory Allocation
|
||||
-------------------------
|
||||
|
||||
``ispc`` programs can dynamically allocate (and free) memory, using syntax
|
||||
based on C++'s ``new`` and ``delete`` operators:
|
||||
|
||||
::
|
||||
|
||||
int count = ...;
|
||||
int *ptr = new uniform int[count];
|
||||
// use ptr...
|
||||
delete[] ptr;
|
||||
|
||||
In the above code, each program instance allocates its own ``count`-sized
|
||||
array of ``uniform int`` values, uses that memory, and then deallocates
|
||||
that memory. Uses of ``new`` and ``delete`` in ``ispc`` programs are
|
||||
serviced by corresponding calls the system C library's ``malloc()`` and
|
||||
``free()`` functions.
|
||||
|
||||
After a pointer has been deleted, it is illegal to access the memory it
|
||||
points to. However, note that deletion happens on a per-program-instance
|
||||
basis. In other words, consider the following code:
|
||||
|
||||
::
|
||||
|
||||
int *ptr = new uniform int[count];
|
||||
// use ptr
|
||||
if (count > 1000)
|
||||
delete[] ptr;
|
||||
// ...
|
||||
|
||||
Here, the program instances where ``count`` is greater than 1000 have
|
||||
deleted the dynamically allocated memory pointed to by ``ptr``, but the
|
||||
other program instances have not. As such, it's illegal for the former set
|
||||
of program instances to access ``*ptr``, but it's perfectly fine for the
|
||||
latter set to continue to use the memory ``ptr`` points to. Note that it
|
||||
is illegal to delete a pointer value returned by ``new`` more than one
|
||||
time.
|
||||
|
||||
Sometimes, it's useful to be able to do a single allocation for the entire
|
||||
gang of program instances. A ``new`` statement can be qualified with
|
||||
``uniform`` to indicate a single memory allocation:
|
||||
|
||||
::
|
||||
|
||||
float * uniform ptr = uniform new float[10];
|
||||
|
||||
While a regular call to ``new`` returns a ``varying`` pointer (i.e. a
|
||||
distinct pointer to separately-allocated memory for each program instance),
|
||||
a ``uniform new`` performs a single allocation and returns a ``uniform``
|
||||
pointer.
|

When using ``uniform new``, it's important to be aware of a subtlety: if
the returned pointer is stored in a varying pointer variable (as may be
appropriate and useful for the particular program being written), then the
varying pointer may inadvertently be passed to a subsequent ``delete``
statement, which is an error:

::

    float *ptr = uniform new float[10];
    // use ptr...
    delete ptr;   // ERROR: varying pointer is deleted

In this case, ``ptr`` will be deleted multiple times, once for each
executing program instance, which is an error (unless it happens that only
a single program instance is active in the above code.)
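
One way to avoid this hazard (a sketch, assuming the program can keep the
pointer ``uniform`` for its whole lifetime) is to store the result of
``uniform new`` in a ``uniform`` pointer and delete through that, so the
deletion happens just once for the whole gang:

::

    float * uniform ptr = uniform new float[10];
    // use ptr...
    delete ptr;  // single deletion through the uniform pointer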

When using ``new`` statements, it's important to make an appropriate choice
of ``uniform`` or ``varying`` (as always, the default), for both the
``new`` operator itself as well as the type of data being allocated, based
on the program's needs. Consider the following four memory allocations:

::

    uniform float * uniform p1 = uniform new uniform float[10];
    float * uniform p2 = uniform new float[10];
    uniform float * p3 = new uniform float[10];
    float * p4 = new float[10];

Assuming that a ``float`` is 4 bytes in memory and that the gang size is 8
program instances, the first allocation represents a single allocation
of 40 bytes, the second is a single allocation of 8*4*10 = 320 bytes, the
third is 8 allocations of 40 bytes, and the last performs 8 allocations of
8*4*10 = 320 bytes each.

Note in particular that varying allocations of varying data types, as in
the last case, are rarely desirable in practice: each program instance
performs a separate allocation of ``varying float`` memory, yet it's likely
that each program instance will only ever access a single element of each
``varying float``, which is wasteful.

Although ``ispc`` doesn't support constructors or destructors like C++, it
is possible to provide initializer values with ``new`` statements:

::

    struct Point { float x, y, z; };
    Point *pptr = new Point(10, 20, 30);

Here for example, the "x" element of the returned ``Point`` is initialized
to have the value 10 and so forth. In general, initializer values provided
in ``new`` statements are used to initialize complex data types following
the same rules as initializers for variables, described in
`Declarations and Initializers`_.

Control Flow
------------

``ispc`` supports most of C's control flow constructs, including ``if``,
``for``, ``while``, ``do``. It also supports variants of C's control flow
``switch``, ``for``, ``while``, ``do``. It has limited support for
``goto``, detailed below. It also supports variants of C's control flow
constructs that provide hints about the expected runtime coherence of the
control flow at that statement. It also provides parallel looping
constructs, ``foreach`` and ``foreach_tiled``, all of which will be
detailed in this section.

``ispc`` does not currently support ``switch`` statements or ``goto``.

Conditional Statements: "if"
----------------------------

@@ -1994,6 +2116,31 @@ executes if the condition is false.

    else
        x *= 2.;

Conditional Statements: "switch"
--------------------------------

The ``switch`` conditional statement is also available, again with the same
behavior as in C; the expression used in the ``switch`` must be of integer
type (but it can be uniform or varying). As in C, if there is no ``break``
statement at the end of the code for a given case, execution "falls
through" to the following case. These features are demonstrated in the
code below.

::

    int x = ...;
    switch (x) {
    case 0:
    case 1:
        foo(x);
        /* fall through */
    case 5:
        x = 0;
        break;
    default:
        x *= x;
    }
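
As an illustrative sketch of the ``uniform`` case (``processLinear()`` and
``processGeneric()`` are hypothetical helpers): when the ``switch``
expression is ``uniform``, all program instances take the same case, so
the statement can compile to ordinary unmasked control flow:

::

    uniform int mode = ...;
    switch (mode) {
    case 0:
        processLinear();
        break;
    default:
        processGeneric();
    }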

Basic Iteration Statements: "for", "while", and "do"
----------------------------------------------------

@@ -3242,24 +3389,53 @@ Systems Programming Support

Atomic Operations and Memory Fences
-----------------------------------

The usual range of atomic memory operations are provided in ``ispc``,
including variants to handle both uniform and varying types. As a first
example, consider one variant of the 32-bit integer atomic add routine:
The standard range of atomic memory operations are provided by the
``ispc`` standard library, including variants to handle both uniform and
varying types as well as "local" and "global" atomics.

Local atomics provide atomic behavior across the program instances in a
gang, but not across multiple gangs or memory operations in different
hardware threads. To see why they are needed, consider a histogram
calculation where each program instance in the gang computes which bucket a
value lies in and then increments a corresponding counter. If the code is
written like this:

::

    int32 atomic_add_global(uniform int32 * uniform ptr, int32 delta)
    uniform int count[N_BUCKETS] = ...;
    float value = ...;
    int bucket = clamp(value / N_BUCKETS, 0, N_BUCKETS - 1);
    ++count[bucket];  // ERROR: undefined behavior if collisions

The semantics are the expected ones for an atomic add function: the pointer
points to a single location in memory (the same one for all program
instances), and for each executing program instance, the value stored in
the location that ``ptr`` points to has that program instance's value
"delta" added to it atomically, and the old value at that location is
returned from the function. (Thus, if multiple processors simultaneously
issue atomic adds to the same memory location, the adds will be serialized
by the hardware so that the correct result is computed in the end.
Furthermore, the atomic adds are serialized across the running program
instances.)
then the program's behavior is undefined: whenever multiple program
instances have values that map to the same value of ``bucket``, the
effect of the increment is undefined. (See the discussion in the `Data
Races Within a Gang`_ section; in the case here, there isn't a sequence
point between one program instance updating ``count[bucket]`` and the other
program instance reading its value.)

The ``atomic_add_local()`` function can be used in this case; as a local
atomic it is atomic across the gang of program instances, such that the
expected result is computed.

::

    ...
    int bucket = clamp(value / N_BUCKETS, 0, N_BUCKETS - 1);
    atomic_add_local(&count[bucket], 1);

It uses this variant of the 32-bit integer atomic add routine:

::

    int32 atomic_add_local(uniform int32 * uniform ptr, int32 delta)

The semantics of this routine are typical for an atomic add function: the
pointer here points to a single location in memory (the same one for all
program instances), and for each executing program instance, the value
stored in the location that ``ptr`` points to has that program instance's
value "delta" added to it atomically, and the old value at that location is
returned from the function.
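
As a small sketch of these semantics (not from the manual): if the target
location starts at zero and each of four active program instances adds
one, the per-instance return values are some permutation of 0, 1, 2, and
3, and the location holds 4 afterward:

::

    uniform int32 total = 0;
    int32 old = atomic_add_local(&total, 1);
    // with four active instances, old is a permutation of {0,1,2,3}
    // across the gang, and total == 4 afterward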

One thing to note is that the type of the value being added to is a
``uniform`` integer, while the increment amount and the return value are
@@ -3270,44 +3446,75 @@ atomics for the running program instances may be issued in arbitrary order;
it's not guaranteed that they will be issued in ``programIndex`` order, for
example.

Here are the declarations of the ``int32`` variants of these functions.
There are also ``int64`` equivalents as well as variants that take
``unsigned`` ``int32`` and ``int64`` values. (The ``atomic_swap_global()``
function can be used with ``float`` and ``double`` types as well.)
Global atomics are more powerful than local atomics; they are atomic across
both the program instances in the gang as well as across different
gangs and different hardware threads. For example, for the global variant
of the atomic used above,

::

    int32 atomic_add_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_subtract_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_min_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_max_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_and_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_or_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_xor_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_swap_global(uniform int32 * uniform ptr, int32 value)
    int32 atomic_add_global(uniform int32 * uniform ptr, int32 delta)

There are also variants of these functions that take ``uniform`` values for
the operand and return a ``uniform`` result. These correspond to a single
if multiple processors simultaneously issue atomic adds to the same memory
location, the adds will be serialized by the hardware so that the correct
result is computed in the end.

Here are the declarations of the ``int32`` variants of these functions.
There are also ``int64`` equivalents as well as variants that take
``unsigned`` ``int32`` and ``int64`` values.

::

    int32 atomic_add_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_subtract_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_min_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_max_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_and_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_or_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_xor_{local,global}(uniform int32 * uniform ptr, int32 value)
    int32 atomic_swap_{local,global}(uniform int32 * uniform ptr, int32 value)

Support for ``float`` and ``double`` types is also available. For local
atomics, all but the logical operations are available. (There are
corresponding ``double`` variants of these, not listed here.)

::

    float atomic_add_local(uniform float * uniform ptr, float value)
    float atomic_subtract_local(uniform float * uniform ptr, float value)
    float atomic_min_local(uniform float * uniform ptr, float value)
    float atomic_max_local(uniform float * uniform ptr, float value)
    float atomic_swap_local(uniform float * uniform ptr, float value)

For global atomics, only atomic swap is available for these types:

::

    float atomic_swap_global(uniform float * uniform ptr, float value)
    double atomic_swap_global(uniform double * uniform ptr, double value)

There are also variants of the atomic that take ``uniform`` values for the
operand and return a ``uniform`` result. These correspond to a single
atomic operation being performed for the entire gang of program instances,
rather than one per program instance.

::

    uniform int32 atomic_add_global(uniform int32 * uniform ptr,
    uniform int32 atomic_add_{local,global}(uniform int32 * uniform ptr,
                                            uniform int32 value)
    uniform int32 atomic_subtract_global(uniform int32 * uniform ptr,
    uniform int32 atomic_subtract_{local,global}(uniform int32 * uniform ptr,
                                                 uniform int32 value)
    uniform int32 atomic_min_global(uniform int32 * uniform ptr,
    uniform int32 atomic_min_{local,global}(uniform int32 * uniform ptr,
                                            uniform int32 value)
    uniform int32 atomic_max_global(uniform int32 * uniform ptr,
    uniform int32 atomic_max_{local,global}(uniform int32 * uniform ptr,
                                            uniform int32 value)
    uniform int32 atomic_and_global(uniform int32 * uniform ptr,
    uniform int32 atomic_and_{local,global}(uniform int32 * uniform ptr,
                                            uniform int32 value)
    uniform int32 atomic_or_global(uniform int32 * uniform ptr,
    uniform int32 atomic_or_{local,global}(uniform int32 * uniform ptr,
                                           uniform int32 value)
    uniform int32 atomic_xor_global(uniform int32 * uniform ptr,
    uniform int32 atomic_xor_{local,global}(uniform int32 * uniform ptr,
                                            uniform int32 value)
    uniform int32 atomic_swap_global(uniform int32 * uniform ptr,
    uniform int32 atomic_swap_{local,global}(uniform int32 * uniform ptr,
                                             uniform int32 newval)

Be careful that you use the atomic function that you mean to; consider the
@@ -3332,8 +3539,7 @@ will cause the desired atomic add function to be called.

::

    extern uniform int32 counter;
    int32 one = 1;
    int32 myCounter = atomic_add_global(&counter, one);
    int32 myCounter = atomic_add_global(&counter, (varying int32)1);

There is a third variant of each of these atomic functions that takes a
``varying`` pointer; this allows each program instance to issue an atomic
@@ -3343,30 +3549,27 @@ the same location in memory!)

::

    int32 atomic_add_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_subtract_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_min_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_max_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_and_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_or_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_xor_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_swap_global(uniform int32 * varying ptr, int32 value)
    int32 atomic_add_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_subtract_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_min_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_max_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_and_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_or_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_xor_{local,global}(uniform int32 * varying ptr, int32 value)
    int32 atomic_swap_{local,global}(uniform int32 * varying ptr, int32 value)
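
As an illustrative sketch (reusing the hypothetical histogram variables
from earlier in this section): indexing an array with a ``varying`` value
and taking the element's address yields exactly this kind of ``varying``
pointer, so each program instance atomically increments its own bucket:

::

    uniform int count[N_BUCKETS] = ...;
    int bucket = clamp(value / N_BUCKETS, 0, N_BUCKETS - 1);
    atomic_add_global(&count[bucket], 1);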

There are also atomic swap and "compare and exchange" functions.
Compare and exchange atomically compares the value in "val" to
"compare"--if they match, it assigns "newval" to "val". In either case,
the old value of "val" is returned. (As with the other atomic operations,
there are also ``unsigned`` and 64-bit variants of this function.
Furthermore, there are ``float`` and ``double`` variants as well.)
There are also atomic "compare and exchange" functions. Compare and
exchange atomically compares the value in "val" to "compare"--if they
match, it assigns "newval" to "val". In either case, the old value of
"val" is returned. (As with the other atomic operations, there are also
``unsigned`` and 64-bit variants of this function. Furthermore, there are
``float`` and ``double`` variants as well.)

::

    int32 atomic_swap_global(uniform int32 * uniform ptr, int32 newvalue)
    uniform int32 atomic_swap_global(uniform int32 * uniform ptr,
                                     uniform int32 newvalue)
    int32 atomic_compare_exchange_global(uniform int32 * uniform ptr,
    int32 atomic_compare_exchange_{local,global}(uniform int32 * uniform ptr,
                                                 int32 compare, int32 newval)
    uniform int32 atomic_compare_exchange_global(uniform int32 * uniform ptr,
    uniform int32 atomic_compare_exchange_{local,global}(uniform int32 * uniform ptr,
                                                         uniform int32 compare, uniform int32 newval)
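
As an illustrative sketch (assuming the ``float`` variant of
``atomic_compare_exchange_global()`` mentioned above), a compare-and-exchange
loop can synthesize atomics the library doesn't provide directly, such as
a global ``float`` minimum:

::

    uniform float atomic_min_float_global(uniform float * uniform ptr,
                                          uniform float value) {
        uniform float old = *ptr;
        while (value < old) {
            uniform float prev = atomic_compare_exchange_global(ptr, old, value);
            if (prev == old)
                break;    // our value was stored
            old = prev;   // another thread updated the location; retry
        }
        return old;
    }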

``ispc`` also has a standard library routine that inserts a memory barrier
@@ -3419,12 +3622,27 @@ pointer types.

System Information
------------------

A routine is available to find the number of CPU cores available in the
system:
The value of a high-precision hardware clock counter is returned by the
``clock()`` routine; its value increments by one each processor cycle.
Thus, taking the difference between the values returned by ``clock()`` at
different points in program execution gives the number of cycles between
those points in the program.

::

    int num_cores()
    uniform int64 clock()

Note that ``clock()`` flushes the processor pipeline. It has an overhead
of a hundred or so cycles, so for very fine-grained measurements, it may be
worthwhile to measure the cost of calling ``clock()`` and subtracting that
value from reported results.
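
A minimal sketch of the measurement idiom described above (``compute()``
stands in for a hypothetical workload being timed):

::

    uniform int64 start = clock();
    compute();
    uniform int64 cycles = clock() - start;  // includes clock()'s own overhead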

A routine is also available to find the number of CPU cores available in
the system:

::

    uniform int num_cores()

This value can be useful for adapting the granularity of parallel task
decomposition depending on the number of processors in the system.
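
For example, a sketch (assuming ``ispc``'s ``launch`` tasking construct
and a hypothetical ``doChunk()`` task) that scales the number of launched
tasks with the available cores:

::

    uniform int nTasks = 4 * num_cores();  // a few tasks per core for load balancing
    launch[nTasks] doChunk();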

@@ -45,8 +45,7 @@
      developers mailing list</a></li>
    <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
    <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
    <li><a href="doxygen/index.html">Doxygen documentation of
      <tt>ispc</tt> source code</a></li>
    <li><a href="doxygen/index.html">Doxygen</a></li>
   </ul>
  </div>
 </div>

@@ -45,8 +45,7 @@
      developers mailing list</a></li>
    <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
    <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
    <li><a href="doxygen/index.html">Doxygen documentation of
      <tt>ispc</tt> source code</a></li>
    <li><a href="doxygen/index.html">Doxygen</a></li>
   </ul>
  </div>
 </div>

@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.

PROJECT_NUMBER = 1.1.2
PROJECT_NUMBER = 1.1.4

# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.

@@ -82,7 +82,7 @@ static inline void vnormalize(vec &v) {
}


static inline void
static void
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);
@@ -124,7 +124,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
}


static inline void
static void
orthoBasis(vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
@@ -147,7 +147,7 @@ orthoBasis(vec basis[3], vec n) {
}


static inline float
static float
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
                  RNGState &rngstate) {
    float eps = 0.0001f;
@@ -212,49 +212,12 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
    RNGState rngstate;

    seed_rng(&rngstate, y0);
    float invSamples = 1.f / nsubsamples;

    // Compute the mapping between the 'programCount'-wide program
    // instances running in parallel and samples in the image.
    //
    // For now, we'll always take four samples per pixel, so start by
    // initializing du and dv with offsets into subpixel samples.  We'll
    // take care of further updating du and dv for the case where we're
    // doing more than 4 program instances in parallel shortly.
    uniform float uSteps[4] = { 0, 1, 0, 1 };
    uniform float vSteps[4] = { 0, 0, 1, 1 };
    float du = uSteps[programIndex % 4] / nsubsamples;
    float dv = vSteps[programIndex % 4] / nsubsamples;
    foreach_tiled(y = y0 ... y1, x = 0 ... w,
                  u = 0 ... nsubsamples, v = 0 ... nsubsamples) {
        float du = (float)u * invSamples, dv = (float)v * invSamples;

    // Now handle the case where we are able to do more than one pixel's
    // worth of work at once.  nx records the number of pixels in the x
    // direction we do per iteration and ny the number in y.
    uniform int nx = 1, ny = 1;

    // FIXME: We actually need ny to be 1 regardless of the decomposition,
    // since the task decomposition is one scanline high.

    if (programCount == 8) {
        // Do two pixels at once in the x direction
        nx = 2;
        if (programIndex >= 4)
            // And shift the offsets for the second pixel's worth of work
            ++du;
    }
    else if (programCount == 16) {
        nx = 4;
        ny = 1;
        if (programIndex >= 4 && programIndex < 8)
            ++du;
        if (programIndex >= 8 && programIndex < 12)
            du += 2;
        if (programIndex >= 12)
            du += 3;
    }

    // Now loop over all of the pixels, stepping in x and y as calculated
    // above.  (Assumes that ny divides y and nx divides x...)
    for (uniform int y = y0; y < y1; y += ny) {
        for (uniform int x = 0; x < w; x += nx) {
            // Figure out x,y pixel in NDC
            float px = (x + du - (w / 2.0f)) / (w / 2.0f);
            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
@@ -279,37 +242,14 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,

            // Note use of 'coherent' if statement; the set of rays we
            // trace will often all hit or all miss the scene
            cif (isect.hit)
            cif (isect.hit) {
                ret = ambient_occlusion(isect, plane, spheres, rngstate);
                ret *= invSamples * invSamples;

                // This is a little grungy; we have results for
                // programCount-worth of values.  Because we're doing 2x2
                // subsamples, we need to peel them off in groups of four,
                // average the four values for each pixel, and update the
                // output image.
                //
                // Store the varying value to a uniform array of the same size.
                // See the discussion about communication among program
                // instances in the ispc user's manual for more discussion on
                // this idiom.
                uniform float retArray[programCount];
                retArray[programIndex] = ret;

                // offset to the first pixel in the image
                uniform int offset = 3 * (y * w + x);
                for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
                    // Get the four sample values for this pixel
                    uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
                                           retArray[p+3];

                    // Normalize by number of samples taken
                    sumret /= nsubsamples * nsubsamples;

                    // Store result in the image
                    image[offset+0] = sumret;
                    image[offset+1] = sumret;
                    image[offset+2] = sumret;
                }
                int offset = 3 * (y * w + x);
                atomic_add_local(&image[offset], ret);
                atomic_add_local(&image[offset+1], ret);
                atomic_add_local(&image[offset+2], ret);
            }
        }
    }

@@ -14,7 +14,7 @@ CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ))

default: $(EXAMPLE)

all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16
all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar

.PHONY: dirs clean

@@ -57,3 +57,9 @@ objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp

$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)

objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
	$(ISPC) $< -o $@ --target=generic-1

$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)

@@ -158,38 +158,22 @@ IntersectLightsWithTileMinMax(
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;

    // Parallelize across frustum planes.
    // We really only have four side planes here, but write the code to
    // handle programCount > 4 robustly
    uniform float frustumPlanes_xy[programCount];
    uniform float frustumPlanes_z[programCount];
    uniform float frustumPlanes_xy[4] = {
        -(cameraProj_11 * gBufferScale_x),
         (cameraProj_11 * gBufferScale_x),
         (cameraProj_22 * gBufferScale_y),
        -(cameraProj_22 * gBufferScale_y) };
    uniform float frustumPlanes_z[4] = {
         tileEndX - gBufferScale_x,
        -tileStartX + gBufferScale_x,
         tileEndY - gBufferScale_y,
        -tileStartY + gBufferScale_y };

    // TODO: If programIndex < 4 here?  Don't care about masking off the
    // rest but if interleaving ("x2" modes) the other lanes should ideally
    // not be emitted...
    {
        // This one is totally constant over the whole screen... worth pulling it up at all?
        float frustumPlanes_xy_v;
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_11 * gBufferScale_x));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2,  (cameraProj_22 * gBufferScale_y));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));

        float frustumPlanes_z_v;
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileEndX - gBufferScale_x);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 2, tileEndY - gBufferScale_y);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);

        // Normalize
        float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
                           frustumPlanes_z_v * frustumPlanes_z_v);
        frustumPlanes_xy_v *= norm;
        frustumPlanes_z_v *= norm;

        // Save out for uniform use later
        frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
        frustumPlanes_z[programIndex] = frustumPlanes_z_v;
    for (uniform int i = 0; i < 4; ++i) {
        uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
                                   frustumPlanes_z[i] * frustumPlanes_z[i]);
        frustumPlanes_xy[i] *= norm;
        frustumPlanes_z[i] *= norm;
    }

    uniform int32 tileNumLights = 0;
@@ -601,30 +585,20 @@ SplitTileMinMax(
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;

    // Parallelize across frustum planes
    // Only have 2 frustum split planes here so may not be worth it, but
    // we'll do it for now for consistency
    uniform float frustumPlanes_xy[programCount];
    uniform float frustumPlanes_z[programCount];

    // This one is totally constant over the whole screen... worth pulling it up at all?
    float frustumPlanes_xy_v;
    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_22 * gBufferScale_y));

    float frustumPlanes_z_v;
    frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
    frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
    uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
                                           (cameraProj_22 * gBufferScale_y) };
    uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
                                         tileMidY - gBufferScale_y };

    // Normalize
    float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
                       frustumPlanes_z_v * frustumPlanes_z_v);
    frustumPlanes_xy_v *= norm;
    frustumPlanes_z_v *= norm;

    // Save out for uniform use later
    frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
    frustumPlanes_z[programIndex] = frustumPlanes_z_v;
    uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] +
                                    frustumPlanes_z[0] * frustumPlanes_z[0]),
                              rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] +
                                    frustumPlanes_z[1] * frustumPlanes_z[1]) };
    frustumPlanes_xy[0] *= norm[0];
    frustumPlanes_xy[1] *= norm[1];
    frustumPlanes_z[0] *= norm[0];
    frustumPlanes_z[1] *= norm[1];

    // Initialize
    uniform int32 subtileLightOffset[4];

@@ -251,6 +251,14 @@ static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
    return cond ? a : b; \
}
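
// Shift-by-uniform-scalar variants: unlike the BINARY_OP_CAST shifts above,
// which take a per-lane vector of shift counts, these apply a single int32
// shift amount to all 16 lanes.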
#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP)         \
static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {   \
    TYPE ret;                                       \
    for (int i = 0; i < 16; ++i)                    \
        ret.v[i] = (CAST)(a.v[i]) OP b;             \
    return ret;                                     \
}

#define SMEAR(VTYPE, NAME, STYPE)                   \
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {  \
    VTYPE ret;                                      \
@@ -386,6 +394,10 @@ BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %)
BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>)

SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)

CMP_OP(__vec16_i8, int8_t, __equal, ==)
CMP_OP(__vec16_i8, int8_t, __not_equal, !=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
@@ -425,6 +437,10 @@ BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %)
BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>)

SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)

CMP_OP(__vec16_i16, int16_t, __equal, ==)
CMP_OP(__vec16_i16, int16_t, __not_equal, !=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
@@ -464,6 +480,10 @@ BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %)
BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>)

SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)

CMP_OP(__vec16_i32, int32_t, __equal, ==)
CMP_OP(__vec16_i32, int32_t, __not_equal, !=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
@@ -503,6 +523,10 @@ BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %)
BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>)

SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)

CMP_OP(__vec16_i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
@@ -938,7 +962,7 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
///////////////////////////////////////////////////////////////////////////
// masked load/store

static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
                                              __vec16_i1 mask) {
    __vec16_i8 ret;
    int8_t *ptr = (int8_t *)p;
@@ -948,7 +972,7 @@ static FORCEINLINE __vec16_i8 __masked_load_8(unsigned char *p,
    return ret;
}

static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
                                                __vec16_i1 mask) {
    __vec16_i16 ret;
    int16_t *ptr = (int16_t *)p;
@@ -958,7 +982,7 @@ static FORCEINLINE __vec16_i16 __masked_load_16(unsigned char *p,
    return ret;
}

static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
                                                __vec16_i1 mask) {
    __vec16_i32 ret;
    int32_t *ptr = (int32_t *)p;
@@ -968,7 +992,7 @@ static FORCEINLINE __vec16_i32 __masked_load_32(unsigned char *p,
    return ret;
}

static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
                                                __vec16_i1 mask) {
    __vec16_i64 ret;
    int64_t *ptr = (int64_t *)p;
@@ -978,7 +1002,7 @@ static FORCEINLINE __vec16_i64 __masked_load_64(unsigned char *p,
    return ret;
}

static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val,
static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val,
                                         __vec16_i1 mask) {
    int8_t *ptr = (int8_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -986,7 +1010,7 @@ static FORCEINLINE void __masked_store_8(unsigned char *p, __vec16_i8 val,
            ptr[i] = val.v[i];
}

static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val,
static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val,
                                          __vec16_i1 mask) {
    int16_t *ptr = (int16_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -994,7 +1018,7 @@ static FORCEINLINE void __masked_store_16(unsigned char *p, __vec16_i16 val,
            ptr[i] = val.v[i];
}

static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val,
static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val,
                                          __vec16_i1 mask) {
    int32_t *ptr = (int32_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -1002,7 +1026,7 @@ static FORCEINLINE void __masked_store_32(unsigned char *p, __vec16_i32 val,
            ptr[i] = val.v[i];
}

static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
                                          __vec16_i1 mask) {
    int64_t *ptr = (int64_t *)p;
    for (int i = 0; i < 16; ++i)
@@ -1010,19 +1034,41 @@ static FORCEINLINE void __masked_store_64(unsigned char *p, __vec16_i64 val,
            ptr[i] = val.v[i];
}
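
// The "blend" variants below have no separate blend-based implementation in
// this reference target; they simply forward to the masked stores above.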
static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val,
                                               __vec16_i1 mask) {
    __masked_store_8(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val,
                                                __vec16_i1 mask) {
    __masked_store_16(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val,
                                                __vec16_i1 mask) {
    __masked_store_32(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
                                                __vec16_i1 mask) {
    __masked_store_64(p, val, mask);
}

///////////////////////////////////////////////////////////////////////////
// gather/scatter

// offsets * offsetScale is in bytes (for all of these)

#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                     \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset,       \
                              uint32_t scale, OTYPE constOffset,           \
                              __vec16_i1 mask) {                           \
    VTYPE ret;                                                             \
    int8_t *base = (int8_t *)b;                                            \
    for (int i = 0; i < 16; ++i)                                           \
        if ((mask.v & (1 << i)) != 0) {                                    \
            STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]);           \
            STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] +     \
                                   constOffset.v[i]);                      \
            ret.v[i] = *ptr;                                               \
        }                                                                  \
    return ret;                                                            \
@@ -1061,12 +1107,14 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
// scatter

#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                    \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset,        \
                             uint32_t scale, OTYPE constOffset,            \
                             VTYPE val, __vec16_i1 mask) {                 \
    int8_t *base = (int8_t *)b;                                            \
    for (int i = 0; i < 16; ++i)                                           \
        if ((mask.v & (1 << i)) != 0) {                                    \
            STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]);           \
            STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] +     \
                                   constOffset.v[i]);                      \
            *ptr = val.v[i];                                               \
        }                                                                  \
}

@@ -51,8 +51,8 @@
#define FORCEINLINE __attribute__((always_inline)) inline
#endif

//CO#undef FORCEINLINE
//CO#define FORCEINLINE
#undef FORCEINLINE
#define FORCEINLINE

typedef float __vec1_f;
typedef double __vec1_d;
@@ -303,6 +303,13 @@ static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, __vec4_i8 b) {
                     _mm_extract_epi8(a.v, 3) << _mm_extract_epi8(b.v, 3));
}

static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, int32_t b) {
    return __vec4_i8(_mm_extract_epi8(a.v, 0) << b,
                     _mm_extract_epi8(a.v, 1) << b,
                     _mm_extract_epi8(a.v, 2) << b,
                     _mm_extract_epi8(a.v, 3) << b);
}

static FORCEINLINE __vec4_i8 __udiv(__vec4_i8 a, __vec4_i8 b) {
    return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) /
                     (uint8_t)_mm_extract_epi8(b.v, 0),
@@ -358,6 +365,13 @@ static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, __vec4_i8 b) {
                     (uint8_t)_mm_extract_epi8(b.v, 3));
}

static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, int32_t b) {
    return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) >> b,
                     (uint8_t)_mm_extract_epi8(a.v, 1) >> b,
                     (uint8_t)_mm_extract_epi8(a.v, 2) >> b,
                     (uint8_t)_mm_extract_epi8(a.v, 3) >> b);
}

static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
    return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >>
                     (int8_t)_mm_extract_epi8(b.v, 0),
@@ -369,6 +383,13 @@ static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
                     (int8_t)_mm_extract_epi8(b.v, 3));
}

static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, int32_t b) {
    return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >> b,
                     (int8_t)_mm_extract_epi8(a.v, 1) >> b,
                     (int8_t)_mm_extract_epi8(a.v, 2) >> b,
                     (int8_t)_mm_extract_epi8(a.v, 3) >> b);
}

static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) {
    __m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
    return __vec4_i1(_mm_extract_epi8(cmp, 0),
@@ -547,6 +568,10 @@ static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, __vec4_i16 b) {
                      _mm_extract_epi16(a.v, 3) << _mm_extract_epi16(b.v, 3));
}

static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, int32_t b) {
    return _mm_sll_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i16 __udiv(__vec4_i16 a, __vec4_i16 b) {
    return __vec4_i16((uint16_t)_mm_extract_epi16(a.v, 0) /
                      (uint16_t)_mm_extract_epi16(b.v, 0),
@@ -602,6 +627,10 @@ static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, __vec4_i16 b) {
                      (uint16_t)_mm_extract_epi16(b.v, 3));
}

static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, int32_t b) {
    return _mm_srl_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
    return __vec4_i16((int16_t)_mm_extract_epi16(a.v, 0) >>
                      (int16_t)_mm_extract_epi16(b.v, 0),
@@ -613,6 +642,10 @@ static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
                      (int16_t)_mm_extract_epi16(b.v, 3));
}

static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, int32_t b) {
    return _mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) {
    __m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
    return __vec4_i1(_mm_extract_epi16(cmp, 0),
@@ -789,9 +822,6 @@ static FORCEINLINE __vec4_i32 __xor(__vec4_i32 a, __vec4_i32 b) {
}

static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, __vec4_i32 b) {
    // FIXME: if we can determine at compile time that b has the same value
    // across all elements, then we can use _mm_sll_epi32.

    /* fixme: llvm generates this code for shift left, which is presumably
       more efficient than doing each component individually as below.

@@ -813,57 +843,92 @@ _f___ii: ## @f___ii
        ret

    */
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) << _mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) << _mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) << _mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) << _mm_extract_epi32(b.v, 3));
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) <<
                      _mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) <<
                      _mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) <<
                      _mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) <<
                      _mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, int32_t b) {
    return _mm_sll_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i32 __udiv(__vec4_i32 a, __vec4_i32 b) {
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) / (uint32_t)_mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) / (uint32_t)_mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) / (uint32_t)_mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) / (uint32_t)_mm_extract_epi32(b.v, 3));
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) /
                      (uint32_t)_mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) /
                      (uint32_t)_mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) /
                      (uint32_t)_mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) /
                      (uint32_t)_mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __sdiv(__vec4_i32 a, __vec4_i32 b) {
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) / (int32_t)_mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) / (int32_t)_mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) / (int32_t)_mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) / (int32_t)_mm_extract_epi32(b.v, 3));
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) /
                      (int32_t)_mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) /
                      (int32_t)_mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) /
                      (int32_t)_mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) /
                      (int32_t)_mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __urem(__vec4_i32 a, __vec4_i32 b) {
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) % (uint32_t)_mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) % (uint32_t)_mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) % (uint32_t)_mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) % (uint32_t)_mm_extract_epi32(b.v, 3));
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) %
                      (uint32_t)_mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) %
                      (uint32_t)_mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) %
                      (uint32_t)_mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) %
                      (uint32_t)_mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __srem(__vec4_i32 a, __vec4_i32 b) {
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) % (int32_t)_mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) % (int32_t)_mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) % (int32_t)_mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) % (int32_t)_mm_extract_epi32(b.v, 3));
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) %
                      (int32_t)_mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) %
                      (int32_t)_mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) %
                      (int32_t)_mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) %
                      (int32_t)_mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, __vec4_i32 b) {
    // FIXME: if we can determine at compile time that b has the same value
    // across all elements, e.g. using gcc's __builtin_constant_p, then we
    // can use _mm_srl_epi32.
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3));
    return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >>
                      _mm_extract_epi32(b.v, 0),
                      (uint32_t)_mm_extract_epi32(a.v, 1) >>
                      _mm_extract_epi32(b.v, 1),
                      (uint32_t)_mm_extract_epi32(a.v, 2) >>
                      _mm_extract_epi32(b.v, 2),
                      (uint32_t)_mm_extract_epi32(a.v, 3) >>
                      _mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, int32_t b) {
    return _mm_srl_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, __vec4_i32 b) {
    // FIXME: if we can determine at compile time that b has the same value
    // across all elements, then we can use _mm_sra_epi32.
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3));
    return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >>
                      _mm_extract_epi32(b.v, 0),
                      (int32_t)_mm_extract_epi32(a.v, 1) >>
                      _mm_extract_epi32(b.v, 1),
                      (int32_t)_mm_extract_epi32(a.v, 2) >>
                      _mm_extract_epi32(b.v, 2),
                      (int32_t)_mm_extract_epi32(a.v, 3) >>
                      _mm_extract_epi32(b.v, 3));
}

static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, int32_t b) {
    return _mm_sra_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}

static FORCEINLINE __vec4_i1 __equal(__vec4_i32 a, __vec4_i32 b) {
@@ -876,10 +941,8 @@ static FORCEINLINE __vec4_i1 __not_equal(__vec4_i32 a, __vec4_i32 b) {
}

static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i32 a, __vec4_i32 b) {
    a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000));
    b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000));
    return _mm_or_si128(_mm_cmplt_epi32(a.v, b.v),
                        _mm_cmpeq_epi32(a.v, b.v));
    // a<=b == (min(a,b) == a)
    return _mm_cmpeq_epi32(_mm_min_epu32(a.v, b.v), a.v);
}

static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) {
@@ -888,10 +951,8 @@ static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) {
}

static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i32 a, __vec4_i32 b) {
    a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000));
    b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000));
    return _mm_or_si128(_mm_cmpgt_epi32(a.v, b.v),
                        _mm_cmpeq_epi32(a.v, b.v));
    // a>=b == (max(a,b) == a)
    return _mm_cmpeq_epi32(_mm_max_epu32(a.v, b.v), a.v);
}

static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i32 a, __vec4_i32 b) {
@@ -1016,6 +1077,12 @@ static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, __vec4_i64 b) {
                      _mm_extract_epi64(a.v[1], 1) << _mm_extract_epi64(b.v[1], 1));
}

static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, int32_t b) {
    __m128i amt = _mm_set_epi32(0, 0, 0, b);
    return __vec4_i64(_mm_sll_epi64(a.v[0], amt),
                      _mm_sll_epi64(a.v[1], amt));
}

static FORCEINLINE __vec4_i64 __udiv(__vec4_i64 a, __vec4_i64 b) {
    return __vec4_i64((uint64_t)_mm_extract_epi64(a.v[0], 0) /
                      (uint64_t)_mm_extract_epi64(b.v[0], 0),
@@ -1071,6 +1138,12 @@ static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, __vec4_i64 b) {
                      (uint64_t)_mm_extract_epi64(b.v[1], 1));
}

static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, int32_t b) {
    __m128i amt = _mm_set_epi32(0, 0, 0, b);
    return __vec4_i64(_mm_srl_epi64(a.v[0], amt),
                      _mm_srl_epi64(a.v[1], amt));
}

static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
    return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >>
                      (int64_t)_mm_extract_epi64(b.v[0], 0),
@@ -1082,6 +1155,13 @@ static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
                      (int64_t)_mm_extract_epi64(b.v[1], 1));
}

static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, int32_t b) {
    return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >> b,
                      (int64_t)_mm_extract_epi64(a.v[0], 1) >> b,
                      (int64_t)_mm_extract_epi64(a.v[1], 0) >> b,
                      (int64_t)_mm_extract_epi64(a.v[1], 1) >> b);
}

static FORCEINLINE __vec4_i1 __equal(__vec4_i64 a, __vec4_i64 b) {
    __m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]);
    __m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]);
@@ -2328,7 +2408,7 @@ static FORCEINLINE uint64_t __reduce_max_uint64(__vec4_i64 v) {
///////////////////////////////////////////////////////////////////////////
// masked load/store

static FORCEINLINE __vec4_i8 __masked_load_8(unsigned char *p,
static FORCEINLINE __vec4_i8 __masked_load_8(void *p,
                                             __vec4_i1 mask) {
    int8_t r[4];
    int8_t *ptr = (int8_t *)p;
@@ -2348,7 +2428,7 @@ static FORCEINLINE __vec4_i8 __masked_load_8(unsigned char *p,
    return __vec4_i8(r[0], r[1], r[2], r[3]);
}

static FORCEINLINE __vec4_i16 __masked_load_16(unsigned char *p,
static FORCEINLINE __vec4_i16 __masked_load_16(void *p,
                                               __vec4_i1 mask) {
    int16_t r[4];
    int16_t *ptr = (int16_t *)p;
@@ -2372,7 +2452,7 @@ static FORCEINLINE __vec4_i16 __masked_load_16(unsigned char *p,
    return __vec4_i16(r[0], r[1], r[2], r[3]);
}

static FORCEINLINE __vec4_i32 __masked_load_32(unsigned char *p,
static FORCEINLINE __vec4_i32 __masked_load_32(void *p,
                                               __vec4_i1 mask) {
    __m128i r = _mm_set_epi32(0, 0, 0, 0);
    int32_t *ptr = (int32_t *)p;
@@ -2395,7 +2475,7 @@ static FORCEINLINE __vec4_i32 __masked_load_32(unsigned char *p,
    return r;
}

static FORCEINLINE __vec4_i64 __masked_load_64(unsigned char *p,
static FORCEINLINE __vec4_i64 __masked_load_64(void *p,
                                               __vec4_i1 mask) {
    uint64_t r[4];
    uint64_t *ptr = (uint64_t *)p;
@@ -2418,7 +2498,7 @@ static FORCEINLINE __vec4_i64 __masked_load_64(unsigned char *p,
    return __vec4_i64(r[0], r[1], r[2], r[3]);
}

static FORCEINLINE void __masked_store_8(unsigned char *p, __vec4_i8 val,
static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val,
                                         __vec4_i1 mask) {
    int8_t *ptr = (int8_t *)p;

@@ -2439,7 +2519,8 @@ static FORCEINLINE void __masked_store_8(unsigned char *p, __vec4_i8 val,
        ptr[3] = _mm_extract_epi8(val.v, 3);
}

static FORCEINLINE void __masked_store_16(unsigned char *p, __vec4_i16 val, __vec4_i1 mask) {
static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val,
                                          __vec4_i1 mask) {
    int16_t *ptr = (int16_t *)p;

    uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2459,7 +2540,7 @@ static FORCEINLINE void __masked_store_16(unsigned char *p, __vec4_i16 val, __ve
        ptr[3] = _mm_extract_epi16(val.v, 3);
}

static FORCEINLINE void __masked_store_32(unsigned char *p, __vec4_i32 val,
static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val,
                                          __vec4_i1 mask) {
    int32_t *ptr = (int32_t *)p;
    uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2479,7 +2560,7 @@ static FORCEINLINE void __masked_store_32(unsigned char *p, __vec4_i32 val,
        ptr[3] = _mm_extract_epi32(val.v, 3);
}

static FORCEINLINE void __masked_store_64(unsigned char *p, __vec4_i64 val,
static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val,
                                          __vec4_i1 mask) {
    int64_t *ptr = (int64_t *)p;
    uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2499,58 +2580,82 @@ static FORCEINLINE void __masked_store_64(unsigned char *p, __vec4_i64 val,
        ptr[3] = _mm_extract_epi64(val.v[1], 1);
}

static FORCEINLINE void __masked_store_blend_8(void *p, __vec4_i8 val,
                                               __vec4_i1 mask) {
    __masked_store_8(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_16(void *p, __vec4_i16 val,
                                                __vec4_i1 mask) {
    __masked_store_16(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_32(void *p, __vec4_i32 val,
                                                __vec4_i1 mask) {
    // FIXME: do a load, blendvps, store here...
    __masked_store_32(p, val, mask);
}

static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val,
                                                __vec4_i1 mask) {
    // FIXME: do a 2x (load, blendvps, store) here...
    __masked_store_64(p, val, mask);
}

///////////////////////////////////////////////////////////////////////////
// gather/scatter
// offsets * offsetScale is in bytes (for all of these)

template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
                     __vec4_i32 offsets, uint32_t scale, __vec4_i1 mask) {
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
                     uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
    RetScalar r[4];
#if 1
    // "Fast gather" trick...
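    // Masked-off lanes have their offsets forced to zero below, so every
    // lane does a (harmless) read relative to the base pointer rather than
    // branching on the mask; this assumes the base address is readable.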
    offsets = __select(mask, offsets, __smear_i32(0));
    int offset = scale * _mm_extract_epi32(offsets.v, 0);
    constOffset = __select(mask, constOffset, __smear_i32(0));

    int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
    RetScalar *ptr = (RetScalar *)(p + offset);
    r[0] = *ptr;

    offset = scale * _mm_extract_epi32(offsets.v, 1);
    offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
    ptr = (RetScalar *)(p + offset);
    r[1] = *ptr;

    offset = scale * _mm_extract_epi32(offsets.v, 2);
    offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
    ptr = (RetScalar *)(p + offset);
    r[2] = *ptr;

    offset = scale * _mm_extract_epi32(offsets.v, 3);
    offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
    ptr = (RetScalar *)(p + offset);
    r[3] = *ptr;
#else
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 0);
        int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[0] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 1);
        int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[1] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 2);
        int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[2] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 3);
        int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[3] = *ptr;
    }
@@ -2558,54 +2663,57 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p,
    return RetVec(r[0], r[1], r[2], r[3]);
}
||||
|
||||
|
||||
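The "fast gather" path above trades per-lane branching for unconditional loads: the __select calls force the offsets of masked-off lanes to zero, so every lane reads from a valid address (the base pointer) and inactive lanes just produce garbage that the caller discards via the mask. A scalar sketch of the idea, assuming reading base[0] is always safe (all names here are illustrative):

    #include <stdint.h>

    static void fast_gather_sketch(const int32_t *base, const int32_t offsets[4],
                                   const bool mask[4], int32_t out[4]) {
        for (int i = 0; i < 4; ++i) {
            // Redirect inactive lanes to element 0 instead of skipping them.
            int32_t off = mask[i] ? offsets[i] : 0;
            out[i] = base[off];   // masked-off results are garbage by design
        }
    }

The #else branch keeps the branchy per-lane version around for comparison.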
 template<typename RetVec, typename RetScalar>
 static FORCEINLINE RetVec
 lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
-                     uint32_t scale, __vec4_i1 mask) {
+                     uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
     offsets = __select(mask, offsets, __smear_i64(0));
-    int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
+    constOffset = __select(mask, constOffset, __smear_i64(0));
+
+    int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
     RetScalar *ptr = (RetScalar *)(p + offset);
     r[0] = *ptr;

-    offset = scale * _mm_extract_epi64(offsets.v[0], 1);
+    offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
     ptr = (RetScalar *)(p + offset);
     r[1] = *ptr;

-    offset = scale * _mm_extract_epi64(offsets.v[1], 0);
+    offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
     ptr = (RetScalar *)(p + offset);
     r[2] = *ptr;

-    offset = scale * _mm_extract_epi64(offsets.v[1], 1);
+    offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
     ptr = (RetScalar *)(p + offset);
     r[3] = *ptr;
 #else
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
         RetScalar *ptr = (RetScalar *)(p + offset);
         r[0] = *ptr;
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
         RetScalar *ptr = (RetScalar *)(p + offset);
         r[1] = *ptr;
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
         RetScalar *ptr = (RetScalar *)(p + offset);
         r[2] = *ptr;
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
         RetScalar *ptr = (RetScalar *)(p + offset);
         r[3] = *ptr;
     }
@@ -2616,80 +2724,89 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,

 static FORCEINLINE __vec4_i8
 __gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
-                           uint32_t scale, __vec4_i1 mask) {
+                           uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale,
-                                mask);
+                                constOffset, mask);
 }

 static FORCEINLINE __vec4_i8
 __gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
-                           uint32_t scale, __vec4_i1 mask) {
+                           uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale,
-                                mask);
+                                constOffset, mask);
 }

 static FORCEINLINE __vec4_i16
 __gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale,
-                                mask);
+                                constOffset, mask);
 }

 static FORCEINLINE __vec4_i16
 __gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale,
-                                mask);
+                                constOffset, mask);
 }

 static FORCEINLINE __vec4_i32
-__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
+                            __vec4_i32 constOffset, __vec4_i1 mask) {
     __m128i r = _mm_set_epi32(0, 0, 0, 0);
 #if 1
     // "Fast gather"...
     offsets = __select(mask, offsets, __smear_i32(0));
+    constOffset = __select(mask, constOffset, __smear_i32(0));

-    int offset = scale * _mm_extract_epi32(offsets.v, 0);
+    int offset = scale * _mm_extract_epi32(offsets.v, 0) +
+        _mm_extract_epi32(constOffset.v, 0);
     uint32_t *ptr = (uint32_t *)(p + offset);
     r = _mm_insert_epi32(r, *ptr, 0);

-    offset = scale * _mm_extract_epi32(offsets.v, 1);
+    offset = scale * _mm_extract_epi32(offsets.v, 1) +
+        _mm_extract_epi32(constOffset.v, 1);
     ptr = (uint32_t *)(p + offset);
     r = _mm_insert_epi32(r, *ptr, 1);

-    offset = scale * _mm_extract_epi32(offsets.v, 2);
+    offset = scale * _mm_extract_epi32(offsets.v, 2) +
+        _mm_extract_epi32(constOffset.v, 2);
     ptr = (uint32_t *)(p + offset);
     r = _mm_insert_epi32(r, *ptr, 2);

-    offset = scale * _mm_extract_epi32(offsets.v, 3);
+    offset = scale * _mm_extract_epi32(offsets.v, 3) +
+        _mm_extract_epi32(constOffset.v, 3);
     ptr = (uint32_t *)(p + offset);
     r = _mm_insert_epi32(r, *ptr, 3);
 #else
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 0);
+        int offset = scale * _mm_extract_epi32(offsets.v, 0) +
+            _mm_extract_epi32(constOffset.v, 0);
         uint32_t *ptr = (uint32_t *)(p + offset);
         r = _mm_insert_epi32(r, *ptr, 0);
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 1);
+        int offset = scale * _mm_extract_epi32(offsets.v, 1) +
+            _mm_extract_epi32(constOffset.v, 1);
         uint32_t *ptr = (uint32_t *)(p + offset);
         r = _mm_insert_epi32(r, *ptr, 1);
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 2);
+        int offset = scale * _mm_extract_epi32(offsets.v, 2) +
+            _mm_extract_epi32(constOffset.v, 2);
         uint32_t *ptr = (uint32_t *)(p + offset);
         r = _mm_insert_epi32(r, *ptr, 2);
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 3);
+        int offset = scale * _mm_extract_epi32(offsets.v, 3) +
+            _mm_extract_epi32(constOffset.v, 3);
         uint32_t *ptr = (uint32_t *)(p + offset);
         r = _mm_insert_epi32(r, *ptr, 3);
     }
@@ -2699,23 +2816,23 @@ __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets,

 static FORCEINLINE __vec4_i32
 __gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
-                                mask);
+                                delta, mask);
 }

 static FORCEINLINE __vec4_i64
 __gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
-                                mask);
+                                delta, mask);
 }

 static FORCEINLINE __vec4_i64
 __gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
-                                mask);
+                                delta, mask);
 }

 template<typename RetVec, typename RetScalar>
@@ -2862,217 +2979,108 @@ static FORCEINLINE __vec4_i64 __gather64_i64(__vec4_i64 ptrs, __vec4_i1 mask) {

 // scatter

+#define SCATTER32_64(SUFFIX, TYPE, EXTRACT)                               \
+static FORCEINLINE void                                                   \
+__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets,  \
+                                   uint32_t scale, __vec4_i32 constOffset, \
+                                   __vec4_##SUFFIX val, __vec4_i1 mask) { \
+    uint32_t m = _mm_extract_ps(mask.v, 0);                               \
+    if (m != 0) {                                                         \
+        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \
+                             _mm_extract_epi32(constOffset.v, 0));        \
+        *ptr = EXTRACT(val.v, 0);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 1);                                        \
+    if (m != 0) {                                                         \
+        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \
+                             _mm_extract_epi32(constOffset.v, 1));        \
+        *ptr = EXTRACT(val.v, 1);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 2);                                        \
+    if (m != 0) {                                                         \
+        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \
+                             _mm_extract_epi32(constOffset.v, 2));        \
+        *ptr = EXTRACT(val.v, 2);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 3);                                        \
+    if (m != 0) {                                                         \
+        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \
+                             _mm_extract_epi32(constOffset.v, 3));        \
+        *ptr = EXTRACT(val.v, 3);                                         \
+    }                                                                     \
+}                                                                         \
+static FORCEINLINE void                                                   \
+__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets,   \
+                                  uint32_t scale, __vec4_i64 constOffset, \
+                                  __vec4_##SUFFIX val, __vec4_i1 mask) {  \
+    uint32_t m = _mm_extract_ps(mask.v, 0);                               \
+    if (m != 0) {                                                         \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +     \
+            _mm_extract_epi64(constOffset.v[0], 0);                       \
+        TYPE *ptr = (TYPE *)(p + offset);                                 \
+        *ptr = EXTRACT(val.v, 0);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 1);                                        \
+    if (m != 0) {                                                         \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +     \
+            _mm_extract_epi64(constOffset.v[0], 1);                       \
+        TYPE *ptr = (TYPE *)(p + offset);                                 \
+        *ptr = EXTRACT(val.v, 1);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 2);                                        \
+    if (m != 0) {                                                         \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +     \
+            _mm_extract_epi64(constOffset.v[1], 0);                       \
+        TYPE *ptr = (TYPE *)(p + offset);                                 \
+        *ptr = EXTRACT(val.v, 2);                                         \
+    }                                                                     \
+    m = _mm_extract_ps(mask.v, 3);                                        \
+    if (m != 0) {                                                         \
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +     \
+            _mm_extract_epi64(constOffset.v[1], 1);                       \
+        TYPE *ptr = (TYPE *)(p + offset);                                 \
+        *ptr = EXTRACT(val.v, 3);                                         \
+    }                                                                     \
+}
+
+SCATTER32_64(i8, int8_t, _mm_extract_epi8)
+SCATTER32_64(i16, int16_t, _mm_extract_epi16)
+SCATTER32_64(i32, int32_t, _mm_extract_epi32)
+
-static FORCEINLINE void
-__scatter_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
-        *ptr = _mm_extract_epi8(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
-        *ptr = _mm_extract_epi8(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
-        *ptr = _mm_extract_epi8(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int8_t *ptr = (int8_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
-        *ptr = _mm_extract_epi8(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets64_i8(unsigned char *p, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i8 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
-        uint8_t *ptr = (uint8_t *)(p + offset);
-        *ptr = _mm_extract_epi8(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
-                             uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 0));
-        *ptr = _mm_extract_epi16(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 1));
-        *ptr = _mm_extract_epi16(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 2));
-        *ptr = _mm_extract_epi16(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int16_t *ptr = (int16_t *)(b + scale * _mm_extract_epi32(offsets.v, 3));
-        *ptr = _mm_extract_epi16(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets64_i16(unsigned char *p, __vec4_i64 offsets,
-                             uint32_t scale, __vec4_i16 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
-        uint16_t *ptr = (uint16_t *)(p + offset);
-        *ptr = _mm_extract_epi16(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets32_i32(unsigned char *b, __vec4_i32 offsets,
-                             uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 0));
-        *ptr = _mm_extract_epi32(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 1));
-        *ptr = _mm_extract_epi32(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 2));
-        *ptr = _mm_extract_epi32(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int32_t *ptr = (int32_t *)(b + scale *
-                                   _mm_extract_epi32(offsets.v, 3));
-        *ptr = _mm_extract_epi32(val.v, 3);
-    }
-}
-
-static FORCEINLINE void
-__scatter_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
-                             uint32_t scale, __vec4_i32 val, __vec4_i1 mask) {
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        *ptr = _mm_extract_epi32(val.v, 3);
-    }
-}
-
 static FORCEINLINE void
 __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
-                             uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
+                             uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val,
+                             __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 0);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) +
+            _mm_extract_epi32(constOffset.v, 0);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[0], 0);
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 1);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) +
+            _mm_extract_epi32(constOffset.v, 1);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[0], 1);
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 2);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) +
+            _mm_extract_epi32(constOffset.v, 2);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[1], 0);
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        int32_t offset = scale * _mm_extract_epi32(offsets.v, 3);
+        int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) +
+            _mm_extract_epi32(constOffset.v, 3);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[1], 1);
     }
@@ -3080,31 +3088,36 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,

 static FORCEINLINE void
 __scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
-                             uint32_t scale, __vec4_i64 val, __vec4_i1 mask) {
+                             uint32_t scale, __vec4_i64 constOffset,
+                             __vec4_i64 val, __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +
+            _mm_extract_epi64(constOffset.v[0], 0);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[0], 0);
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +
+            _mm_extract_epi64(constOffset.v[0], 1);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[0], 1);
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +
+            _mm_extract_epi64(constOffset.v[1], 0);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[1], 0);
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
+        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +
+            _mm_extract_epi64(constOffset.v[1], 1);
         uint64_t *ptr = (uint64_t *)(p + offset);
         *ptr = _mm_extract_epi64(val.v[1], 1);
     }
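The SCATTER32_64 macro above replaces six nearly identical per-type scatter functions with one X-macro that is stamped out for each element type and extract intrinsic. A self-contained miniature of the same factoring (illustrative names, scalar rather than SSE):

    #include <stdint.h>

    #define SCATTER_SKETCH(SUFFIX, TYPE)                                  \
    static void scatter_##SUFFIX(unsigned char *b, const int32_t off[4],  \
                                 const TYPE val[4], const bool mask[4]) { \
        for (int i = 0; i < 4; ++i)                                       \
            if (mask[i])                                                  \
                *(TYPE *)(b + off[i]) = val[i]; /* per-lane masked store */ \
    }

    SCATTER_SKETCH(i8, int8_t)
    SCATTER_SKETCH(i16, int16_t)
    SCATTER_SKETCH(i32, int32_t)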
@@ -104,7 +104,7 @@ static void generateRay(uniform const float raster2camera[4][4],
 }


-static inline bool BBoxIntersect(const uniform float bounds[2][3],
+static bool BBoxIntersect(const uniform float bounds[2][3],
                                  const Ray &ray) {
     uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
     uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
@@ -143,7 +143,7 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3],



-static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
+static bool TriIntersect(const Triangle &tri, Ray &ray) {
     uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
     uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
     uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };

@@ -273,7 +273,7 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
 #else
     void *result;
 #if (ISPC_POINTER_BYTES == 4)
-    __asm__ __volatile__("lock\ncmpxchgd %2,%1"
+    __asm__ __volatile__("lock\ncmpxchgl %2,%1"
                          : "=a"(result), "=m"(*v)
                          : "q"(newValue), "0"(oldValue)
                          : "memory");
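The fix here is to the mnemonic: cmpxchgd is not a valid x86 instruction, while cmpxchgl is the 32-bit (4-byte operand) form of compare-and-exchange, matching the ISPC_POINTER_BYTES == 4 branch. For reference, a sketch of the same pointer CAS written with GCC's builtin instead of inline assembly (illustrative, not part of the change):

    // Returns the value that was in *v; the swap took place iff
    // that return value equals oldValue.
    static void *lCASPointerSketch(void **v, void *newValue, void *oldValue) {
        return __sync_val_compare_and_swap(v, oldValue, newValue);
    }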
@@ -129,7 +129,7 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
 }


-static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
+static float Density(float3 Pobj, float3 pMin, float3 pMax,
                      uniform float density[], uniform int nVoxels[3]) {
     if (!Inside(Pobj, pMin, pMax))
         return 0;
54 expr.h
@@ -314,7 +314,6 @@ public:
     std::string identifier;
     const SourcePos identifierPos;

 protected:
     MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
                SourcePos identifierPos, bool derefLValue);

@@ -389,6 +388,10 @@ public:
         with values given by the "vales" parameter. */
     ConstExpr(ConstExpr *old, double *values);

+    /** Create ConstExpr with the same type and values as the given one,
+        but at the given position. */
+    ConstExpr(ConstExpr *old, SourcePos pos);
+
     llvm::Value *GetValue(FunctionEmitContext *ctx) const;
     const Type *GetType() const;
     void Print() const;
@@ -681,11 +684,44 @@ public:
     const Type *GetType() const;
     Expr *TypeCheck();
     Expr *Optimize();
+    llvm::Constant *GetConstant(const Type *type) const;
     void Print() const;
     int EstimateCost() const;
 };


+/** An expression representing a "new" expression, used for dynamically
+    allocating memory.
+ */
+class NewExpr : public Expr {
+public:
+    NewExpr(int typeQual, const Type *type, Expr *initializer, Expr *count,
+            SourcePos tqPos, SourcePos p);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Expr *TypeCheck();
+    Expr *Optimize();
+    void Print() const;
+    int EstimateCost() const;
+
+    /** Type of object to allocate storage for. */
+    const Type *allocType;
+    /** Expression giving the number of elements to allocate, when the
+        "new Foo[expr]" form is used.  This may be NULL, in which case a
+        single element of the given type will be allocated. */
+    Expr *countExpr;
+    /** Optional initializer expression used to initialize the allocated
+        memory. */
+    Expr *initExpr;
+    /** Indicates whether this is a "varying new" or "uniform new"
+        (i.e. whether a separate allocation is performed per program
+        instance, or whether a single allocation is performed for the
+        entire gang of program instances.) */
+    bool isVarying;
+};
+

 /** This function indicates whether it's legal to convert from fromType to
     toType.  If the optional errorMsgBase and source position parameters
@@ -704,4 +740,20 @@ bool CanConvertTypes(const Type *fromType, const Type *toType,
  */
 Expr *TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase);

+/** Utility routine that emits code to initialize a symbol given an
+    initializer expression.
+
+    @param lvalue    Memory location of storage for the symbol's data
+    @param symName   Name of symbol (used in error messages)
+    @param symType   Type of variable being initialized
+    @param initExpr  Expression for the initializer
+    @param ctx       FunctionEmitContext to use for generating instructions
+    @param pos       Source file position of the variable being initialized
+ */
+void
+InitSymbol(llvm::Value *lvalue, const Type *symType, Expr *initExpr,
+           FunctionEmitContext *ctx, SourcePos pos);
+
+bool PossiblyResolveFunctionOverloads(Expr *expr, const Type *type);
+
 #endif // ISPC_EXPR_H
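The isVarying comment in the NewExpr declaration above is the heart of the feature: a "uniform new" allocates once for the whole gang, while a "varying new" allocates per program instance. A plain-C++ sketch of the two behaviors, assuming a 4-wide gang (illustrative only):

    #include <cstdlib>

    void *uniform_new_sketch(size_t size) {
        return malloc(size);                 // one allocation, shared by the gang
    }

    void varying_new_sketch(size_t size, void *ptrs[4], const bool mask[4]) {
        for (int i = 0; i < 4; ++i)          // one allocation per active instance
            ptrs[i] = mask[i] ? malloc(size) : nullptr;
    }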
3 func.cpp
@@ -334,12 +334,13 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
             if (ctx->GetCurrentBasicBlock())
                 ctx->ReturnInst();
         }
-        else
-            // No check, just emit the code
+        else {
+            // Set up basic blocks for goto targets
+            ctx->InitializeLabelMap(code);
             code->EmitCode(ctx);
+        }
     }

     if (ctx->GetCurrentBasicBlock()) {
         // FIXME: We'd like to issue a warning if we've reached the end of
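Initializing the label map before emission is the usual two-pass trick for goto support: create a branch target for every label first, so a forward goto has a concrete block to jump to when it is emitted. A self-contained sketch of the idea (stub types; the real code builds llvm::BasicBlocks):

    #include <map>
    #include <string>
    #include <vector>

    struct Block { std::string name; };

    static std::map<std::string, Block *>
    initializeLabelMapSketch(const std::vector<std::string> &labels) {
        std::map<std::string, Block *> m;
        for (const std::string &l : labels)
            m[l] = new Block{l};   // targets exist before any goto is emitted
        return m;
    }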
15 ispc.cpp
@@ -185,6 +185,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->allOffMaskIsSafe = true;
         t->maskBitCount = 1;
     }
+    else if (!strcasecmp(isa, "generic-1")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 1;
+        t->vectorWidth = 1;
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
+    }
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     else if (!strcasecmp(isa, "avx")) {
         t->isa = Target::AVX;
@@ -210,7 +218,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->isa = Target::AVX2;
         t->nativeVectorWidth = 8;
         t->vectorWidth = 8;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
         t->maskingIsFree = false;
         t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
@@ -219,7 +227,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->isa = Target::AVX2;
         t->nativeVectorWidth = 16;
         t->vectorWidth = 16;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
         t->maskingIsFree = false;
         t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
@@ -270,7 +278,7 @@ Target::SupportedTargetISAs() {
 #ifdef LLVM_3_1svn
         ", avx2, avx2-x2"
 #endif // LLVM_3_1svn
-        ", generic-4, generic-8, generic-16";
+        ", generic-4, generic-8, generic-16, generic-1";
 }


@@ -387,7 +395,6 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
         return false;
     }

-    type->dump();
     Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));
     return true;
 }
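The "+f16c" attribute added to both AVX2 targets enables the F16C half/float conversion instructions, presumably for ispc's half-precision support. A minimal sketch of what those instructions do, via the corresponding C intrinsics (requires an F16C-capable CPU and compiler flag; illustrative, not part of the change):

    #include <immintrin.h>

    static float half_to_float(unsigned short h) {
        return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));   // vcvtph2ps
    }

    static unsigned short float_to_half(float f) {
        __m128i h = _mm_cvtps_ph(_mm_set_ss(f), _MM_FROUND_TO_NEAREST_INT); // vcvtps2ph
        return (unsigned short)_mm_cvtsi128_si32(h);
    }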
4 ispc.h
@@ -418,6 +418,7 @@ enum {
     COST_ASSIGN = 1,
     COST_COHERENT_BREAK_CONTINE = 4,
     COST_COMPLEX_ARITH_OP = 4,
+    COST_DELETE = 32,
     COST_DEREF = 4,
     COST_FUNCALL = 4,
     COST_FUNPTR_UNIFORM = 12,
@@ -425,6 +426,7 @@ enum {
     COST_GATHER = 8,
     COST_GOTO = 4,
     COST_LOAD = 2,
+    COST_NEW = 32,
     COST_REGULAR_BREAK_CONTINUE = 2,
     COST_RETURN = 4,
     COST_SELECT = 4,
@@ -437,6 +439,8 @@ enum {
     COST_VARYING_IF = 3,
     COST_UNIFORM_LOOP = 4,
     COST_VARYING_LOOP = 6,
+    COST_UNIFORM_SWITCH = 4,
+    COST_VARYING_SWITCH = 12,
     COST_ASSERT = 8,

     CHECK_MASK_AT_FUNCTION_START_COST = 16,
82 ispc.vcxproj
@@ -18,11 +18,14 @@
     <ClCompile Include="decl.cpp" />
     <ClCompile Include="expr.cpp" />
     <ClCompile Include="func.cpp" />
-    <ClCompile Include="gen-bitcode-avx.cpp" />
-    <ClCompile Include="gen-bitcode-avx-x2.cpp" />
+    <ClCompile Include="gen-bitcode-avx1.cpp" />
+    <ClCompile Include="gen-bitcode-avx1-x2.cpp" />
+    <ClCompile Include="gen-bitcode-avx2.cpp" />
+    <ClCompile Include="gen-bitcode-avx2-x2.cpp" />
     <ClCompile Include="gen-bitcode-c-32.cpp" />
     <ClCompile Include="gen-bitcode-c-64.cpp" />
     <ClCompile Include="gen-bitcode-dispatch.cpp" />
+    <ClCompile Include="gen-bitcode-generic-1.cpp" />
     <ClCompile Include="gen-bitcode-generic-4.cpp" />
     <ClCompile Include="gen-bitcode-generic-8.cpp" />
     <ClCompile Include="gen-bitcode-generic-16.cpp" />
@@ -158,29 +161,68 @@
       </CustomBuild>
   </ItemGroup>
   <ItemGroup>
-    <CustomBuild Include="builtins\target-avx.ll">
+    <CustomBuild Include="builtins\target-avx1.ll">
       <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll > gen-bitcode-avx1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll > gen-bitcode-avx1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1.cpp</Message>
     </CustomBuild>
   </ItemGroup>
   <ItemGroup>
-    <CustomBuild Include="builtins\target-avx-x2.ll">
+    <CustomBuild Include="builtins\target-avx1-x2.ll">
       <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll > gen-bitcode-avx1-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll > gen-bitcode-avx1-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx1-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\targets-avx-x2.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
     </CustomBuild>
   </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll > gen-bitcode-avx2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll > gen-bitcode-avx2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-avx2-x2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll > gen-bitcode-avx2-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll > gen-bitcode-avx2-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx2-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\targets-avx-x2.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-generic-1.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll > gen-bitcode-generic-1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll > gen-bitcode-generic-1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-1.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-1.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
   <ItemGroup>
5 lex.ll
@@ -93,6 +93,8 @@ continue { return TOKEN_CONTINUE; }
 creturn { return TOKEN_CRETURN; }
 default { return TOKEN_DEFAULT; }
 do { return TOKEN_DO; }
+delete { return TOKEN_DELETE; }
+delete\[\] { return TOKEN_DELETE; }
 double { return TOKEN_DOUBLE; }
 else { return TOKEN_ELSE; }
 enum { return TOKEN_ENUM; }
@@ -112,6 +114,7 @@ int16 { return TOKEN_INT16; }
 int32 { return TOKEN_INT; }
 int64 { return TOKEN_INT64; }
 launch { return TOKEN_LAUNCH; }
+new { return TOKEN_NEW; }
 NULL { return TOKEN_NULL; }
 print { return TOKEN_PRINT; }
 reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
@@ -156,7 +159,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
         yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr);
     else {
 #if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
-        yylval->intVal = _strtoi64(yytext, &endPtr, 0);
+        yylval->intVal = _strtoui64(yytext, &endPtr, 0);
 #else
         // FIXME: should use strtouq and then issue an error if we can't
         // fit into 64 bits...
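The switch from _strtoi64 to _strtoui64 matters for large literals: a constant like 0xffffffffffffffff overflows a signed 64-bit parse and clamps to the maximum, while an unsigned parse preserves the full bit pattern (the FIXME notes the same issue for the non-Windows path). A portable demonstration with the standard C equivalents:

    #include <cstdio>
    #include <cstdlib>

    int main() {
        const char *lit = "0xffffffffffffffff";
        long long s = strtoll(lit, nullptr, 0);            // clamps to LLONG_MAX
        unsigned long long u = strtoull(lit, nullptr, 0);  // keeps all 64 bits
        printf("%lld %llu\n", s, u);
        return 0;
    }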
251 llvmutil.cpp
@@ -36,7 +36,9 @@
 */

 #include "llvmutil.h"
+#include "ispc.h"
+#include "type.h"
 #include <llvm/Instructions.h>

 LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
 LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
@@ -465,3 +467,252 @@ LLVMBoolVector(const bool *bvec) {
     }
     return llvm::ConstantVector::get(vals);
 }
+
+
+/** Conservative test to see if two llvm::Values are equal.  There are
+    (potentially many) cases where the two values actually are equal but
+    this will return false.  However, if it does return true, the two
+    vectors definitely are equal.
+
+    @todo This seems to catch all of the cases we currently need it for in
+    practice, but it's be nice to make it a little more robust/general.  In
+    general, though, a little something called the halting problem means we
+    won't get all of them.
+*/
+static bool
+lValuesAreEqual(llvm::Value *v0, llvm::Value *v1,
+                std::vector<llvm::PHINode *> &seenPhi0,
+                std::vector<llvm::PHINode *> &seenPhi1) {
+    // Thanks to the fact that LLVM hashes and returns the same pointer for
+    // constants (of all sorts, even constant expressions), this first test
+    // actually catches a lot of cases.  LLVM's SSA form also helps a lot
+    // with this..
+    if (v0 == v1)
+        return true;
+
+    Assert(seenPhi0.size() == seenPhi1.size());
+    for (unsigned int i = 0; i < seenPhi0.size(); ++i)
+        if (v0 == seenPhi0[i] && v1 == seenPhi1[i])
+            return true;
+
+    llvm::BinaryOperator *bo0 = llvm::dyn_cast<llvm::BinaryOperator>(v0);
+    llvm::BinaryOperator *bo1 = llvm::dyn_cast<llvm::BinaryOperator>(v1);
+    if (bo0 != NULL && bo1 != NULL) {
+        if (bo0->getOpcode() != bo1->getOpcode())
+            return false;
+        return (lValuesAreEqual(bo0->getOperand(0), bo1->getOperand(0),
+                                seenPhi0, seenPhi1) &&
+                lValuesAreEqual(bo0->getOperand(1), bo1->getOperand(1),
+                                seenPhi0, seenPhi1));
+    }
+
+    llvm::PHINode *phi0 = llvm::dyn_cast<llvm::PHINode>(v0);
+    llvm::PHINode *phi1 = llvm::dyn_cast<llvm::PHINode>(v1);
+    if (phi0 != NULL && phi1 != NULL) {
+        if (phi0->getNumIncomingValues() != phi1->getNumIncomingValues())
+            return false;
+
+        seenPhi0.push_back(phi0);
+        seenPhi1.push_back(phi1);
+
+        unsigned int numIncoming = phi0->getNumIncomingValues();
+        // Check all of the incoming values: if all of them are all equal,
+        // then we're good.
+        bool anyFailure = false;
+        for (unsigned int i = 0; i < numIncoming; ++i) {
+            Assert(phi0->getIncomingBlock(i) == phi1->getIncomingBlock(i));
+            if (!lValuesAreEqual(phi0->getIncomingValue(i),
+                                 phi1->getIncomingValue(i), seenPhi0, seenPhi1)) {
+                anyFailure = true;
+                break;
+            }
+        }
+
+        seenPhi0.pop_back();
+        seenPhi1.pop_back();
+
+        return !anyFailure;
+    }
+
+    return false;
+}
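The seenPhi stacks above are the interesting part: when comparing two recursive dataflow graphs, a pair of phi nodes currently on the comparison stack is optimistically treated as equal, which keeps mutually referential phis from recursing forever. A self-contained analogue of the same cycle-handling idea (toy node type, not LLVM):

    #include <cstddef>
    #include <utility>
    #include <vector>

    struct Node {
        int op;                       // operation tag
        std::vector<Node *> operands; // may contain cycles
    };

    static bool equalSketch(Node *a, Node *b,
                            std::vector<std::pair<Node *, Node *> > &stack) {
        if (a == b)
            return true;
        for (size_t i = 0; i < stack.size(); ++i)
            if (stack[i].first == a && stack[i].second == b)
                return true;               // assume equal to break the cycle
        if (a->op != b->op || a->operands.size() != b->operands.size())
            return false;
        stack.push_back(std::make_pair(a, b));
        bool eq = true;
        for (size_t i = 0; i < a->operands.size() && eq; ++i)
            eq = equalSketch(a->operands[i], b->operands[i], stack);
        stack.pop_back();
        return eq;
    }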
+
+/** Given an llvm::Value known to be an integer, return its value as
+    an int64_t.
+*/
+static int64_t
+lGetIntValue(llvm::Value *offset) {
+    llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
+    Assert(intOffset && (intOffset->getBitWidth() == 32 ||
+                         intOffset->getBitWidth() == 64));
+    return intOffset->getSExtValue();
+}
+
+
+/** This function takes chains of InsertElement instructions along the
+    lines of:
+
+    %v0 = insertelement undef, value_0, i32 index_0
+    %v1 = insertelement %v1, value_1, i32 index_1
+    ...
+    %vn = insertelement %vn-1, value_n-1, i32 index_n-1
+
+    and initializes the provided elements array such that the i'th
+    llvm::Value * in the array is the element that was inserted into the
+    i'th element of the vector.
+*/
+void
+LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
+                       llvm::Value **elements) {
+    for (int i = 0; i < vectorWidth; ++i)
+        elements[i] = NULL;
+
+    while (ie != NULL) {
+        int64_t iOffset = lGetIntValue(ie->getOperand(2));
+        Assert(iOffset >= 0 && iOffset < vectorWidth);
+        Assert(elements[iOffset] == NULL);
+
+        elements[iOffset] = ie->getOperand(1);
+
+        llvm::Value *insertBase = ie->getOperand(0);
+        ie = llvm::dyn_cast<llvm::InsertElementInst>(insertBase);
+        if (ie == NULL) {
+            if (llvm::isa<llvm::UndefValue>(insertBase))
+                return;
+
+            llvm::ConstantVector *cv =
+                llvm::dyn_cast<llvm::ConstantVector>(insertBase);
+            Assert(cv != NULL);
+            Assert(iOffset < (int)cv->getNumOperands());
+            elements[iOffset] = cv->getOperand(iOffset);
+        }
+    }
+}
+
+
+/** Tests to see if all of the elements of the vector in the 'v' parameter
+    are equal.  Like lValuesAreEqual(), this is a conservative test and may
+    return false for arrays where the values are actually all equal.  */
+bool
+LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
+                         std::vector<llvm::PHINode *> &seenPhis) {
+    if (vectorLength == 1)
+        return true;
+
+    if (llvm::isa<llvm::ConstantAggregateZero>(v))
+        return true;
+
+    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v);
+    if (cv != NULL)
+        return (cv->getSplatValue() != NULL);
+
+#ifdef LLVM_3_1svn
+    llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v);
+    if (cdv != NULL)
+        return (cdv->getSplatValue() != NULL);
+#endif
+
+    llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v);
+    if (bop != NULL)
+        return (LLVMVectorValuesAllEqual(bop->getOperand(0), vectorLength,
+                                         seenPhis) &&
+                LLVMVectorValuesAllEqual(bop->getOperand(1), vectorLength,
+                                         seenPhis));
+
+    llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(v);
+    if (cast != NULL)
+        return LLVMVectorValuesAllEqual(cast->getOperand(0), vectorLength,
+                                        seenPhis);
+
+    llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(v);
+    if (ie != NULL) {
+        llvm::Value *elements[ISPC_MAX_NVEC];
+        LLVMFlattenInsertChain(ie, vectorLength, elements);
+
+        // We will ignore any values of elements[] that are NULL; as they
+        // correspond to undefined values--we just want to see if all of
+        // the defined values have the same value.
+        int lastNonNull = 0;
+        while (lastNonNull < vectorLength && elements[lastNonNull] == NULL)
+            ++lastNonNull;
+
+        if (lastNonNull == vectorLength)
+            // all of them are undef!
+            return true;
+
+        for (int i = lastNonNull; i < vectorLength; ++i) {
+            if (elements[i] == NULL)
+                continue;
+
+            std::vector<llvm::PHINode *> seenPhi0;
+            std::vector<llvm::PHINode *> seenPhi1;
+            if (lValuesAreEqual(elements[lastNonNull], elements[i], seenPhi0,
+                                seenPhi1) == false)
+                return false;
+            lastNonNull = i;
+        }
+        return true;
+    }
+
+    llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v);
+    if (phi) {
+        for (unsigned int i = 0; i < seenPhis.size(); ++i)
+            if (seenPhis[i] == phi)
+                return true;
+
+        seenPhis.push_back(phi);
+
+        unsigned int numIncoming = phi->getNumIncomingValues();
+        // Check all of the incoming values: if all of them are all equal,
+        // then we're good.
+        for (unsigned int i = 0; i < numIncoming; ++i) {
+            if (!LLVMVectorValuesAllEqual(phi->getIncomingValue(i), vectorLength,
+                                          seenPhis)) {
+                seenPhis.pop_back();
+                return false;
+            }
+        }
+
+        seenPhis.pop_back();
+        return true;
+    }
+
+    if (llvm::isa<llvm::UndefValue>(v))
+        // ?
+        return false;
+
+    Assert(!llvm::isa<llvm::Constant>(v));
+
+    if (llvm::isa<llvm::CallInst>(v) || llvm::isa<llvm::LoadInst>(v) ||
+        !llvm::isa<llvm::Instruction>(v))
+        return false;
+
+    llvm::ShuffleVectorInst *shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(v);
+    if (shuffle != NULL) {
+        llvm::Value *indices = shuffle->getOperand(2);
+        if (LLVMVectorValuesAllEqual(indices, vectorLength, seenPhis))
+            // The easy case--just a smear of the same element across the
+            // whole vector.
+            return true;
+
+        // TODO: handle more general cases?
+        return false;
+    }
+
+#if 0
+    fprintf(stderr, "all equal: ");
+    v->dump();
+    fprintf(stderr, "\n");
+    llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(v);
+    if (inst) {
+        inst->getParent()->dump();
+        fprintf(stderr, "\n");
+        fprintf(stderr, "\n");
+    }
+#endif
+
+    return false;
+}
23 llvmutil.h
@@ -38,12 +38,23 @@
 #ifndef ISPC_LLVMUTIL_H
 #define ISPC_LLVMUTIL_H 1

-#include "ispc.h"
 #include <llvm/LLVMContext.h>
 #include <llvm/Type.h>
 #include <llvm/DerivedTypes.h>
 #include <llvm/Constants.h>

+namespace llvm {
+    class PHINode;
+    class InsertElementInst;
+}
+
+// llvm::Type *s are no longer const in llvm 3.0
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#define LLVM_TYPE_CONST
+#else
+#define LLVM_TYPE_CONST const
+#endif
+

 /** This structure holds pointers to a variety of LLVM types; code
     elsewhere can use them from here, ratherthan needing to make more
@@ -99,6 +110,7 @@ extern llvm::Constant *LLVMTrue, *LLVMFalse;
     of LLVMTypes and the LLVMTrue/LLVMFalse constants.  However, it can't
     be called until the compilation target is known.
 */
+struct Target;
 extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);

 /** Returns an LLVM i8 constant of the given value */
@@ -205,4 +217,13 @@ extern llvm::Constant *LLVMMaskAllOn;
 /** LLVM constant value representing an 'all off' SIMD lane mask */
 extern llvm::Constant *LLVMMaskAllOff;

+/** Tests to see if all of the elements of the vector in the 'v' parameter
+    are equal.  Like lValuesAreEqual(), this is a conservative test and may
+    return false for arrays where the values are actually all equal.  */
+extern bool LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
+                                     std::vector<llvm::PHINode *> &seenPhis);
+
+void LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
+                            llvm::Value **elements);
+
 #endif // ISPC_LLVMUTIL_H
28 main.cpp
@@ -60,10 +60,27 @@
 #define BUILD_VERSION ""
 #endif // ISPC_IS_WINDOWS

-static void usage(int ret) {
-    printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n",
-           BUILD_DATE, BUILD_VERSION);
-    printf("usage: ispc\n");
+static void
+lPrintVersion() {
+    printf("Intel(r) SPMD Program Compiler (ispc), build %s (%s, LLVM %s)\n",
+           BUILD_DATE, BUILD_VERSION,
+#ifdef LLVM_2_9
+           "2.9"
+#elif defined(LLVM_3_0) || defined(LLVM_3_0svn)
+           "3.0"
+#elif defined(LLVM_3_1) || defined(LLVM_3_1svn)
+           "3.1"
+#else
+#error "Unhandled LLVM version"
+#endif
+           );
+}
+
+
+static void
+usage(int ret) {
+    lPrintVersion();
+    printf("\nusage: ispc\n");
     printf("    [--addressing={32,64}]\t\tSelect 32- or 64-bit addressing. (Note that 32-bit\n");
     printf("    \t\taddressing calculations are done by default, even\n");
     printf("    \t\ton 64-bit target architectures.)\n");
@@ -367,8 +384,7 @@ int main(int Argc, char *Argv[]) {
             generatePIC = true;
 #endif // !ISPC_IS_WINDOWS
         else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
-            printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n",
-                   BUILD_DATE, BUILD_VERSION);
+            lPrintVersion();
             return 0;
         }
         else if (argv[i][0] == '-') {
87
parse.yy
87
parse.yy
@@ -106,13 +106,14 @@ static void lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
|
||||
const EnumType *enumType);
|
||||
|
||||
static const char *lBuiltinTokens[] = {
|
||||
"assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo", "cfor",
|
||||
"cif", "cwhile", "const", "continue", "creturn", "default", "do", "double",
|
||||
"else", "enum", "export", "extern", "false", "float", "for", "foreach",
|
||||
"foreach_tiled", "goto", "if", "inline", "int", "int8", "int16",
|
||||
"int32", "int64", "launch", "NULL", "print", "return", "signed", "sizeof",
|
||||
"static", "struct", "switch", "sync", "task", "true", "typedef", "uniform",
|
||||
"unsigned", "varying", "void", "while", NULL
|
||||
"assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo",
|
||||
"cfor", "cif", "cwhile", "const", "continue", "creturn", "default",
|
||||
"do", "delete", "double", "else", "enum", "export", "extern", "false",
|
||||
"float", "for", "foreach", "foreach_tiled", "goto", "if", "inline",
|
||||
"int", "int8", "int16", "int32", "int64", "launch", "new", "NULL",
|
||||
"print", "return", "signed", "sizeof", "static", "struct", "switch",
|
||||
"sync", "task", "true", "typedef", "uniform", "unsigned", "varying",
|
||||
"void", "while", NULL
|
||||
};
|
||||
|
||||
static const char *lParamListTokens[] = {
|
||||
@@ -170,7 +171,7 @@ struct ForeachDimension {
|
||||
%token TOKEN_AND_OP TOKEN_OR_OP TOKEN_MUL_ASSIGN TOKEN_DIV_ASSIGN TOKEN_MOD_ASSIGN
|
||||
%token TOKEN_ADD_ASSIGN TOKEN_SUB_ASSIGN TOKEN_LEFT_ASSIGN TOKEN_RIGHT_ASSIGN
|
||||
%token TOKEN_AND_ASSIGN TOKEN_OR_ASSIGN TOKEN_XOR_ASSIGN
|
||||
%token TOKEN_SIZEOF
|
||||
%token TOKEN_SIZEOF TOKEN_NEW TOKEN_DELETE
|
||||
|
||||
%token TOKEN_EXTERN TOKEN_EXPORT TOKEN_STATIC TOKEN_INLINE TOKEN_TASK
|
||||
%token TOKEN_UNIFORM TOKEN_VARYING TOKEN_TYPEDEF TOKEN_SOA
|
||||
@@ -189,7 +190,7 @@ struct ForeachDimension {
|
||||
%type <expr> multiplicative_expression additive_expression shift_expression
|
||||
%type <expr> relational_expression equality_expression and_expression
|
||||
%type <expr> exclusive_or_expression inclusive_or_expression
|
||||
%type <expr> logical_and_expression logical_or_expression
|
||||
%type <expr> logical_and_expression logical_or_expression new_expression
|
||||
%type <expr> conditional_expression assignment_expression expression
|
||||
%type <expr> initializer constant_expression for_test
|
||||
%type <exprList> argument_expression_list initializer_list
|
||||
@@ -197,7 +198,7 @@ struct ForeachDimension {
|
||||
%type <stmt> statement labeled_statement compound_statement for_init_statement
|
||||
%type <stmt> expression_statement selection_statement iteration_statement
|
||||
%type <stmt> jump_statement statement_list declaration_statement print_statement
|
||||
%type <stmt> assert_statement sync_statement
|
||||
%type <stmt> assert_statement sync_statement delete_statement
|
||||
|
||||
%type <declaration> declaration parameter_declaration
|
||||
%type <declarators> init_declarator_list
|
||||
@@ -215,7 +216,7 @@ struct ForeachDimension {
|
||||
%type <enumType> enum_specifier
|
||||
|
||||
%type <type> specifier_qualifier_list struct_or_union_specifier
|
||||
%type <type> type_specifier type_name
|
||||
%type <type> type_specifier type_name rate_qualified_new_type
|
||||
%type <type> short_vec_specifier
|
||||
%type <atomicType> atomic_var_type_specifier
|
||||
|
||||
@@ -225,7 +226,7 @@ struct ForeachDimension {
|
||||
|
||||
%type <stringVal> string_constant
|
||||
%type <constCharPtr> struct_or_union_name enum_identifier goto_identifier
|
||||
%type <intVal> int_constant soa_width_specifier
|
||||
%type <intVal> int_constant soa_width_specifier rate_qualified_new
|
||||
|
||||
%type <foreachDimension> foreach_dimension_specifier
|
||||
%type <foreachDimensionList> foreach_dimension_list
|
||||
@@ -448,8 +449,36 @@ conditional_expression
      { $$ = new SelectExpr($1, $3, $5, Union(@1,@5)); }
    ;

assignment_expression
rate_qualified_new
    : TOKEN_NEW { $$ = 0; }
    | TOKEN_UNIFORM TOKEN_NEW { $$ = TYPEQUAL_UNIFORM; }
    | TOKEN_VARYING TOKEN_NEW { $$ = TYPEQUAL_VARYING; }
    ;

rate_qualified_new_type
    : type_specifier { $$ = $1; }
    | TOKEN_UNIFORM type_specifier { $$ = $2->GetAsUniformType(); }
    | TOKEN_VARYING type_specifier { $$ = $2->GetAsVaryingType(); }
    ;

new_expression
    : conditional_expression
    | rate_qualified_new rate_qualified_new_type
      {
          $$ = new NewExpr($1, $2, NULL, NULL, @1, Union(@1, @2));
      }
    | rate_qualified_new rate_qualified_new_type '(' initializer_list ')'
      {
          $$ = new NewExpr($1, $2, $4, NULL, @1, Union(@1, @2));
      }
    | rate_qualified_new rate_qualified_new_type '[' expression ']'
      {
          $$ = new NewExpr($1, $2, NULL, $4, @1, Union(@1, @4));
      }
    ;

assignment_expression
    : new_expression
    | unary_expression '=' assignment_expression
      { $$ = new AssignExpr(AssignExpr::Assign, $1, $3, Union(@1, @3)); }
    | unary_expression TOKEN_MUL_ASSIGN assignment_expression
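
Note: the three NewExpr productions above cover a bare "new", a parenthesized
initializer list, and a bracketed element count, each optionally rate-qualified
by "uniform" or "varying". A minimal sketch of the source syntax this grammar
appears to accept; the function and identifiers here are illustrative, not
taken from the commit:

// hypothetical ispc code exercising the new productions
uniform float * uniform mkbuf(uniform int count) {
    // "uniform new uniform float [ expr ]": one gang-wide array allocation
    uniform float * uniform buf = uniform new uniform float[count];
    for (uniform int i = 0; i < count; ++i)
        buf[i] = 0.;
    return buf;
}
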
@@ -1240,6 +1269,7 @@ statement
    | print_statement
    | assert_statement
    | sync_statement
    | delete_statement
    | error
      {
          std::vector<std::string> builtinTokens;

@@ -1265,9 +1295,17 @@ labeled_statement
          $$ = new LabeledStmt($1, $3, @1);
      }
    | TOKEN_CASE constant_expression ':' statement
      { UNIMPLEMENTED; }
      {
          int value;
          if ($2 != NULL &&
              lGetConstantInt($2, &value, @2, "Case statement value")) {
              $$ = new CaseStmt(value, $4, Union(@1, @2));
          }
          else
              $$ = NULL;
      }
    | TOKEN_DEFAULT ':' statement
      { UNIMPLEMENTED; }
      { $$ = new DefaultStmt($3, @1); }
    ;

start_scope

@@ -1313,7 +1351,7 @@ selection_statement
    | TOKEN_CIF '(' expression ')' statement TOKEN_ELSE statement
      { $$ = new IfStmt($3, $5, $7, true, @1); }
    | TOKEN_SWITCH '(' expression ')' statement
      { UNIMPLEMENTED; }
      { $$ = new SwitchStmt($3, $5, @1); }
    ;

for_test
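
Note: with CaseStmt, DefaultStmt, and SwitchStmt replacing the UNIMPLEMENTED
stubs, C-style switch statements get a parse path; per lGetConstantInt above,
case labels must reduce to compile-time integer constants. A hedged sketch of
what should now parse (a made-up example, not a test from this commit):

int classify(int x) {
    int r;
    switch (x & 3) {    // case values below are required to be constants
    case 0:
        r = 10;
        break;
    case 1:
        r = 20;
        break;
    default:
        r = 30;
        break;
    }
    return r;
}
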
@@ -1461,23 +1499,30 @@ jump_statement
    ;

sync_statement
    : TOKEN_SYNC
    : TOKEN_SYNC ';'
      { $$ = new ExprStmt(new SyncExpr(@1), @1); }
    ;

delete_statement
    : TOKEN_DELETE expression ';'
      {
          $$ = new DeleteStmt($2, Union(@1, @2));
      }
    ;

print_statement
    : TOKEN_PRINT '(' string_constant ')'
    : TOKEN_PRINT '(' string_constant ')' ';'
      {
          $$ = new PrintStmt(*$3, NULL, @1);
      }
    | TOKEN_PRINT '(' string_constant ',' argument_expression_list ')'
    | TOKEN_PRINT '(' string_constant ',' argument_expression_list ')' ';'
      {
          $$ = new PrintStmt(*$3, $5, @1);
      }
    ;

assert_statement
    : TOKEN_ASSERT '(' string_constant ',' expression ')'
    : TOKEN_ASSERT '(' string_constant ',' expression ')' ';'
      {
          $$ = new AssertStmt(*$3, $5, @1);
      }

@@ -1614,7 +1659,7 @@ lAddFunctionParams(Declarator *decl) {

/** Add a symbol for the built-in mask variable to the symbol table */
static void lAddMaskToSymbolTable(SourcePos pos) {
    const Type *t = g->target.isa == Target::GENERIC ?
    const Type *t = g->target.maskBitCount == 1 ?
        AtomicType::VaryingConstBool : AtomicType::VaryingConstUInt32;
    Symbol *maskSymbol = new Symbol("__mask", pos, t);
    m->symbolTable->AddVariable(maskSymbol);
140
run_tests.py
@@ -12,12 +12,14 @@ import re
import signal
import random
import string
import mutex
import subprocess
import shlex
import platform
import tempfile

# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard
# git history has a workaround for that issue.

is_windows = (platform.system() == 'Windows' or
              'CYGWIN_NT' in platform.system())

@@ -36,29 +38,32 @@ parser.add_option("-c", "--compiler", dest="compiler_exe", help="Compiler binary
                  default=None)
parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization',
                  default=False, action="store_true")
parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel',
                  default="1024", type="int")
parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output',
                  default=False, action="store_true")
if not is_windows:
    parser.add_option('--valgrind', dest='valgrind', help='Run tests with valgrind',
                      default=False, action="store_true")
    parser.add_option('--wrap-exe', dest='wrapexe',
                      help='Executable to wrap test runs with (e.g. "valgrind")',
                      default="")

(options, args) = parser.parse_args()

if not is_windows and options.valgrind:
    valgrind_cmd = "valgrind "
if not is_windows:
    ispc_exe = "./ispc"
else:
    valgrind_cmd = ""
    ispc_exe = "../Release/ispc.exe"

is_generic_target = options.target.find("generic-") != -1
is_generic_target = (options.target.find("generic-") != -1 and
                     options.target != "generic-1")
if is_generic_target and options.include_file == None:
    if options.target == "generic-4":
        print "No generics #include specified; using examples/intrinsics/sse4.h"
        sys.stderr.write("No generics #include specified; using examples/intrinsics/sse4.h\n")
        options.include_file = "examples/intrinsics/sse4.h"
    elif options.target == "generic-8":
        print "No generics #include specified and no default available for \"generic-8\" target.";
        sys.stderr.write("No generics #include specified and no default available for \"generic-8\" target.\n")
        sys.exit(1)
    elif options.target == "generic-16":
        print "No generics #include specified; using examples/intrinsics/generic-16.h"
        sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n")
        options.include_file = "examples/intrinsics/generic-16.h"

if options.compiler_exe == None:
@@ -67,16 +72,33 @@ if options.compiler_exe == None:
    else:
        options.compiler_exe = "g++"

# if no specific test files are specified, run all of the tests in tests/
# and failing_tests/
def fix_windows_paths(files):
    ret = [ ]
    for fn in files:
        ret += [ string.replace(fn, '\\', '/') ]
    return ret


# if no specific test files are specified, run all of the tests in tests/,
# failing_tests/, and tests_errors/
if len(args) == 0:
    files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc") + \
        glob.glob("tests_errors/*ispc")
    files = fix_windows_paths(files)
else:
    files = [ ]
    if is_windows:
        argfiles = [ ]
        for f in args:
            # we have to glob ourselves if this is being run under a DOS
            # shell..
            argfiles += glob.glob(f)
    else:
        argfiles = args

    files = [ ]
    for f in argfiles:
        if os.path.splitext(string.lower(f))[1] != ".ispc":
            print "Ignoring file %s, which doesn't have an .ispc extension." % f
            sys.stdout.write("Ignoring file %s, which doesn't have an .ispc extension.\n" % f)
        else:
            files += [ f ]

@@ -88,18 +110,13 @@ if (options.random):
# counter
total_tests = 0

# We'd like to use the Lock class from the multiprocessing package to
# serialize accesses to finished_tests_counter. Unfortunately, the version of
# python that ships with OSX 10.5 has this bug:
# http://bugs.python.org/issue5261. Therefore, we use the (deprecated but
# still available) mutex class.
#finished_tests_counter_lock = multiprocessing.Lock()
finished_tests_mutex = mutex.mutex()
finished_tests_counter = multiprocessing.Value(c_int)
finished_tests_counter_lock = multiprocessing.Lock()

# utility routine to print an update on the number of tests that have been
# finished. Should be called with the mutex (or lock) held..
# finished. Should be called with the lock held..
def update_progress(fn):
    global total_tests
    finished_tests_counter.value = finished_tests_counter.value + 1
    progress_str = " Done %d / %d [%s]" % (finished_tests_counter.value, total_tests, fn)
    # spaces to clear out detrius from previous printing...

@@ -108,18 +125,18 @@ def update_progress(fn):
    progress_str += '\r'
    sys.stdout.write(progress_str)
    sys.stdout.flush()
    finished_tests_mutex.unlock()

def run_command(cmd):
    if options.verbose:
        print "Running: %s" % cmd
        sys.stdout.write("Running: %s\n" % cmd)
    sp = subprocess.Popen(shlex.split(cmd), stdin=None,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
    out = sp.communicate()
    output = ""
    output += out[0]
    output += out[1]
    output += out[0].decode("utf-8")
    output += out[1].decode("utf-8")

    return (sp.returncode, output)

# run the commands in cmd_list

@@ -128,9 +145,9 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure):
    (return_code, output) = run_command(cmd)
    compile_failed = (return_code != 0)
    if compile_failed:
        print "Compilation of test %s failed " % filename
        sys.stdout.write("Compilation of test %s failed \n" % filename)
        if output != "":
            print "%s" % output
            sys.stdout.write("%s" % output)
        return (1, 0)

    (return_code, output) = run_command(run_cmd)

@@ -139,11 +156,11 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure):
    surprise = ((expect_failure and not run_failed) or
                (not expect_failure and run_failed))
    if surprise == True:
        print "Test %s %s (return code %d) " % \
        sys.stderr.write("Test %s %s (return code %d) \n" % \
            (filename, "unexpectedly passed" if expect_failure else "failed",
             return_code)
             return_code))
        if output != "":
            print "%s" % output
            sys.stdout.write("%s\n" % output)
    if surprise == True:
        return (0, 1)
    else:

@@ -160,7 +177,7 @@ def run_test(filename):
    # is this a test to make sure an error is issued?
    want_error = (filename.find("tests_errors") != -1)
    if want_error == True:
        ispc_cmd = "ispc --werror --nowrap %s --arch=%s --target=%s" % \
        ispc_cmd = ispc_exe + " --werror --nowrap %s --arch=%s --target=%s" % \
            (input_prefix + filename, options.arch, options.target)
        (return_code, output) = run_command(ispc_cmd)
        got_error = (return_code != 0)

@@ -168,18 +185,17 @@ def run_test(filename):
        # figure out the error message we're expecting
        file = open(input_prefix + filename, 'r')
        firstline = file.readline()
        firstline = string.replace(firstline, "//", "")
        firstline = string.lstrip(firstline)
        firstline = string.rstrip(firstline)
        firstline = firstline.replace("//", "")
        firstline = firstline.lstrip()
        firstline = firstline.rstrip()
        file.close()

        if (output.find(firstline) == -1):
            print "OUT %s" % filename
            print "Didnt see expected error message %s from test %s.\nActual output:\n%s" % \
                (firstline, filename, output)
            sys.stderr.write("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \
                (firstline, filename, output))
            return (1, 0)
        elif got_error == False:
            print "Unexpectedly no errors issued from test %s" % filename
            sys.stderr.write("Unexpectedly no errors issued from test %s\n" % filename)
            return (1, 0)
        else:
            return (0, 0)
@@ -199,17 +215,17 @@ def run_test(filename):
            continue
        # one of them should have a function with one of the
        # declarations in sig2def
        for pattern, ident in sig2def.items():
        for pattern, ident in list(sig2def.items()):
            if line.find(pattern) != -1:
                match = ident
                break
    file.close()
    if match == -1:
        print "Fatal error: unable to find function signature " + \
            "in test %s" % filename
        sys.stderr.write("Fatal error: unable to find function signature " + \
            "in test %s\n" % filename)
        return (1, 0)
    else:
        is_generic_target = options.target.find("generic-") != -1
        global is_generic_target
        if is_generic_target:
            obj_name = "%s.cpp" % filename

@@ -218,7 +234,7 @@ def run_test(filename):
            obj_name = "%s%s.obj" % (input_prefix, filename)
            exe_name = "%s%s.exe" % (input_prefix, filename)

            cc_cmd = "%s /I. /Zi /nologo /DTEST_SIG=%d %stest_static.cpp %s /Fe%s" % \
            cc_cmd = "%s /I. /I../winstuff /Zi /nologo /DTEST_SIG=%d %stest_static.cpp %s /Fe%s" % \
                (options.compiler_exe, match, input_prefix, obj_name, exe_name)
            if should_fail:
                cc_cmd += " /DEXPECT_FAILURE"

@@ -238,7 +254,7 @@ def run_test(filename):
            if should_fail:
                cc_cmd += " -DEXPECT_FAILURE"

        ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
        ispc_cmd = ispc_exe + " --woff %s -o %s --arch=%s --target=%s" % \
            (input_prefix+filename, obj_name, options.arch, options.target)
        if options.no_opt:
            ispc_cmd += " -O0"

@@ -246,17 +262,17 @@ def run_test(filename):
            ispc_cmd += " --emit-c++ --c++-include-file=%s" % options.include_file

    # compile the ispc code, make the executable, and run it...
    global valgrind_cmd
    (compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd],
                                          valgrind_cmd + " " + exe_name, \
                                          options.wrapexe + " " + exe_name, \
                                          filename, should_fail)

    # clean up after running the test
    try:
        if not run_error:
            os.unlink(exe_name)
            if is_windows:
                os.unlink(filename + ".pdb")
                os.unlink(filename + ".ilk")
                os.unlink("%s%s.pdb" % (input_prefix, filename))
                os.unlink("%s%s.ilk" % (input_prefix, filename))
        os.unlink(obj_name)
    except:
        None

@@ -297,11 +313,8 @@ def run_tasks_from_queue(queue, queue_ret):
        if run_error != 0:
            run_error_files += [ filename ]

        # If not for http://bugs.python.org/issue5261 on OSX, we'd like to do this:
        #with finished_tests_counter_lock:
        #update_progress(filename)
        # but instead we do this...
        finished_tests_mutex.lock(update_progress, filename)
        with finished_tests_counter_lock:
            update_progress(filename)

task_threads = []

@@ -315,12 +328,15 @@ if __name__ == '__main__':

    compile_error_files = [ ]
    run_error_files = [ ]
    nthreads = multiprocessing.cpu_count()
    print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)

    nthreads = min(multiprocessing.cpu_count(), options.num_jobs)
    sys.stdout.write("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests))

    # put each of the test filenames into a queue
    q = multiprocessing.Queue()
    for fn in files:
        if is_windows:
            fn = fn.replace("\\",'/')
        q.put(fn)
    for x in range(nthreads):
        q.put('STOP')

@@ -340,7 +356,7 @@ if __name__ == '__main__':
    # (i.e. return 0 if all is ok)
    for t in task_threads:
        t.join()
    print
    sys.stdout.write("\n")

    while not qret.empty():
        (c, r) = qret.get()

@@ -349,13 +365,13 @@ if __name__ == '__main__':

    if len(compile_error_files) > 0:
        compile_error_files.sort()
        print "%d / %d tests FAILED compilation:" % (len(compile_error_files), total_tests)
        sys.stdout.write("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests))
        for f in compile_error_files:
            print "\t%s" % f
            sys.stdout.write("\t%s\n" % f)
    if len(run_error_files) > 0:
        run_error_files.sort()
        print "%d / %d tests FAILED execution:" % (len(run_error_files), total_tests)
        sys.stdout.write("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests))
        for f in run_error_files:
            print "\t%s" % f
            sys.stdout.write("\t%s\n" % f)

sys.exit(len(compile_error_files) + len(run_error_files))
676
stdlib.ispc
@@ -787,165 +787,14 @@ packed_store_active(uniform int * uniform a, int vals) {
///////////////////////////////////////////////////////////////////////////
// System information

static inline int num_cores() {
static inline uniform int num_cores() {
    return __num_cores();
}

///////////////////////////////////////////////////////////////////////////
// Atomics and memory barriers

static inline void memory_barrier() {
    __memory_barrier();
static inline uniform int64 clock() {
    return __clock();
}

#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    memory_barrier(); \
    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
                                                            (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
                                                              (MASKTYPE)__mask); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
} \

#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value); \
    TA ret; \
    if (lanemask() != 0) { \
        memory_barrier(); \
        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
                                                     (MASKTYPE)__mask); \
        memory_barrier(); \
    } \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
                                                            (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
                                       TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
                                                              (MASKTYPE)__mask); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
}

DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,swap,swap,IntMaskType)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,UIntMaskType)

DEFINE_ATOMIC_OP(float,float,swap,swap,IntMaskType)

DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,swap,swap,IntMaskType)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,UIntMaskType)

DEFINE_ATOMIC_OP(double,double,swap,swap,IntMaskType)

#undef DEFINE_ATOMIC_OP

#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
static inline TA atomic_compare_exchange_global( \
    uniform TA * uniform ptr, TA oldval, TA newval) { \
    memory_barrier(); \
    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
                                                     (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline uniform TA atomic_compare_exchange_global( \
    uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
    memory_barrier(); \
    uniform TA ret = \
        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
                                                        (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
}

ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)

#undef ATOMIC_DECL_CMPXCHG

///////////////////////////////////////////////////////////////////////////
// Floating-Point Math
@@ -1329,6 +1178,419 @@ static inline uniform int64 clamp(uniform int64 v, uniform int64 low,
    return min(max(v, low), high);
}

///////////////////////////////////////////////////////////////////////////
// Global atomics and memory barriers

static inline void memory_barrier() {
    __memory_barrier();
}

#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    memory_barrier(); \
    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
} \

#define DEFINE_ATOMIC_SWAP(TA,TB) \
static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
    memory_barrier(); \
    uniform int i = 0; \
    TA ret[programCount]; \
    TA memVal; \
    uniform int lastSwap; \
    uniform int mask = lanemask(); \
    /* First, have the first running program instance (if any) perform \
       the swap with memory with its value of "value"; record the \
       value returned. */ \
    for (; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
        lastSwap = i; \
        break; \
    } \
    /* Now, for all of the remaining running program instances, set the \
       return value of the last instance that did a swap with this \
       instance's value of "value"; this gives the same effect as if the \
       current instance had executed a hardware atomic swap right before \
       the last one that did a swap. */ \
    for (; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        ret[lastSwap] = extract(value, i); \
        lastSwap = i; \
    } \
    /* And the last instance that wanted to swap gets the value we \
       originally got back from memory... */ \
    ret[lastSwap] = memVal; \
    memory_barrier(); \
    return ret[programIndex]; \
} \
static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
                                            uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
} \

#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value); \
    TA ret; \
    if (lanemask() != 0) { \
        memory_barrier(); \
        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \
        memory_barrier(); \
    } \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
                                       TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
}

DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
DEFINE_ATOMIC_SWAP(int32,int32)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
DEFINE_ATOMIC_SWAP(unsigned int32,int32)

DEFINE_ATOMIC_SWAP(float,float)

DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
DEFINE_ATOMIC_SWAP(int64,int64)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
DEFINE_ATOMIC_SWAP(unsigned int64,int64)

DEFINE_ATOMIC_SWAP(double,double)

#undef DEFINE_ATOMIC_OP
#undef DEFINE_ATOMIC_MINMAX_OP
#undef DEFINE_ATOMIC_SWAP

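Note: each DEFINE_ATOMIC_OP / DEFINE_ATOMIC_SWAP instantiation above emits
three overloads: a varying value against a uniform pointer, a fully uniform
form, and a varying-pointer form handled lane by lane. A usage sketch under
those signatures; the counter and function are hypothetical:

uniform int32 counter = 0;

void tally(uniform float RET[]) {
    // each executing lane atomically adds its own amount; the return is
    // the memory contents observed just before that lane's update
    int32 before = atomic_add_global(&counter, programIndex + 1);
    RET[programIndex] = before;
}
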
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
static inline uniform TA atomic_compare_exchange_global( \
    uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
    memory_barrier(); \
    uniform TA ret = \
        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_compare_exchange_global( \
    uniform TA * uniform ptr, TA oldval, TA newval) { \
    memory_barrier(); \
    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
                                                     (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_compare_exchange_global( \
    uniform TA * varying ptr, TA oldval, TA newval) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA r = \
            __atomic_compare_exchange_uniform_##TB##_global(ptrArray[i], \
                                                            extract(oldval, i), \
                                                            extract(newval, i)); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
}

ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)

#undef ATOMIC_DECL_CMPXCHG

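Note: atomic_compare_exchange_global returns the value that was in memory, so
a successful exchange is detected by comparing the return against oldval. A
small sketch; the guard variable and function are illustrative only:

uniform int32 guard = 0;

uniform bool try_acquire() {
    // succeeds iff guard held 0 and was atomically replaced with 1
    return atomic_compare_exchange_global(&guard, 0, 1) == 0;
}
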
///////////////////////////////////////////////////////////////////////////
// local atomics

#define LOCAL_ATOMIC(TYPE,NAME,OPFUNC) \
static inline uniform TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, \
                                                 uniform TYPE value) { \
    uniform TYPE ret = *ptr; \
    *ptr = OPFUNC(*ptr, value); \
    return ret; \
} \
static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) { \
    TYPE ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        ret = insert(ret, i, *ptr); \
        *ptr = OPFUNC(*ptr, extract(value, i)); \
    } \
    return ret; \
} \
static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \
    TYPE ret; \
    uniform TYPE * uniform ptrs[programCount]; \
    ptrs[programIndex] = p; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        ret = insert(ret, i, *ptrs[i]); \
        *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
    } \
    return ret; \
}

static inline uniform int32 __add(uniform int32 a, uniform int32 b) { return a+b; }
static inline uniform int32 __sub(uniform int32 a, uniform int32 b) { return a-b; }
static inline uniform int32 __and(uniform int32 a, uniform int32 b) { return a & b; }
static inline uniform int32 __or(uniform int32 a, uniform int32 b) { return a | b; }
static inline uniform int32 __xor(uniform int32 a, uniform int32 b) { return a ^ b; }
static inline uniform int32 __swap(uniform int32 a, uniform int32 b) { return b; }

static inline uniform unsigned int32 __add(uniform unsigned int32 a,
                                           uniform unsigned int32 b) { return a+b; }
static inline uniform unsigned int32 __sub(uniform unsigned int32 a,
                                           uniform unsigned int32 b) { return a-b; }
static inline uniform unsigned int32 __and(uniform unsigned int32 a,
                                           uniform unsigned int32 b) { return a & b; }
static inline uniform unsigned int32 __or(uniform unsigned int32 a,
                                          uniform unsigned int32 b) { return a | b; }
static inline uniform unsigned int32 __xor(uniform unsigned int32 a,
                                           uniform unsigned int32 b) { return a ^ b; }
static inline uniform unsigned int32 __swap(uniform unsigned int32 a,
                                            uniform unsigned int32 b) { return b; }


static inline uniform float __add(uniform float a, uniform float b) { return a+b; }
static inline uniform float __sub(uniform float a, uniform float b) { return a-b; }
static inline uniform float __swap(uniform float a, uniform float b) { return b; }

static inline uniform int64 __add(uniform int64 a, uniform int64 b) { return a+b; }
static inline uniform int64 __sub(uniform int64 a, uniform int64 b) { return a-b; }
static inline uniform int64 __and(uniform int64 a, uniform int64 b) { return a & b; }
static inline uniform int64 __or(uniform int64 a, uniform int64 b) { return a | b; }
static inline uniform int64 __xor(uniform int64 a, uniform int64 b) { return a ^ b; }
static inline uniform int64 __swap(uniform int64 a, uniform int64 b) { return b; }

static inline uniform unsigned int64 __add(uniform unsigned int64 a,
                                           uniform unsigned int64 b) { return a+b; }
static inline uniform unsigned int64 __sub(uniform unsigned int64 a,
                                           uniform unsigned int64 b) { return a-b; }
static inline uniform unsigned int64 __and(uniform unsigned int64 a,
                                           uniform unsigned int64 b) { return a & b; }
static inline uniform unsigned int64 __or(uniform unsigned int64 a,
                                          uniform unsigned int64 b) { return a | b; }
static inline uniform unsigned int64 __xor(uniform unsigned int64 a,
                                           uniform unsigned int64 b) { return a ^ b; }
static inline uniform unsigned int64 __swap(uniform unsigned int64 a,
                                            uniform unsigned int64 b) { return b; }

static inline uniform double __add(uniform double a, uniform double b) { return a+b; }
static inline uniform double __sub(uniform double a, uniform double b) { return a-b; }
static inline uniform double __swap(uniform double a, uniform double b) { return b; }

LOCAL_ATOMIC(int32, add, __add)
LOCAL_ATOMIC(int32, subtract, __sub)
LOCAL_ATOMIC(int32, and, __and)
LOCAL_ATOMIC(int32, or, __or)
LOCAL_ATOMIC(int32, xor, __xor)
LOCAL_ATOMIC(int32, min, min)
LOCAL_ATOMIC(int32, max, max)
LOCAL_ATOMIC(int32, swap, __swap)

LOCAL_ATOMIC(unsigned int32, add, __add)
LOCAL_ATOMIC(unsigned int32, subtract, __sub)
LOCAL_ATOMIC(unsigned int32, and, __and)
LOCAL_ATOMIC(unsigned int32, or, __or)
LOCAL_ATOMIC(unsigned int32, xor, __xor)
LOCAL_ATOMIC(unsigned int32, min, min)
LOCAL_ATOMIC(unsigned int32, max, max)
LOCAL_ATOMIC(unsigned int32, swap, __swap)

LOCAL_ATOMIC(float, add, __add)
LOCAL_ATOMIC(float, subtract, __sub)
LOCAL_ATOMIC(float, min, min)
LOCAL_ATOMIC(float, max, max)
LOCAL_ATOMIC(float, swap, __swap)

LOCAL_ATOMIC(int64, add, __add)
LOCAL_ATOMIC(int64, subtract, __sub)
LOCAL_ATOMIC(int64, and, __and)
LOCAL_ATOMIC(int64, or, __or)
LOCAL_ATOMIC(int64, xor, __xor)
LOCAL_ATOMIC(int64, min, min)
LOCAL_ATOMIC(int64, max, max)
LOCAL_ATOMIC(int64, swap, __swap)

LOCAL_ATOMIC(unsigned int64, add, __add)
LOCAL_ATOMIC(unsigned int64, subtract, __sub)
LOCAL_ATOMIC(unsigned int64, and, __and)
LOCAL_ATOMIC(unsigned int64, or, __or)
LOCAL_ATOMIC(unsigned int64, xor, __xor)
LOCAL_ATOMIC(unsigned int64, min, min)
LOCAL_ATOMIC(unsigned int64, max, max)
LOCAL_ATOMIC(unsigned int64, swap, __swap)

LOCAL_ATOMIC(double, add, __add)
LOCAL_ATOMIC(double, subtract, __sub)
LOCAL_ATOMIC(double, min, min)
LOCAL_ATOMIC(double, max, max)
LOCAL_ATOMIC(double, swap, __swap)

// compare exchange
#define LOCAL_CMPXCHG(TYPE) \
static inline uniform TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
                                                         uniform TYPE cmp, \
                                                         uniform TYPE update) { \
    uniform TYPE old = *ptr; \
    if (old == cmp) \
        *ptr = update; \
    return old; \
} \
static inline TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
                                                 TYPE cmp, TYPE update) { \
    TYPE ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TYPE old = *ptr; \
        if (old == extract(cmp, i)) \
            *ptr = extract(update, i); \
        ret = insert(ret, i, old); \
    } \
    return ret; \
} \
static inline TYPE atomic_compare_exchange_local(uniform TYPE * varying p, \
                                                 TYPE cmp, TYPE update) { \
    uniform TYPE * uniform ptrs[programCount]; \
    ptrs[programIndex] = p; \
    TYPE ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TYPE old = *ptrs[i]; \
        if (old == extract(cmp, i)) \
            *ptrs[i] = extract(update, i); \
        ret = insert(ret, i, old); \
    } \
    return ret; \
}

LOCAL_CMPXCHG(int32)
LOCAL_CMPXCHG(unsigned int32)
LOCAL_CMPXCHG(float)
LOCAL_CMPXCHG(int64)
LOCAL_CMPXCHG(unsigned int64)
LOCAL_CMPXCHG(double)

#undef LOCAL_ATOMIC
#undef LOCAL_CMPXCHG

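Note: unlike the _global forms, these _local operations never reach for
hardware atomics; they serialize the gang's own lanes with an extract/insert
loop, so they are safe only against collisions among program instances in the
same gang, not across concurrently launched tasks. A hypothetical use:

void histogram(uniform int32 counts[], int bin) {
    // lane-serialized read-modify-write of possibly colliding bins;
    // correct within this gang only
    atomic_add_local(&counts[bin], 1);
}
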
///////////////////////////////////////////////////////////////////////////
// Transcendentals (float precision)

@@ -2764,6 +3026,10 @@ static inline uniform double pow(uniform double a, uniform double b) {
// half-precision floats

static inline uniform float half_to_float(uniform unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_uniform(h);
    }
    else {
        if ((h & 0x7FFFu) == 0)
            // Signed zero
            return floatbits(((unsigned int32) h) << 16);

@@ -2819,8 +3085,13 @@ static inline uniform float half_to_float(uniform unsigned int16 h) {
            }
        }
    }
}

static inline float half_to_float(unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_varying(h);
    }
    else {
        if ((h & 0x7FFFu) == 0)
            // Signed zero
            return floatbits(((unsigned int32) h) << 16);

@@ -2876,9 +3147,14 @@ static inline float half_to_float(unsigned int16 h) {
            }
        }
    }
}


static inline uniform int16 float_to_half(uniform float f) {
    if (__have_native_half) {
        return __float_to_half_uniform(f);
    }
    else {
        uniform int32 x = intbits(f);
        // Store the return value in an int32 until the very end; this ends up
        // generating better code...

@@ -2942,9 +3218,14 @@ static inline uniform int16 float_to_half(uniform float f) {
        }
        return (int16)ret;
    }
}


static inline int16 float_to_half(float f) {
    if (__have_native_half) {
        return __float_to_half_varying(f);
    }
    else {
        int32 x = intbits(f);
        // Store the return value in an int32 until the very end; this ends up
        // generating better code...

@@ -3008,9 +3289,14 @@ static inline int16 float_to_half(float f) {
        }
        return (int16)ret;
    }
}


static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_uniform(h);
    }
    else {
        uniform unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
        uniform unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
        uniform unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits

@@ -3024,10 +3310,14 @@ static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
        // Mantissa
        uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
        return floatbits(xs | xe | xm);
    }
}

static inline float half_to_float_fast(unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_varying(h);
    }
    else {
        unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
        unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
        unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits

@@ -3041,10 +3331,14 @@ static inline float half_to_float_fast(unsigned int16 h) {
        // Mantissa
        unsigned int32 xm = ((unsigned int32) hm) << 13;
        return floatbits(xs | xe | xm);
    }
}

static inline uniform int16 float_to_half_fast(uniform float f) {
    if (__have_native_half) {
        return __float_to_half_uniform(f);
    }
    else {
        uniform int32 x = intbits(f);
        uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
        uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits

@@ -3063,8 +3357,13 @@ static inline uniform int16 float_to_half_fast(uniform float f) {

        return (int16)ret;
    }
}

static inline int16 float_to_half_fast(float f) {
    if (__have_native_half) {
        return __float_to_half_varying(f);
    }
    else {
        int32 x = intbits(f);
        unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
        unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits

@@ -3083,6 +3382,7 @@ static inline int16 float_to_half_fast(float f) {

        return (int16)ret;
    }
}
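
Note: both the full and _fast half conversions now try a native path first via
__have_native_half. Judging from the fallbacks shown, the _fast variants keep
only the sign/exponent/mantissa bit shuffling, presumably trading away the
special-case handling of the full versions. A round-trip sketch; the driver
functions are hypothetical, the conversions are the stdlib's:

void compact(uniform float src[], uniform int16 dst[], uniform int n) {
    foreach (i = 0 ... n)
        dst[i] = float_to_half(src[i]);
}

void expand(uniform int16 src[], uniform float dst[], uniform int n) {
    foreach (i = 0 ... n)
        dst[i] = half_to_float((unsigned int16)src[i]);
}
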
///////////////////////////////////////////////////////////////////////////
// RNG stuff

@@ -3095,16 +3395,15 @@ static inline unsigned int random(RNGState * uniform state)
{
    unsigned int b;

    // FIXME: state->z1, etc..
    b = (((*state).z1 << 6) ^ (*state).z1) >> 13;
    (*state).z1 = (((*state).z1 & 4294967294U) << 18) ^ b;
    b = (((*state).z2 << 2) ^ (*state).z2) >> 27;
    (*state).z2 = (((*state).z2 & 4294967288U) << 2) ^ b;
    b = (((*state).z3 << 13) ^ (*state).z3) >> 21;
    (*state).z3 = (((*state).z3 & 4294967280U) << 7) ^ b;
    b = (((*state).z4 << 3) ^ (*state).z4) >> 12;
    (*state).z4 = (((*state).z4 & 4294967168U) << 13) ^ b;
    return ((*state).z1 ^ (*state).z2 ^ (*state).z3 ^ (*state).z4);
    b = ((state->z1 << 6) ^ state->z1) >> 13;
    state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
    b = ((state->z2 << 2) ^ state->z2) >> 27;
    state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
    b = ((state->z3 << 13) ^ state->z3) >> 21;
    state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
    b = ((state->z4 << 3) ^ state->z4) >> 12;
    state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
    return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
}

static inline float frandom(RNGState * uniform state)

@@ -3120,35 +3419,43 @@ static inline uniform unsigned int __seed4(RNGState * uniform state,
    uniform unsigned int c1 = 0xf0f0f0f0;
    uniform unsigned int c2 = 0x0f0f0f0f;

    (*state).z1 = insert((*state).z1, start + 0, seed);
    (*state).z1 = insert((*state).z1, start + 1, seed ^ c1);
    (*state).z1 = insert((*state).z1, start + 2, (seed << 3) ^ c1);
    (*state).z1 = insert((*state).z1, start + 3, (seed << 2) ^ c2);
    state->z1 = insert(state->z1, start + 0, seed);
    state->z1 = insert(state->z1, start + 1, seed ^ c1);
    state->z1 = insert(state->z1, start + 2, (seed << 3) ^ c1);
    state->z1 = insert(state->z1, start + 3, (seed << 2) ^ c2);

    seed += 131;
    (*state).z2 = insert((*state).z2, start + 0, seed);
    (*state).z2 = insert((*state).z2, start + 1, seed ^ c1);
    (*state).z2 = insert((*state).z2, start + 2, (seed << 3) ^ c1);
    (*state).z2 = insert((*state).z2, start + 3, (seed << 2) ^ c2);
    state->z2 = insert(state->z2, start + 0, seed);
    state->z2 = insert(state->z2, start + 1, seed ^ c1);
    state->z2 = insert(state->z2, start + 2, (seed << 3) ^ c1);
    state->z2 = insert(state->z2, start + 3, (seed << 2) ^ c2);

    seed ^= extract((*state).z2, 2);
    (*state).z3 = insert((*state).z3, start + 0, seed);
    (*state).z3 = insert((*state).z3, start + 1, seed ^ c1);
    (*state).z3 = insert((*state).z3, start + 2, (seed << 3) ^ c1);
    (*state).z3 = insert((*state).z3, start + 3, (seed << 2) ^ c2);
    seed ^= extract(state->z2, 2);
    state->z3 = insert(state->z3, start + 0, seed);
    state->z3 = insert(state->z3, start + 1, seed ^ c1);
    state->z3 = insert(state->z3, start + 2, (seed << 3) ^ c1);
    state->z3 = insert(state->z3, start + 3, (seed << 2) ^ c2);

    seed <<= 4;
    seed += 3;
    seed ^= extract((*state).z1, 3);
    (*state).z4 = insert((*state).z4, start + 0, seed);
    (*state).z4 = insert((*state).z4, start + 1, seed ^ c1);
    (*state).z4 = insert((*state).z4, start + 2, (seed << 3) ^ c1);
    (*state).z4 = insert((*state).z4, start + 3, (seed << 2) ^ c2);
    seed ^= extract(state->z1, 3);
    state->z4 = insert(state->z4, start + 0, seed);
    state->z4 = insert(state->z4, start + 1, seed ^ c1);
    state->z4 = insert(state->z4, start + 2, (seed << 3) ^ c1);
    state->z4 = insert(state->z4, start + 3, (seed << 2) ^ c2);

    return seed;
}

static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) {
    if (programCount == 1) {
        state->z1 = seed;
        state->z2 = seed ^ 0xbeeff00d;
        state->z3 = ((seed & 0xffff) << 16) | (seed >> 16);
        state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) |
                     ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24);
    }
    else {
        seed = __seed4(state, 0, seed);
        if (programCount == 8)
            __seed4(state, 4, seed ^ 0xbeeff00d);

@@ -3159,6 +3466,7 @@ static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned i
                     ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
    }
}

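Note: seed_rng now special-cases a one-wide gang with plain scalar seeding and
otherwise derives decorrelated per-lane streams through __seed4. The usual
calling pattern, sketched under the signatures above (the wrapper function is
hypothetical, and it assumes frandom accepts the same state pointer):

void noise(uniform float buf[], uniform int n, uniform unsigned int seed) {
    uniform RNGState state;        // one gang-wide generator state
    seed_rng(&state, seed);
    foreach (i = 0 ... n)
        buf[i] = frandom(&state);  // varying pseudo-random floats
}
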
static inline void fastmath() {
    __fastmath();
@@ -4,11 +4,15 @@ import sys

t=str(sys.argv[1])

print "char stdlib_" + t + "_code[] = { "
sys.stdout.write("char stdlib_" + t + "_code[] = {\n")

for line in sys.stdin:
    for c in line:
        print ord(c)
        print ", "
width = 16
data = sys.stdin.read()
for i in range(0, len(data), 1):
    sys.stdout.write("0x%0.2X, " % ord(data[i:i+1]))

    if i%width == (width-1):
        sys.stdout.write("\n")

sys.stdout.write("0x00 };\n\n")

print "0 };"

76
stmt.h
@@ -282,6 +282,60 @@ public:
};


/** Statement corresponding to a "case" label in the program. In addition
    to the value associated with the "case", this statement also stores the
    statements following it. */
class CaseStmt : public Stmt {
public:
    CaseStmt(int value, Stmt *stmt, SourcePos pos);

    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;

    Stmt *TypeCheck();
    int EstimateCost() const;

    /** Integer value after the "case" statement */
    const int value;
    Stmt *stmts;
};


/** Statement for a "default" label (as would be found inside a "switch"
    statement). */
class DefaultStmt : public Stmt {
public:
    DefaultStmt(Stmt *stmt, SourcePos pos);

    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;

    Stmt *TypeCheck();
    int EstimateCost() const;

    Stmt *stmts;
};


/** A "switch" statement in the program. */
class SwitchStmt : public Stmt {
public:
    SwitchStmt(Expr *expr, Stmt *stmts, SourcePos pos);

    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;

    Stmt *TypeCheck();
    int EstimateCost() const;

    /** Expression that is used to determine which label to jump to. */
    Expr *expr;
    /** Statement block after the "switch" expression. */
    Stmt *stmts;
};


/** A "goto" in an ispc program. */
class GotoStmt : public Stmt {
public:
    GotoStmt(const char *label, SourcePos gotoPos, SourcePos idPos);

@@ -293,11 +347,14 @@ public:
    Stmt *TypeCheck();
    int EstimateCost() const;

    /** Name of the label to jump to when the goto is executed. */
    std::string label;
    SourcePos identifierPos;
};


/** Statement corresponding to a label (as would be used as a goto target)
    in the program. */
class LabeledStmt : public Stmt {
public:
    LabeledStmt(const char *label, Stmt *stmt, SourcePos p);

@@ -309,7 +366,9 @@ public:
    Stmt *TypeCheck();
    int EstimateCost() const;

    /** Name of the label. */
    std::string name;
    /** Statements following the label. */
    Stmt *stmt;
};

@@ -383,4 +442,21 @@ public:
    Expr *expr;
};


/** Representation of a delete statement in the program. */
class DeleteStmt : public Stmt {
public:
    DeleteStmt(Expr *e, SourcePos p);

    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;

    Stmt *TypeCheck();
    int EstimateCost() const;

    /** Expression that gives the pointer value to be deleted. */
    Expr *expr;
};

#endif // ISPC_STMT_H

@@ -15,7 +15,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
if (programCount == 4)
|
||||
if (programCount == 1)
|
||||
RET[programIndex] = 1;
|
||||
else if (programCount == 4)
|
||||
RET[programIndex] = 5.;
|
||||
else
|
||||
RET[programIndex] = 10.;
|
||||
|
||||
@@ -3,13 +3,13 @@ export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float a[programCount];
|
||||
for (unsigned int i = 0; i < programCount; ++i)
|
||||
a[i] = aFOO[i];
|
||||
uniform float a[programCount+4];
|
||||
for (unsigned int i = 0; i < programCount+4; ++i)
|
||||
a[i] = aFOO[min((int)i, programCount)];
|
||||
|
||||
RET[programIndex] = *(a + 2);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 3;
|
||||
RET[programIndex] = (programCount == 1) ? 2 : 3;
|
||||
}
|
||||
|
||||
@@ -14,4 +14,4 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
}
|
||||
|
||||
|
||||
export void result(uniform float RET[]) { RET[programIndex] = 5; }
|
||||
export void result(uniform float RET[]) { RET[programIndex] = programCount == 1 ? 0 : 5; }
|
||||
|
||||
@@ -14,7 +14,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
if (programCount == 4)
|
||||
if (programCount == 1)
|
||||
RET[programIndex] = 0;
|
||||
else if (programCount == 4)
|
||||
RET[programIndex] = 2;
|
||||
else
|
||||
RET[programIndex] = 4;
|
||||
|
||||
@@ -13,5 +13,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 2;
|
||||
RET[programIndex] = programCount == 1 ? 1 : 2;
|
||||
}
|
||||
|
||||
@@ -12,5 +12,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = (programCount/2) - 1;
|
||||
RET[programIndex] = programCount == 1 ? 0 : ((programCount/2) - 1);
|
||||
}
|
||||
|
||||
@@ -5,11 +5,11 @@ uniform int32 s = 0xff;
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
int32 bits = 0xfffffff0;
|
||||
int32 bits = 0xfff0;
|
||||
float b = atomic_xor_global(&s, bits);
|
||||
RET[programIndex] = s;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0xff;
|
||||
RET[programIndex] = (programCount & 1) ? 0xff0f : 0xff;
|
||||
}
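The updated expectation follows from XOR being its own inverse: every program instance XORs the same constant into s, and pairs of applications cancel, so only the parity of programCount matters:

s = 0xff ^ 0xfff0 = 0xff0f    if programCount is odd
s = 0xff                      if programCount is even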

@@ -10,5 +10,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
}

export void result(uniform float RET[]) {
    RET[programIndex] = 3000;
    RET[programIndex] = (programCount == 1) ? 2 : 3000;
}

@@ -12,5 +12,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
}

export void result(uniform float RET[]) {
    RET[programIndex] = programCount;
    RET[programIndex] = (programCount == 1) ? 0 : programCount;
}

@@ -13,5 +13,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1;
    RET[programIndex] = (programCount == 1) ? 0 : 1;
}

17 tests/atomics-swap.ispc Normal file
@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }

uniform int32 s = 1234;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1) {
        b = atomic_swap_global(&s, programIndex);
    }
    RET[programIndex] = reduce_add(b) + s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);
}
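However the swaps among the odd lanes are serialized, the accounting telescopes: each swap returns the value the previous one stored, so reduce_add(b) + s always equals the initial 1234 plus the sum of the swapped-in values. With a 4-wide gang, for example, lanes 1 and 3 swap in 1 and 3 in either order, and reduce_add(b) + s = 1234 + 1 + 3 = 1238.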

@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int a = aFOO[programIndex];
    int br = broadcast(a, (uniform int)b-2);
    int br = (programCount == 1) ? 4 : broadcast(a, (uniform int)b-2);
    RET[programIndex] = br;
}

@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    int16 a = aFOO[programIndex];
    int16 b = broadcast(a, 2);
    int16 b = (programCount == 1) ? 3 : broadcast(a, 2);
    RET[programIndex] = b;
}

@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int8 a = aFOO[programIndex];
    int8 br = broadcast(a, (uniform int)b-2);
    int8 br = (programCount == 1) ? 4 : broadcast(a, (uniform int)b-2);
    RET[programIndex] = br;
}

@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = broadcast(a, 2);
    float b = (programCount == 1) ? 3 : broadcast(a, 2);
    RET[programIndex] = b;
}

10 tests/const-fold-select-1.ispc Normal file
@@ -0,0 +1,10 @@
export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    RET[programIndex] = (programIndex >= 0) ? 1 : 0;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1;
}

10 tests/const-fold-select-2.ispc Normal file
@@ -0,0 +1,10 @@
export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    RET[programIndex] = (programCount < 10000) ? 1 : 0;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1;
}

@@ -3,9 +3,9 @@ export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
    double a = programIndex;
    RET[programIndex] = extract(a, 3);
    RET[programIndex] = extract(a, min(programCount-1, 3));
}

export void result(uniform float RET[]) {
    RET[programIndex] = 3;
    RET[programIndex] = (programCount == 1) ? 0 : 3;
}

30 tests/foreach-double-1.ispc Normal file
@@ -0,0 +1,30 @@
export uniform int width() { return programCount; }

uniform double one = 1;

void copy(uniform double dst[], uniform double src[], uniform int count) {
    foreach (i = 0 ... count)
        dst[i] = one * src[i];
}

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int count = 200 + aFOO[1];
    uniform double * uniform src = uniform new uniform double[count];
    for (uniform int i = 0; i < count; ++i)
        src[i] = i;

    uniform double * uniform dst = uniform new uniform double[count];
    copy(dst, src, count);

    uniform int errors = 0;
    for (uniform int i = 0; i < count; ++i)
        if (dst[i] != src[i])
            ++errors;

    RET[programIndex] = errors;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
}

21 tests/half-3.ispc Normal file
@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }

export void f_v(uniform float RET[]) {
    int errors = 0;

    foreach (i = 0 ... 65535) {
        unsigned int16 h = i;
        float f = half_to_float(i);
        h = float_to_half(f);

        int mismatches = (f == f && i != h);
        errors += reduce_add(mismatches);
    }

    RET[programIndex] = errors;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
}
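The f == f guard is the standard IEEE idiom for filtering NaNs, which compare unequal to themselves; NaN encodings are excluded from the mismatch count because float_to_half need not preserve their payload bits. The idiom in isolation (a hypothetical helper, not part of the test):

uniform bool is_nan(uniform float f) {
    return f != f;   // true exactly for NaN under IEEE 754
}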

15 tests/local-atomics-1.ispc Normal file
@@ -0,0 +1,15 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float delta = 1;
    float b = atomic_add_local(&s, delta);
    RET[programIndex] = reduce_add(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = reduce_add(programIndex);
}
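This file opens a series of tests mirroring the earlier global atomics with the new atomic_*_local variants; presumably these need only be atomic with respect to the program instances of the current gang rather than across concurrently running tasks, so the per-gang results checked here are the same as for the global forms. For this test in particular, the serialized adds return the old values 0, 1, ..., programCount-1 in some order, so reduce_add(b) equals reduce_add(programIndex) regardless of the order.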

17 tests/local-atomics-10.ispc Normal file
@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    float delta = 1;
    if (programIndex < 2)
        b = atomic_add_local(&s, delta);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = programCount == 1 ? 1 : 2;
}

20 tests/local-atomics-11.ispc Normal file
@@ -0,0 +1,20 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1)
        b = atomic_add_local(&s, programIndex);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    uniform int sum = 0;
    for (uniform int i = 0; i < programCount; ++i)
        if (i & 1)
            sum += i;
    RET[programIndex] = sum;
}

20 tests/local-atomics-12.ispc Normal file
@@ -0,0 +1,20 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1)
        b = atomic_or_local(&s, (1 << programIndex));
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    uniform int sum = 0;
    for (uniform int i = 0; i < programCount; ++i)
        if (i & 1)
            sum += (1 << i);
    RET[programIndex] = sum;
}

16 tests/local-atomics-13.ispc Normal file
@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1)
        b = atomic_or_local(&s, (1 << programIndex));
    RET[programIndex] = popcnt(reduce_max((int32)b));
}

export void result(uniform float RET[]) {
    RET[programIndex] = programCount == 1 ? 0 : ((programCount/2) - 1);
}

20 tests/local-atomics-14.ispc Normal file
@@ -0,0 +1,20 @@
export uniform int width() { return programCount; }

uniform unsigned int64 s = 0xffffffffff000000;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1)
        b = atomic_or_local(&s, (1 << programIndex));
    RET[programIndex] = (s>>20);
}

export void result(uniform float RET[]) {
    uniform int sum = 0;
    for (uniform int i = 0; i < programCount; ++i)
        if (i & 1)
            sum += (1 << i);
    RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20;
}

15 tests/local-atomics-2.ispc Normal file
@@ -0,0 +1,15 @@
export uniform int width() { return programCount; }

uniform int64 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float delta = 1;
    float b = atomic_add_local(&s, delta);
    RET[programIndex] = reduce_add(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = reduce_add(programIndex);
}

15 tests/local-atomics-3.ispc Normal file
@@ -0,0 +1,15 @@
export uniform int width() { return programCount; }

uniform int32 s = 0xff;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    int32 bits = 0xfff0;
    float b = atomic_xor_local(&s, bits);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = (programCount & 1) ? 0xff0f : 0xff;
}

14 tests/local-atomics-4.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_or_local(&s, (1<<programIndex));
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = (1<<programCount)-1;
}

14 tests/local-atomics-5.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform int32 s = 0xbeef;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_swap_local(&s, programIndex);
    RET[programIndex] = reduce_max(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0xbeef;
}

14 tests/local-atomics-6.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform int32 s = 2;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = atomic_compare_exchange_local(&s, programIndex, a*1000);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = (programCount == 1) ? 2 : 3000;
}

14 tests/local-atomics-7.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    int32 a = aFOO[programIndex];
    float b = atomic_min_local(&s, a);
    RET[programIndex] = reduce_min(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = reduce_min(programIndex);
}

16 tests/local-atomics-8.ispc Normal file
@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }

uniform int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    int32 a = aFOO[programIndex];
    int32 b = 0;
    if (programIndex & 1)
        b = atomic_max_local(&s, a);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = (programCount == 1) ? 0 : programCount;
}

17 tests/local-atomics-9.ispc Normal file
@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    int32 delta = 1;
    if (programIndex < 2)
        b = atomic_add_local(&s, delta);
    RET[programIndex] = reduce_add(b);
}

export void result(uniform float RET[]) {
    RET[programIndex] = (programCount == 1) ? 0 : 1;
}

17 tests/local-atomics-swap.ispc Normal file
@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }

uniform int32 s = 1234;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    if (programIndex & 1) {
        b = atomic_swap_local(&s, programIndex);
    }
    RET[programIndex] = reduce_add(b) + s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);
}

14 tests/local-atomics-uniform-1.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 10;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform unsigned int32 b = atomic_add_local(&s, 1);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 11;
}

14 tests/local-atomics-uniform-2.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0b1010;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform unsigned int32 b = atomic_or_local(&s, 1);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0b1011;
}

14 tests/local-atomics-uniform-3.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0b1010;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform unsigned int32 b = atomic_or_local(&s, 1);
    RET[programIndex] = b;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0b1010;
}

14 tests/local-atomics-uniform-4.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0xffff;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform unsigned int32 b = atomic_min_local(&s, 1);
    RET[programIndex] = b;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0xffff;
}

14 tests/local-atomics-uniform-5.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s = 0xffff;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform unsigned int32 b = atomic_min_local(&s, 1);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1;
}

14 tests/local-atomics-uniform-6.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform float s = 100.;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform float b = atomic_swap_local(&s, 1.);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1.;
}

14 tests/local-atomics-uniform-7.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform float s = 100.;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform float b = atomic_swap_local(&s, 1.);
    RET[programIndex] = b;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 100.;
}

14 tests/local-atomics-uniform-8.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform float s = 100.;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform float b = atomic_compare_exchange_local(&s, 1., -100.);
    RET[programIndex] = b;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 100.;
}

14 tests/local-atomics-uniform-9.ispc Normal file
@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }

uniform int64 s = 100.;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    uniform int64 b = atomic_compare_exchange_local(&s, 100, -100);
    RET[programIndex] = s;
}

export void result(uniform float RET[]) {
    RET[programIndex] = -100.;
}

18 tests/local-atomics-varyingptr-1.ispc Normal file
@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }

uniform unsigned int32 s[programCount];

export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    float delta = 1;
    if (programIndex < 2)
        atomic_add_local(&s[programIndex], delta);
    RET[programIndex] = s[programIndex];
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = RET[1] = 1;
}
Some files were not shown because too many files have changed in this diff.