diff --git a/Makefile b/Makefile
index e39eb831..0d61c611 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,15 @@
 # ispc Makefile
 #
 
+# If you have your own special version of llvm and/or clang, change these
+# variables to match.  (":=" runs each $(shell ...) once, not on every use.)
+LLVM_CONFIG := $(shell which llvm-config)
+CLANG_INCLUDE := $(shell $(LLVM_CONFIG) --includedir)
+
+# Put llvm's bin directory first on PATH so scripts find the same llvm-config
+LLVM_BIN := $(shell $(LLVM_CONFIG) --bindir)
+export PATH := $(LLVM_BIN):$(PATH)
+
 ARCH_OS = $(shell uname)
 ifeq ($(ARCH_OS), Darwin)
 	ARCH_OS2 = "OSX"
@@ -10,10 +19,12 @@ else
 endif
 ARCH_TYPE = $(shell arch)
 
-ifeq ($(shell llvm-config --version), 3.1svn)
+ifeq ($(shell $(LLVM_CONFIG) --version), 3.0)
+  LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs)
+else
   LLVM_LIBS=-lLLVMAsmParser -lLLVMInstrumentation -lLLVMLinker			\
 	-lLLVMArchive -lLLVMBitReader -lLLVMDebugInfo -lLLVMJIT -lLLVMipo	\
-	-lLLVMBitWriter -lLLVMTableGen -lLLVMCBackendInfo			\
+	-lLLVMBitWriter -lLLVMTableGen 			                        \
 	-lLLVMX86Disassembler -lLLVMX86CodeGen -lLLVMSelectionDAG		\
 	-lLLVMAsmPrinter -lLLVMX86AsmParser -lLLVMX86Desc -lLLVMX86Info		\
 	-lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCDisassembler	-lLLVMMCParser	\
@@ -21,19 +32,17 @@ ifeq ($(shell llvm-config --version), 3.1svn)
 	-lLLVMipa -lLLVMAnalysis -lLLVMMCJIT -lLLVMRuntimeDyld			\
 	-lLLVMExecutionEngine -lLLVMTarget -lLLVMMC -lLLVMObject -lLLVMCore 	\
 	-lLLVMSupport
-else
-  LLVM_LIBS=$(shell llvm-config --libs)
 endif
 
 CLANG=clang
 CLANG_LIBS = -lclangFrontend -lclangDriver \
              -lclangSerialization -lclangParse -lclangSema \
              -lclangAnalysis -lclangAST -lclangLex -lclangBasic
-ifeq ($(shell llvm-config --version), 3.1svn)
+ifneq ($(shell $(LLVM_CONFIG) --version), 3.0)
   CLANG_LIBS += -lclangEdit
 endif
 
-ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
+ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
 	-lpthread
 
 ifeq ($(ARCH_OS),Linux)
@@ -44,8 +53,8 @@ ifeq ($(ARCH_OS2),Msys)
 	ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
 endif
 
-LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
-LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
+LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
+LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
 LLVM_VERSION_DEF=-D$(LLVM_VERSION)
 
 BUILD_DATE=$(shell date +%Y%m%d)
@@ -53,8 +62,9 @@ BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
 
 CXX=g++
 CPP=cpp
-OPT=-g3
-CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
+OPT=-O2
+CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE)  \
+	-Wall $(LLVM_VERSION_DEF) \
 	-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
 
 LDFLAGS=
@@ -75,7 +85,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
 TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
-	generic-16 generic-1
+	generic-16 generic-32 generic-64 generic-1
 BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
 	builtins/dispatch.ll
 BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
@@ -114,7 +124,7 @@ doxygen:
 
 ispc: print_llvm_src dirs $(OBJS)
 	@echo Creating ispc executable
-	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
+	@$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
 
 objs/%.o: %.cpp
 	@echo Compiling $<
diff --git a/ast.cpp b/ast.cpp
index c89f00bb..96c41616 100644
--- a/ast.cpp
+++ b/ast.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -32,8 +32,10 @@
 */
 
 /** @file ast.cpp
-    @brief 
-*/
+
+    @brief General functionality related to abstract syntax trees and
+    traversal of them.
+ */
 
 #include "ast.h"
 #include "expr.h"
@@ -53,10 +55,10 @@ ASTNode::~ASTNode() {
 // AST
 
 void
-AST::AddFunction(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code) {
+AST::AddFunction(Symbol *sym, Stmt *code) {
     if (sym == NULL)
         return;
-    functions.push_back(new Function(sym, args, code));
+    functions.push_back(new Function(sym, code));
 }
 
 
@@ -151,7 +153,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
             ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
         else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
-            rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data);
+            rs->expr = (Expr *)WalkAST(rs->expr, preFunc, postFunc, data);
         else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
             std::vector<Stmt *> &sls = sl->stmts;
             for (unsigned int i = 0; i < sls.size(); ++i)
@@ -305,19 +307,39 @@ TypeCheck(Stmt *stmt) {
 }
 
 
+struct CostData {
+    CostData() : cost(0), foreachDepth(0) { }
+
+    int cost;          // sum of EstimateCost() over the nodes counted so far
+    int foreachDepth;  // incremented/decremented around ForeachStmt nodes
+};
+
+
 static bool
-lCostCallback(ASTNode *node, void *c) {
-    int *cost = (int *)c;
-    *cost += node->EstimateCost();
+lCostCallbackPre(ASTNode *node, void *d) {
+    CostData *data = (CostData *)d;
+    if (dynamic_cast<ForeachStmt *>(node) != NULL)
+        ++data->foreachDepth;  // paired with the decrement in lCostCallbackPost()
+    if (data->foreachDepth == 0)  // a foreach node itself is already at depth >= 1
+        data->cost += node->EstimateCost();
     return true;
 }
 
 
+static ASTNode *
+lCostCallbackPost(ASTNode *node, void *d) {
+    // Matches the ++foreachDepth done in lCostCallbackPre() for the same node.
+    if (dynamic_cast<ForeachStmt *>(node) != NULL)
+        --static_cast<CostData *>(d)->foreachDepth;
+    return node;
+}
+
+
 int
 EstimateCost(ASTNode *root) {
-    int cost = 0;
-    WalkAST(root, lCostCallback, NULL, &cost);
-    return cost;
+    CostData data;  // constructor zeroes both cost and foreachDepth
+    WalkAST(root, lCostCallbackPre, lCostCallbackPost, &data);
+    return data.cost;  // total accumulated by lCostCallbackPre()
 }
 
 
@@ -334,10 +356,10 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
             return false;
 
         const Type *type = fce->func->GetType();
-        const PointerType *pt = dynamic_cast<const PointerType *>(type);
+        const PointerType *pt = CastType<PointerType>(type);
         if (pt != NULL)
             type = pt->GetBaseType();
-        const FunctionType *ftype = dynamic_cast<const FunctionType *>(type);
+        const FunctionType *ftype = CastType<FunctionType>(type);
         Assert(ftype != NULL);
 
         if (ftype->isSafe == false) {
@@ -363,17 +385,22 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
         return false;
     }
 
-    if (g->target.allOffMaskIsSafe == true)
-        // Don't worry about memory accesses if we have a target that can
-        // safely run them with the mask all off
-        return true;
+    if (dynamic_cast<ForeachStmt *>(node) != NULL) {
+        // foreach() statements also shouldn't be run with an all-off mask.
+        // Since they re-establish an 'all on' mask, this would be pretty
+        // unintuitive.  (More generally, it's possibly a little strange to
+        // allow foreach() in the presence of any non-uniform control
+        // flow...)
+        *okPtr = false;
+        return false;
+    }
 
     IndexExpr *ie;
     if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
         const Type *type = ie->baseExpr->GetType();
         if (type == NULL)
             return true;
-        if (dynamic_cast<const ReferenceType *>(type) != NULL)
+        if (CastType<ReferenceType>(type) != NULL)
             type = type->GetReferenceTarget();
 
         ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
@@ -383,16 +410,14 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
             return false;
         }
 
-        const PointerType *pointerType = 
-            dynamic_cast<const PointerType *>(type);
+        const PointerType *pointerType = CastType<PointerType>(type);
         if (pointerType != NULL) {
             // pointer[index] -> can't be sure -> not safe
             *okPtr = false;
             return false;
         }
 
-        const SequentialType *seqType = 
-            dynamic_cast<const SequentialType *>(type);
+        const SequentialType *seqType = CastType<SequentialType>(type);
         Assert(seqType != NULL);
         int nElements = seqType->GetElementCount();
         if (nElements == 0) {
diff --git a/ast.h b/ast.h
index 0f73677b..f03d7343 100644
--- a/ast.h
+++ b/ast.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -84,8 +84,7 @@ class AST {
 public:
     /** Add the AST for a function described by the given declaration
         information and source code. */
-    void AddFunction(Symbol *sym, const std::vector<Symbol *> &args, 
-                     Stmt *code);
+    void AddFunction(Symbol *sym, Stmt *code);
 
     /** Generate LLVM IR for all of the functions into the current
         module. */
diff --git a/builtins.cpp b/builtins.cpp
index 0e34596d..db55758a 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -157,7 +157,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
 
 static void
 lCreateSymbol(const std::string &name, const Type *returnType, 
-              const std::vector<const Type *> &argTypes, 
+              llvm::SmallVector<const Type *, 8> &argTypes, 
               const llvm::FunctionType *ftype, llvm::Function *func, 
               SymbolTable *symbolTable) {
     SourcePos noPos;
@@ -199,7 +199,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
     // bool, so just have a one-off override for that one...
     if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") {
         const Type *returnType = AtomicType::VaryingInt32;
-        std::vector<const Type *> argTypes;
+        llvm::SmallVector<const Type *, 8> argTypes;
         argTypes.push_back(AtomicType::VaryingBool);
 
         FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
@@ -229,7 +229,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
         // Iterate over the arguments and try to find their equivalent ispc
         // types.  Track if any of the arguments has an integer type.
         bool anyIntArgs = false;
-        std::vector<const Type *> argTypes;
+        llvm::SmallVector<const Type *, 8> argTypes;
         for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
             const llvm::Type *llvmArgType = ftype->getParamType(j);
             const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
@@ -291,7 +291,7 @@ lCheckModuleIntrinsics(llvm::Module *module) {
         if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
             llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
             Assert(id != 0);
-            LLVM_TYPE_CONST llvm::Type *intrinsicType = 
+            llvm::Type *intrinsicType = 
                 llvm::Intrinsic::getType(*g->ctx, id);
             intrinsicType = llvm::PointerType::get(intrinsicType, 0);
             Assert(func->getType() == intrinsicType);
@@ -411,12 +411,16 @@ lSetInternalFunctions(llvm::Module *module) {
         "__extract_int64",
         "__extract_int8",
         "__fastmath",
+        "__float_to_half_uniform",
+        "__float_to_half_varying",
         "__floatbits_uniform_int32",
         "__floatbits_varying_int32",
         "__floor_uniform_double",
         "__floor_uniform_float",
         "__floor_varying_double",
         "__floor_varying_float",
+        "__half_to_float_uniform",
+        "__half_to_float_varying",
         "__insert_int16",
         "__insert_int32",
         "__insert_int64",
@@ -616,9 +620,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
 
         std::string(linkError);
         if (llvm::Linker::LinkModules(module, bcModule, 
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
                                       llvm::Linker::DestroySource,
-#endif // LLVM_3_0
                                       &linkError))
             Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
         lSetInternalFunctions(module);
@@ -635,16 +637,36 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
 static void
 lDefineConstantInt(const char *name, int val, llvm::Module *module,
                    SymbolTable *symbolTable) {
-    Symbol *pw = 
+    Symbol *sym = 
         new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType(),
                    SC_STATIC);
-    pw->constValue = new ConstExpr(pw->type, val, SourcePos());
-    LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
+    sym->constValue = new ConstExpr(sym->type, val, SourcePos());
+    llvm::Type *ltype = LLVMTypes::Int32Type;
     llvm::Constant *linit = LLVMInt32(val);
-    pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
-                                              llvm::GlobalValue::InternalLinkage,
-                                              linit, pw->name.c_str());
-    symbolTable->AddVariable(pw);
+    // Use WeakODRLinkage rather than InternalLinkage so that a definition
+    // survives even if it's not used in the module, so that the symbol is
+    // there in the debugger.
+    sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
+                                               llvm::GlobalValue::WeakODRLinkage,
+                                               linit, name);
+    symbolTable->AddVariable(sym);
+
+    if (m->diBuilder != NULL) {
+        llvm::DIFile file;
+        llvm::DIType diType = sym->type->GetDIType(file);
+        Assert(diType.Verify());
+        // FIXME? DWARF says this (and programIndex below) should have
+        // DW_AT_artificial; unclear if it matters.  NOTE(review): 'static'
+        // is true here but false for programIndex -- confirm intent.
+        llvm::DIGlobalVariable var = 
+            m->diBuilder->createGlobalVariable(name, 
+                                               file,
+                                               0 /* line */,
+                                               diType,
+                                               true /* static */,
+                                               sym->storagePtr);
+        Assert(var.Verify());
+    }
 }
 
 
@@ -652,7 +674,7 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
 static void
 lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
                        SymbolTable *symbolTable) {
-    std::vector<const Type *> args;
+    llvm::SmallVector<const Type *, 8> args;
     FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
     Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
 
@@ -670,21 +692,37 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
 
 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
-    Symbol *pidx = 
+    Symbol *sym = 
         new Symbol("programIndex", SourcePos(), 
                    AtomicType::VaryingInt32->GetAsConstType(), SC_STATIC);
 
     int pi[ISPC_MAX_NVEC];
     for (int i = 0; i < g->target.vectorWidth; ++i)
         pi[i] = i;
-    pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos());
+    sym->constValue = new ConstExpr(sym->type, pi, SourcePos());
 
-    LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32VectorType;
+    llvm::Type *ltype = LLVMTypes::Int32VectorType;
     llvm::Constant *linit = LLVMInt32Vector(pi);
-    pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
-                                                llvm::GlobalValue::InternalLinkage, linit, 
-                                                pidx->name.c_str());
-    symbolTable->AddVariable(pidx);
+    // WeakODRLinkage keeps this even if unused; see lDefineConstantInt().
+    sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
+                                               llvm::GlobalValue::WeakODRLinkage,
+                                               linit, 
+                                               sym->name.c_str());
+    symbolTable->AddVariable(sym);
+
+    if (m->diBuilder != NULL) {
+        llvm::DIFile file;
+        llvm::DIType diType = sym->type->GetDIType(file);
+        Assert(diType.Verify());
+        llvm::DIGlobalVariable var =
+            m->diBuilder->createGlobalVariable(sym->name.c_str(), 
+                                               file,
+                                               0 /* line */,
+                                               diType,
+                                               false /* static */,
+                                               sym->storagePtr);
+        Assert(var.Verify());
+    }
 }
 
 
@@ -809,6 +847,20 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                                builtins_bitcode_generic_16_length, 
                                module, symbolTable);
             break;
+        case 32:
+            extern unsigned char builtins_bitcode_generic_32[];
+            extern int builtins_bitcode_generic_32_length;
+            AddBitcodeToModule(builtins_bitcode_generic_32, 
+                               builtins_bitcode_generic_32_length, 
+                               module, symbolTable);
+            break;
+        case 64:
+            extern unsigned char builtins_bitcode_generic_64[];
+            extern int builtins_bitcode_generic_64_length;
+            AddBitcodeToModule(builtins_bitcode_generic_64, 
+                               builtins_bitcode_generic_64_length, 
+                               module, symbolTable);
+            break;
 	case 1:
             extern unsigned char builtins_bitcode_generic_1[];
             extern int builtins_bitcode_generic_1_length;
@@ -841,10 +893,12 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                        symbolTable);
     lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
                        symbolTable);
-    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
-                           symbolTable);
+    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
+                           module, symbolTable);
 
-    lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
+    lDefineConstantInt("__have_native_half", g->target.hasHalf, module, 
+                       symbolTable);
+    lDefineConstantInt("__have_native_transcendentals", g->target.hasTranscendentals,
                        module, symbolTable);
 
     if (includeStdlibISPC) {
diff --git a/builtins/builtins.c b/builtins/builtins.c
index 36498e1a..8e1a5624 100644
--- a/builtins/builtins.c
+++ b/builtins/builtins.c
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -70,7 +70,7 @@ typedef int Bool;
     putchar('[');                                                       \
     for (int i = 0; i < width; ++i) {                                   \
         /* only print the value if the current lane is executing */     \
-        if (mask & (1<<i))                                              \
+        if (mask & (1ull<<i))                                           \
             printf(fmt, ((type *)ptr)[i]);                              \
         else                                                            \
             printf("((" fmt "))", ((type *)ptr)[i]);                    \
@@ -89,7 +89,7 @@ typedef int Bool;
     @param mask    Current lane mask when the print statemnt is called
     @param args    Array of pointers to the values to be printed
  */
-void __do_print(const char *format, const char *types, int width, int mask, 
+void __do_print(const char *format, const char *types, int width, uint64_t mask, 
                 void **args) {
     if (mask == 0) 
         return;
@@ -113,7 +113,7 @@ void __do_print(const char *format, const char *types, int width, int mask,
                 case 'B': {
                     putchar('[');
                     for (int i = 0; i < width; ++i) {
-                        if (mask & (1<<i))
+                        if (mask & (1ull << i))
                             printf("%s", ((Bool *)ptr)[i] ? "true" : "false");
                         else
                             printf("_________");
diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll
index c1979e30..13c4335d 100644
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -175,7 +175,7 @@ define <16 x float> @__min_varying_float(<16 x float>,
 
 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
 
-define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
   %floatmask = bitcast <16 x i32> %0 to <16 x float>
   %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -186,7 +186,8 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
 
   %v1shift = shl i32 %v1, 8
   %v = or i32 %v1shift, %v0
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll
index 53659b7c..608d2dcd 100644
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -175,10 +175,11 @@ define <8 x float> @__min_varying_float(<8 x float>,
 
 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
 
-define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
   %floatmask = bitcast <8 x i32> %0 to <8 x float>
   %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index ad911e64..5ced9da9 100755
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -186,14 +186,14 @@ define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
   ret void
 }
 
-define  i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
+define  i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
   %item = extractelement <1 x i32> %0, i32 0
   %v = lshr i32 %item, 31
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }
 
 
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
 ;;
diff --git a/builtins/target-generic-32.ll b/builtins/target-generic-32.ll
new file mode 100644
index 00000000..5f89bcdf
--- /dev/null
+++ b/builtins/target-generic-32.ll
@@ -0,0 +1,33 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+define(`WIDTH',`32')
+include(`target-generic-common.ll')
diff --git a/builtins/target-generic-64.ll b/builtins/target-generic-64.ll
new file mode 100644
index 00000000..09443f8e
--- /dev/null
+++ b/builtins/target-generic-64.ll
@@ -0,0 +1,33 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+define(`WIDTH',`64')
+include(`target-generic-common.ll')
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index e4c70aa4..6bf90d95 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -39,12 +39,12 @@ reduce_equal(WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; broadcast/rotate/shuffle
 
-declare <WIDTH x float> @__smear_float(float) nounwind readnone
-declare <WIDTH x double> @__smear_double(double) nounwind readnone
-declare <WIDTH x i8> @__smear_i8(i8) nounwind readnone
-declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
-declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
-declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
+declare <WIDTH x float> @__smear_float(<WIDTH x float>, float) nounwind readnone
+declare <WIDTH x double> @__smear_double(<WIDTH x double>, double) nounwind readnone
+declare <WIDTH x i8> @__smear_i8(<WIDTH x i8>, i8) nounwind readnone
+declare <WIDTH x i16> @__smear_i16(<WIDTH x i16>, i16) nounwind readnone
+declare <WIDTH x i32> @__smear_i32(<WIDTH x i32>, i32) nounwind readnone
+declare <WIDTH x i64> @__smear_i64(<WIDTH x i64>, i64) nounwind readnone
 
 declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
 declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
@@ -201,7 +201,7 @@ declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions
 
-declare i32 @__movmsk(<WIDTH x i1>) nounwind readnone 
+declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone 
 
 declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
 declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone 
@@ -249,7 +249,16 @@ declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
 declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
                                 <WIDTH x i1> %mask) nounwind 
 
-ifelse(LLVM_VERSION, `LLVM_3_1svn',`
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
+                                     <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
+                                      <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
+                                      <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
+                                      <WIDTH x i1> %mask) nounwind 
+', `
 define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
                                      <WIDTH x i1>) nounwind alwaysinline {
   %v = load <WIDTH x i8> * %0
@@ -281,15 +290,6 @@ define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
   store <WIDTH x i64> %v1, <WIDTH x i64> * %0
   ret void
 }
-',`
-declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
-                                     <WIDTH x i1>) nounwind 
-declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
-                                      <WIDTH x i1>) nounwind 
-declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
-                                      <WIDTH x i1>) nounwind 
-declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
-                                      <WIDTH x i1> %mask) nounwind 
 ')
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index 2e6d1bdc..65d30939 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -295,7 +295,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
 
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
 
-define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
   ; first do two 4-wide movmsk calls
   %floatmask = bitcast <8 x i32> %0 to <8 x float>
   %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -309,7 +309,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
   ; of the second one
   %v1s = shl i32 %v1, 4
   %v = or i32 %v0, %v1s
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }
 
 define <4 x float> @__vec4_add_float(<4 x float> %v0,
diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll
index 21ffb267..e6eb7390 100644
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -239,10 +239,11 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
 
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
 
-define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
   %floatmask = bitcast <4 x i32> %0 to <4 x float>
   %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }
 
 define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll
index 5a467ec2..1ac6b3e5 100644
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -237,7 +237,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>,
 
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
 
-define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
   ; first do two 4-wide movmsk calls
   %floatmask = bitcast <8 x i32> %0 to <8 x float>
   %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -251,7 +251,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
   ; of the second one
   %v1s = shl i32 %v1, 4
   %v = or i32 %v0, %v1s
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }
 
 define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll
index 9dfe9db7..98426b24 100644
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -271,10 +271,11 @@ define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alway
 
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
 
-define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
   %floatmask = bitcast <4 x i32> %0 to <4 x float>
   %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }
 
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
diff --git a/builtins/util.m4 b/builtins/util.m4
index 26cbfafb..59185942 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,18 @@ declare i1 @__is_compile_time_constant_uniform_int32(i32)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; It is a bit of a pain to compute these in m4 for 32 and 64-wide targets (eval() arithmetic is 32-bit signed), so those values are hard-coded.
+define(`ALL_ON_MASK',
+`ifelse(WIDTH, `64', `-1', 
+        WIDTH, `32', `4294967295',
+                     `eval((1<<WIDTH)-1)')')
+
+define(`MASK_HIGH_BIT_ON',
+`ifelse(WIDTH, `64', `-9223372036854775808',
+        WIDTH, `32', `2147483648',
+                     `eval(1<<(WIDTH-1))')')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Helper macro for calling various SSE instructions for scalar values
 ;; but where the instruction takes a vector parameter.
@@ -1529,7 +1541,7 @@ declare i32 @__fast_masked_vload()
 declare i8* @ISPCAlloc(i8**, i64, i32) nounwind
 declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind
 declare void @ISPCSync(i8*) nounwind
-declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind
+declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind
 
 declare i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %mask)
 declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
@@ -1654,6 +1666,265 @@ declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>,
 declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                  <WIDTH x i64>, <WIDTH x MASK>) nounwind
 
+declare float @__log_uniform_float(float) nounwind readnone
+declare <WIDTH x float> @__log_varying_float(<WIDTH x float>) nounwind readnone
+declare float @__exp_uniform_float(float) nounwind readnone
+declare <WIDTH x float> @__exp_varying_float(<WIDTH x float>) nounwind readnone
+declare float @__pow_uniform_float(float, float) nounwind readnone
+declare <WIDTH x float> @__pow_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+declare void @__use8(<WIDTH x i8>)
+declare void @__use16(<WIDTH x i16>)
+declare void @__use32(<WIDTH x i32>)
+declare void @__use64(<WIDTH x i64>)
+
+;; This is a temporary function that will be removed at the end of
+;; compilation--the idea is that it calls out to all of the various
+;; functions / pseudo-function declarations that we need to keep around
+;; so that they are available to the various optimization passes.  This
+;; then prevents those functions from being removed as dead code when
+;; we do early DCE...
+
+define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
+                               <WIDTH x i32> %v32, <WIDTH x i64> %v64,
+                               <WIDTH x MASK> %mask) {
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; loads
+  %ml8  = call <WIDTH x i8>  @__masked_load_8(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %ml8)
+  %ml16 = call <WIDTH x i16> @__masked_load_16(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %ml16)
+  %ml32 = call <WIDTH x i32> @__masked_load_32(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %ml32)
+  %ml64 = call <WIDTH x i64> @__masked_load_64(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %ml64)
+
+  %lb8   = call <WIDTH x i8>  @__load_and_broadcast_8(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %lb8)
+  %lb16  = call <WIDTH x i16> @__load_and_broadcast_16(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %lb16)
+  %lb32  = call <WIDTH x i32> @__load_and_broadcast_32(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %lb32)
+  %lb64  = call <WIDTH x i64> @__load_and_broadcast_64(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %lb64)
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; stores
+  %pv8 = bitcast i8 * %ptr to <WIDTH x i8> *
+  call void @__pseudo_masked_store_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
+                                     <WIDTH x MASK> %mask)
+  %pv16 = bitcast i8 * %ptr to <WIDTH x i16> *
+  call void @__pseudo_masked_store_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
+                                      <WIDTH x MASK> %mask)
+  %pv32 = bitcast i8 * %ptr to <WIDTH x i32> *
+  call void @__pseudo_masked_store_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
+                                      <WIDTH x MASK> %mask)
+  %pv64 = bitcast i8 * %ptr to <WIDTH x i64> *
+  call void @__pseudo_masked_store_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
+                                      <WIDTH x MASK> %mask)
+
+  call void @__masked_store_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__masked_store_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__masked_store_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__masked_store_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__masked_store_blend_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
+                                    <WIDTH x MASK> %mask)
+  call void @__masked_store_blend_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
+                                     <WIDTH x MASK> %mask)
+  call void @__masked_store_blend_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
+                                     <WIDTH x MASK> %mask)
+  call void @__masked_store_blend_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
+                                     <WIDTH x MASK> %mask)
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; gathers
+
+  %pg32_8 = call <WIDTH x i8>  @__pseudo_gather32_8(<WIDTH x i32> %v32,
+                                                    <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pg32_8)
+  %pg32_16 = call <WIDTH x i16>  @__pseudo_gather32_16(<WIDTH x i32> %v32,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pg32_16)
+  %pg32_32 = call <WIDTH x i32>  @__pseudo_gather32_32(<WIDTH x i32> %v32,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pg32_32)
+  %pg32_64 = call <WIDTH x i64>  @__pseudo_gather32_64(<WIDTH x i32> %v32,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pg32_64)
+
+  %pg64_8 = call <WIDTH x i8>  @__pseudo_gather64_8(<WIDTH x i64> %v64,
+                                                    <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pg64_8)
+  %pg64_16 = call <WIDTH x i16>  @__pseudo_gather64_16(<WIDTH x i64> %v64,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pg64_16)
+  %pg64_32 = call <WIDTH x i32>  @__pseudo_gather64_32(<WIDTH x i64> %v64,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pg64_32)
+  %pg64_64 = call <WIDTH x i64>  @__pseudo_gather64_64(<WIDTH x i64> %v64,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pg64_64)
+
+  %g32_8 = call <WIDTH x i8>  @__gather32_i8(<WIDTH x i32> %v32,
+                                            <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %g32_8)
+  %g32_16 = call <WIDTH x i16>  @__gather32_i16(<WIDTH x i32> %v32,
+                                               <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %g32_16)
+  %g32_32 = call <WIDTH x i32>  @__gather32_i32(<WIDTH x i32> %v32,
+                                               <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %g32_32)
+  %g32_64 = call <WIDTH x i64>  @__gather32_i64(<WIDTH x i32> %v32,
+                                               <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %g32_64)
+
+  %g64_8 = call <WIDTH x i8>  @__gather64_i8(<WIDTH x i64> %v64,
+                                            <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %g64_8)
+  %g64_16 = call <WIDTH x i16>  @__gather64_i16(<WIDTH x i64> %v64,
+                                               <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %g64_16)
+  %g64_32 = call <WIDTH x i32>  @__gather64_i32(<WIDTH x i64> %v64,
+                                               <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %g64_32)
+  %g64_64 = call <WIDTH x i64>  @__gather64_i64(<WIDTH x i64> %v64,
+                                                <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %g64_64)
+
+  %pgbo32_8 = call <WIDTH x i8>
+       @__pseudo_gather_base_offsets32_8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                         <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pgbo32_8)
+  %pgbo32_16 = call <WIDTH x i16>
+       @__pseudo_gather_base_offsets32_16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pgbo32_16)
+  %pgbo32_32 = call <WIDTH x i32>
+       @__pseudo_gather_base_offsets32_32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pgbo32_32)
+  %pgbo32_64 = call <WIDTH x i64>
+       @__pseudo_gather_base_offsets32_64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pgbo32_64)
+
+  %gbo32_8 = call <WIDTH x i8>
+       @__gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                  <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %gbo32_8)
+  %gbo32_16 = call <WIDTH x i16>
+       @__gather_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                   <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %gbo32_16)
+  %gbo32_32 = call <WIDTH x i32>
+       @__gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                   <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %gbo32_32)
+  %gbo32_64 = call <WIDTH x i64>
+       @__gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                   <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %gbo32_64)
+
+
+  %pgbo64_8 = call <WIDTH x i8>
+       @__pseudo_gather_base_offsets64_8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                         <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pgbo64_8)
+  %pgbo64_16 = call <WIDTH x i16>
+       @__pseudo_gather_base_offsets64_16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pgbo64_16)
+  %pgbo64_32 = call <WIDTH x i32>
+       @__pseudo_gather_base_offsets64_32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pgbo64_32)
+  %pgbo64_64 = call <WIDTH x i64>
+       @__pseudo_gather_base_offsets64_64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pgbo64_64)
+
+  %gbo64_8 = call <WIDTH x i8>
+       @__gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                  <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %gbo64_8)
+  %gbo64_16 = call <WIDTH x i16>
+       @__gather_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                   <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %gbo64_16)
+  %gbo64_32 = call <WIDTH x i32>
+       @__gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                   <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %gbo64_32)
+  %gbo64_64 = call <WIDTH x i64>
+       @__gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                   <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %gbo64_64)
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; scatters
+
+  call void @__pseudo_scatter32_8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter32_16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter32_32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter32_64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_scatter64_8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter64_16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter64_32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter64_64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter32_i8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter32_i16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter32_i32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter32_i64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter64_i8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter64_i16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter64_i32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_scatter_base_offsets32_8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets32_16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets32_32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets32_64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_scatter_base_offsets64_8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets64_16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets64_32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets64_64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  ret void
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops
 
@@ -1837,12 +2108,12 @@ ok:
 
 
 define void @__do_assert_varying(i8 *%str, <WIDTH x MASK> %test,
-                                          <WIDTH x MASK> %mask) {
+                                 <WIDTH x MASK> %mask) {
   %nottest = xor <WIDTH x MASK> %test,
                  < forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 >
   %nottest_and_mask = and <WIDTH x MASK> %nottest, %mask
-  %mm = call i32 @__movmsk(<WIDTH x MASK> %nottest_and_mask)
-  %all_ok = icmp eq i32 %mm, 0
+  %mm = call i64 @__movmsk(<WIDTH x MASK> %nottest_and_mask)
+  %all_ok = icmp eq i64 %mm, 0
   br i1 %all_ok, label %ok, label %fail
 
 fail:
@@ -2244,14 +2515,18 @@ define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x MASK> %mask) nounwind alwa
 ;; $4: alignment for elements of type $2 (4, 8, ...)
 
 define(`masked_load', `
-define <$1 x $2> @__masked_load_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
+define <$1 x $2> @__masked_load_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline {
 entry:
-  %mm = call i32 @__movmsk(<$1 x i32> %mask)
+  %mm = call i64 @__movmsk(<$1 x MASK> %mask)
   
   ; if the first lane and the last lane are on, then it is safe to do a vector load
   ; of the whole thing--what the lanes in the middle want turns out to not matter...
-  %mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
-  %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
+  %mm_and_low = and i64 %mm, 1
+  %mm_and_high = and i64 %mm, MASK_HIGH_BIT_ON
+  %mm_and_high_shift = lshr i64 %mm_and_high, eval(WIDTH-1)
+  %mm_and_low_i1 = trunc i64 %mm_and_low to i1
+  %mm_and_high_shift_i1 = trunc i64 %mm_and_high_shift to i1
+  %can_vload = and i1 %mm_and_low_i1, %mm_and_high_shift_i1
 
   %fast32 = call i32 @__fast_masked_vload()
   %fast_i1 = trunc i32 %fast32 to i1
@@ -2270,9 +2545,10 @@ load:
 loop:
   ; loop over the lanes and see if each one is on...
   %lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ]
-  %lanemask = shl i32 1, %lane
-  %mask_and = and i32 %mm, %lanemask
-  %do_lane = icmp ne i32 %mask_and, 0
+  %lane64 = zext i32 %lane to i64
+  %lanemask = shl i64 1, %lane64
+  %mask_and = and i64 %mm, %lanemask
+  %do_lane = icmp ne i64 %mask_and, 0
   br i1 %do_lane, label %load_lane, label %lane_done
 
 load_lane:
@@ -2484,12 +2760,12 @@ define(`packed_load_and_store', `
 define i32 @__packed_load_active(i32 * %startptr, <WIDTH x i32> * %val_ptr,
                                  <WIDTH x i32> %full_mask) nounwind alwaysinline {
 entry:
-  %mask = call i32 @__movmsk(<WIDTH x i32> %full_mask)
+  %mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
   %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
   br i1 %mask_known, label %known_mask, label %unknown_mask
 
 known_mask:
-  %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1)
+  %allon = icmp eq i64 %mask, ALL_ON_MASK
   br i1 %allon, label %all_on, label %unknown_mask
 
 all_on:
@@ -2505,12 +2781,12 @@ unknown_mask:
 
 loop:
   %lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
-  %lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
+  %lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
   %offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
 
   ; is the current lane on?
-  %and = and i32 %mask, %lanemask
-  %do_load = icmp eq i32 %and, %lanemask
+  %and = and i64 %mask, %lanemask
+  %do_load = icmp eq i64 %and, %lanemask
   br i1 %do_load, label %load, label %loopend 
 
 load:
@@ -2525,7 +2801,7 @@ load:
 loopend:
   %nextoffset = phi i32 [ %offset1, %load ], [ %offset, %loop ]
   %nextlane = add i32 %lane, 1
-  %nextlanemask = mul i32 %lanemask, 2
+  %nextlanemask = mul i64 %lanemask, 2
 
   ; are we done yet?
   %test = icmp ne i32 %nextlane, WIDTH
@@ -2536,14 +2812,14 @@ done:
 }
 
 define i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
-                                  <WIDTH x i32> %full_mask) nounwind alwaysinline {
+                                   <WIDTH x i32> %full_mask) nounwind alwaysinline {
 entry:
-  %mask = call i32 @__movmsk(<WIDTH x i32> %full_mask)
+  %mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
   %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
   br i1 %mask_known, label %known_mask, label %unknown_mask
 
 known_mask:
-  %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1)
+  %allon = icmp eq i64 %mask, ALL_ON_MASK
   br i1 %allon, label %all_on, label %unknown_mask
 
 all_on:
@@ -2556,12 +2832,12 @@ unknown_mask:
 
 loop:
   %lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
-  %lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
+  %lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
   %offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
 
   ; is the current lane on?
-  %and = and i32 %mask, %lanemask
-  %do_store = icmp eq i32 %and, %lanemask
+  %and = and i64 %mask, %lanemask
+  %do_store = icmp eq i64 %and, %lanemask
   br i1 %do_store, label %store, label %loopend 
 
 store:
@@ -2574,7 +2850,7 @@ store:
 loopend:
   %nextoffset = phi i32 [ %offset1, %store ], [ %offset, %loop ]
   %nextlane = add i32 %lane, 1
-  %nextlanemask = mul i32 %lanemask, 2
+  %nextlanemask = mul i64 %lanemask, 2
 
   ; are we done yet?
   %test = icmp ne i32 %nextlane, WIDTH
@@ -2598,14 +2874,15 @@ define(`reduce_equal_aux', `
 define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue,
                              <$1 x MASK> %mask) nounwind alwaysinline {
 entry:
-   %mm = call i32 @__movmsk(<$1 x MASK> %mask)
-   %allon = icmp eq i32 %mm, eval((1<<$1)-1)
+   %mm = call i64 @__movmsk(<$1 x MASK> %mask)
+   %allon = icmp eq i64 %mm, ALL_ON_MASK
    br i1 %allon, label %check_neighbors, label %domixed
 
 domixed:
   ; First, figure out which lane is the first active one
-  %first = call i32 @llvm.cttz.i32(i32 %mm)
-  %baseval = extractelement <$1 x $2> %v, i32 %first
+  %first = call i64 @llvm.cttz.i64(i64 %mm)
+  %first32 = trunc i64 %first to i32
+  %baseval = extractelement <$1 x $2> %v, i32 %first32
   %basev1 = bitcast $2 %baseval to <1 x $2>
   ; get a vector that is that value smeared across all elements
   %basesmear = shufflevector <1 x $2> %basev1, <1 x $2> undef,
@@ -2636,9 +2913,9 @@ check_neighbors:
   %eq = $5 eq <$1 x $2> %vec, %vr
   ifelse(MASK,i32, `
     %eq32 = sext <$1 x i1> %eq to <$1 x i32>
-    %eqmm = call i32 @__movmsk(<$1 x i32> %eq32)', `
-    %eqmm = call i32 @__movmsk(<$1 x MASK> %eq)')
-  %alleq = icmp eq i32 %eqmm, eval((1<<$1)-1)
+    %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', `
+    %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)')
+  %alleq = icmp eq i64 %eqmm, ALL_ON_MASK
   br i1 %alleq, label %all_equal, label %not_all_equal
   ', `
   ; But for 64-bit elements, it turns out to be more efficient to just
@@ -2751,14 +3028,14 @@ define(`per_lane', `
   br label %pl_entry
 
 pl_entry:
-  %pl_mask = call i32 @__movmsk($2)
+  %pl_mask = call i64 @__movmsk($2)
   %pl_mask_known = call i1 @__is_compile_time_constant_mask($2)
   br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask
 
 pl_known_mask:
   ;; the mask is known at compile time; see if it is something we can
   ;; handle more efficiently
-  %pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
+  %pl_is_allon = icmp eq i64 %pl_mask, ALL_ON_MASK
   br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask
 
 pl_all_on:
@@ -2780,11 +3057,11 @@ pl_unknown_mask:
 pl_loop:
   ;; Loop over each lane and see if we want to do the work for this lane
   %pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ]
-  %pl_lanemask = phi i32 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ]
+  %pl_lanemask = phi i64 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ]
 
   ; is the current lane on?  if so, goto do work, otherwise to end of loop
-  %pl_and = and i32 %pl_mask, %pl_lanemask
-  %pl_doit = icmp eq i32 %pl_and, %pl_lanemask
+  %pl_and = and i64 %pl_mask, %pl_lanemask
+  %pl_doit = icmp eq i64 %pl_and, %pl_lanemask
   br i1 %pl_doit, label %pl_dolane, label %pl_loopend 
 
 pl_dolane:
@@ -2795,7 +3072,7 @@ pl_dolane:
 
 pl_loopend:
   %pl_nextlane = add i32 %pl_lane, 1
-  %pl_nextlanemask = mul i32 %pl_lanemask, 2
+  %pl_nextlanemask = mul i64 %pl_lanemask, 2
 
   ; are we done yet?
   %pl_test = icmp ne i32 %pl_nextlane, $1
@@ -2880,11 +3157,11 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32
   %newDelta = load <$1 x i32> * %deltaPtr
 
   %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
-                                            i32 %offset_scale, <$1 x i32> %offset_delta,
+                                            i32 %offset_scale, <$1 x i32> %newDelta,
                                             <$1 x $2> undef, i32 0)
   forloop(lane, 1, eval($1-1), 
           `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, 
-                                <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %offset_delta,
+                                <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %newDelta,
                                 <$1 x $2> %retPREV, i32 LANE)
                     ', `LANE', lane), `PREV', eval(lane-1))')
   ret <$1 x $2> %ret`'eval($1-1)
diff --git a/cbackend.cpp b/cbackend.cpp
index b1a0a907..671a21ce 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -12,9 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifdef LLVM_2_9
-#warning "The C++ backend isn't supported when building with LLVM 2.9"
-#else
+#include <stdio.h>
 
 #ifndef _MSC_VER
 #include <inttypes.h>
@@ -339,8 +337,6 @@ namespace {
                            bool IsVolatile, unsigned Alignment);
 
   private :
-    std::string InterpretASMConstraint(InlineAsm::ConstraintInfo& c);
-
     void lowerIntrinsics(Function &F);
     /// Prints the definition of the intrinsic function F. Supports the 
     /// intrinsics which need to be explicitly defined in the CBackend.
@@ -363,7 +359,7 @@ namespace {
     bool printConstExprCast(const ConstantExpr *CE, bool Static);
     void printConstantArray(ConstantArray *CPA, bool Static);
     void printConstantVector(ConstantVector *CV, bool Static);
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
     void printConstantDataSequential(ConstantDataSequential *CDS, bool Static);
 #endif
 
@@ -440,11 +436,11 @@ namespace {
     void visitInvokeInst(InvokeInst &I) {
       llvm_unreachable("Lowerinvoke pass didn't work!");
     }
-#if !defined(LLVM_3_1) && !defined(LLVM_3_1svn)
+#ifdef LLVM_3_0
     void visitUnwindInst(UnwindInst &I) {
       llvm_unreachable("Lowerinvoke pass didn't work!");
     }
-#endif // !LLVM_3_1svn
+#endif // LLVM_3_0
     void visitResumeInst(ResumeInst &I) {
       llvm_unreachable("DwarfEHPrepare pass didn't work!");
     }
@@ -804,7 +800,7 @@ raw_ostream &CWriter::printType(raw_ostream &Out, Type *Ty,
 }
 
 void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
-#ifndef LLVM_3_1svn
+#ifdef LLVM_3_0
   Type *ETy = CPA->getType()->getElementType();
   // MMP: this looks like a bug: both sides of the || are the same
   bool isString = ETy == Type::getInt8Ty(CPA->getContext());
@@ -857,7 +853,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
     Out << "\"";
     return;
   }
-#endif // !LLVM_3_1
+#endif // LLVM_3_0
 
   printConstant(cast<Constant>(CPA->getOperand(0)), Static);
   for (unsigned i = 1, e = CPA->getNumOperands(); i != e; ++i) {
@@ -874,7 +870,7 @@ void CWriter::printConstantVector(ConstantVector *CP, bool Static) {
   }
 }
 
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
 void CWriter::printConstantDataSequential(ConstantDataSequential *CDS,
                                           bool Static) {
   // As a special case, print the array as a string if it is an array of
@@ -931,7 +927,21 @@ void CWriter::printConstantDataSequential(ConstantDataSequential *CDS,
     }
   }
 }
-#endif // LLVM_3_1svn
+#endif // !LLVM_3_0
+
+#ifndef LLVM_3_0
+// Formats V as text via raw_ostream's double printing.  Only IEEE double
+// and IEEE single are handled; anything else yields an error marker string.
+static inline std::string ftostr(const APFloat& V) {
+  const bool IsDouble = (&V.getSemantics() == &APFloat::IEEEdouble);
+  if (!IsDouble && &V.getSemantics() != &APFloat::IEEEsingle)
+    return "<unknown format in ftostr>"; // error
+  std::string Text;
+  raw_string_ostream Stream(Text);
+  Stream << (IsDouble ? V.convertToDouble() : (double)V.convertToFloat());
+  return Stream.str();
+}
+#endif // !LLVM_3_0
 
 // isFPCSafeToPrint - Returns true if we may assume that CFP may be written out
 // textually as a double (rather than as a reference to a stack-allocated
@@ -1084,6 +1094,26 @@ bool CWriter::printCast(unsigned opc, Type *SrcTy, Type *DstTy) {
   return false;
 }
 
+
+// FIXME: generalize this/make it not so hard-coded?
+static const char *lGetSmearFunc(Type *matchType) {
+    // Map a scalar element type to the name of its __smear_* helper;
+    // NULL means there is no helper for that type.
+    if (matchType->isFloatTy())  return "__smear_float";
+    if (matchType->isDoubleTy()) return "__smear_double";
+    IntegerType *intType = dyn_cast<IntegerType>(matchType);
+    if (intType == NULL) return NULL;
+    switch (intType->getBitWidth()) {
+    case 1:  return "__smear_i1";
+    case 8:  return "__smear_i8";
+    case 16: return "__smear_i16";
+    case 32: return "__smear_i32";
+    case 64: return "__smear_i64";
+    default: return NULL;
+    }
+}
+
+
 // printConstant - The LLVM Constant to C Constant converter.
 void CWriter::printConstant(Constant *CPV, bool Static) {
   if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) {
@@ -1400,11 +1430,11 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
     }
     if (ConstantArray *CA = dyn_cast<ConstantArray>(CPV)) {
       printConstantArray(CA, Static);
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
     } else if (ConstantDataSequential *CDS = 
                dyn_cast<ConstantDataSequential>(CPV)) {
       printConstantDataSequential(CDS, Static);
-#endif // LLVM_3_1svn
+#endif // !LLVM_3_0
     } else {
       assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
       if (AT->getNumElements()) {
@@ -1423,30 +1453,68 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
         Out << ")";
     break;
   }
-  case Type::VectorTyID:
-    printType(Out, CPV->getType());
-    Out << "(";
+  case Type::VectorTyID: {
+    VectorType *VT = dyn_cast<VectorType>(CPV->getType());
+    const char *smearFunc = lGetSmearFunc(VT->getElementType());
 
-    if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) {
-      printConstantVector(CV, Static);
-#ifdef LLVM_3_1svn
-    } else if (ConstantDataSequential *CDS = 
-               dyn_cast<ConstantDataSequential>(CPV)) {
-      printConstantDataSequential(CDS, Static);
-#endif
-    } else {
-      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
-      VectorType *VT = cast<VectorType>(CPV->getType());
+    if (isa<ConstantAggregateZero>(CPV)) {
+        assert(smearFunc != NULL);
+
+        Constant *CZ = Constant::getNullValue(VT->getElementType());
+        Out << smearFunc << "(";
+        printType(Out, VT);
+        Out << "(), ";
+        printConstant(CZ, Static);
+        Out << ")";
+    }
+    else if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) {
+      llvm::Constant *splatValue = CV->getSplatValue();
+      if (splatValue != NULL && smearFunc != NULL) {
+        Out << smearFunc << "(";
+        printType(Out, VT);
+        Out << "(), ";
+        printConstant(splatValue, Static);
+        Out << ")";
+      }
+      else {
+        printType(Out, CPV->getType());
+        Out << "(";
+        printConstantVector(CV, Static);
+        Out << ")";
+      }
+    }
+#ifndef LLVM_3_0
+    else if (ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(CPV)) {
+      llvm::Constant *splatValue = CDV->getSplatValue();
+      if (splatValue != NULL && smearFunc != NULL) {
+        Out << smearFunc << "(";
+        printType(Out, VT);
+        Out << "(), ";
+        printConstant(splatValue, Static);
+        Out << ")";
+      }
+      else {
+        printType(Out, CPV->getType());
+        Out << "(";
+        printConstantDataSequential(CDV, Static);
+        Out << ")";
+      }
+    }
+#endif // !LLVM_3_0
+    else {
+      assert(isa<UndefValue>(CPV));
       Constant *CZ = Constant::getNullValue(VT->getElementType());
+      printType(Out, CPV->getType());
+      Out << "(";
       printConstant(CZ, Static);
       for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) {
         Out << ", ";
         printConstant(CZ, Static);
       }
+      Out << ")";
     }
-    Out << ")";
     break;
-
+  }
   case Type::StructTyID:
     if (!Static) {
       // call init func...
@@ -1639,7 +1707,12 @@ std::string CWriter::GetValueName(const Value *Operand) {
       VarName += ch;
   }
 
-  return VarName + "_llvm_cbe";
+  if (isa<BasicBlock>(Operand))
+    VarName += "_label";
+  else
+    VarName += "_";
+
+  return VarName;
 }
 
 /// writeInstComputationInline - Emit the computation for the specified
@@ -2071,69 +2144,18 @@ bool CWriter::doInitialization(Module &M) {
 
   Out << "#include \"" << includeName << "\"\n";
 
-  generateCompilerSpecificCode(Out, TD);
-
-  // Function declarations
-  Out << "\n/* Function Declarations */\n";
+  Out << "\n/* Basic Library Function Declarations */\n";
   Out << "extern \"C\" {\n";
   Out << "int puts(unsigned char *);\n";
   Out << "unsigned int putchar(unsigned int);\n";
   Out << "int fflush(void *);\n";
   Out << "int printf(const unsigned char *, ...);\n";
   Out << "uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t );\n";
+  Out << "uint8_t *memset(uint8_t *, uint8_t, uint64_t );\n";
+  Out << "void memset_pattern16(void *, const void *, uint64_t );\n";
+  Out << "}\n\n";
 
-  // Store the intrinsics which will be declared/defined below.
-  SmallVector<const Function*, 8> intrinsicsToDefine;
-
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
-    // Don't print declarations for intrinsic functions.
-    // Store the used intrinsics, which need to be explicitly defined.
-    if (I->isIntrinsic()) {
-      switch (I->getIntrinsicID()) {
-        default:
-          break;
-        case Intrinsic::uadd_with_overflow:
-        case Intrinsic::sadd_with_overflow:
-          intrinsicsToDefine.push_back(I);
-          break;
-      }
-      continue;
-    }
-
-    if (I->getName() == "setjmp" || I->getName() == "abort" ||
-        I->getName() == "longjmp" || I->getName() == "_setjmp" ||
-        I->getName() == "memset" || I->getName() == "memset_pattern16" ||
-        I->getName() == "puts" ||
-        I->getName() == "printf" || I->getName() == "putchar" ||
-        I->getName() == "fflush" || I->getName() == "malloc" ||
-        I->getName() == "free")
-      continue;
-
-    // Don't redeclare ispc's own intrinsics
-    std::string name = I->getName();
-    if (name.size() > 2 && name[0] == '_' && name[1] == '_')
-        continue;
-
-    if (I->hasExternalWeakLinkage())
-      Out << "extern ";
-    printFunctionSignature(I, true);
-    if (I->hasWeakLinkage() || I->hasLinkOnceLinkage())
-      Out << " __ATTRIBUTE_WEAK__";
-    if (I->hasExternalWeakLinkage())
-      Out << " __EXTERNAL_WEAK__";
-    if (StaticCtors.count(I))
-      Out << " __ATTRIBUTE_CTOR__";
-    if (StaticDtors.count(I))
-      Out << " __ATTRIBUTE_DTOR__";
-    if (I->hasHiddenVisibility())
-      Out << " __HIDDEN__";
-
-    if (I->hasName() && I->getName()[0] == 1)
-      Out << " LLVM_ASM(\"" << I->getName().substr(1) << "\")";
-
-    Out << ";\n";
-  }
-  Out << "}\n";
+  generateCompilerSpecificCode(Out, TD);
 
   // Provide a definition for `bool' if not compiling with a C++ compiler.
   Out << "\n"
@@ -2240,6 +2262,106 @@ bool CWriter::doInitialization(Module &M) {
       }
   }
 
+  // Function declarations
+  Out << "\n/* Function Declarations */\n";
+  Out << "extern \"C\" {\n";
+
+  // Store the intrinsics which will be declared/defined below.
+  SmallVector<const Function*, 8> intrinsicsToDefine;
+
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    // Don't print declarations for intrinsic functions.
+    // Store the used intrinsics, which need to be explicitly defined.
+    if (I->isIntrinsic()) {
+      switch (I->getIntrinsicID()) {
+        default:
+          break;
+        case Intrinsic::uadd_with_overflow:
+        case Intrinsic::sadd_with_overflow:
+          intrinsicsToDefine.push_back(I);
+          break;
+      }
+      continue;
+    }
+
+    if (I->getName() == "setjmp" || I->getName() == "abort" ||
+        I->getName() == "longjmp" || I->getName() == "_setjmp" ||
+        I->getName() == "memset" || I->getName() == "memset_pattern16" ||
+        I->getName() == "puts" ||
+        I->getName() == "printf" || I->getName() == "putchar" ||
+        I->getName() == "fflush" || I->getName() == "malloc" ||
+        I->getName() == "free")
+      continue;
+
+    // Don't redeclare ispc's own intrinsics
+    std::string name = I->getName();
+    if (name.size() > 2 && name[0] == '_' && name[1] == '_')
+        continue;
+
+    if (I->hasExternalWeakLinkage())
+      Out << "extern ";
+    printFunctionSignature(I, true);
+    if (I->hasWeakLinkage() || I->hasLinkOnceLinkage())
+      Out << " __ATTRIBUTE_WEAK__";
+    if (I->hasExternalWeakLinkage())
+      Out << " __EXTERNAL_WEAK__";
+    if (StaticCtors.count(I))
+      Out << " __ATTRIBUTE_CTOR__";
+    if (StaticDtors.count(I))
+      Out << " __ATTRIBUTE_DTOR__";
+    if (I->hasHiddenVisibility())
+      Out << " __HIDDEN__";
+
+    if (I->hasName() && I->getName()[0] == 1)
+      Out << " LLVM_ASM(\"" << I->getName().substr(1) << "\")";
+
+    Out << ";\n";
+  }
+  Out << "}\n\n";
+
+  if (!M.empty())
+    Out << "\n\n/* Function Bodies */\n";
+
+  // Emit some helper functions for dealing with FCMP instruction's
+  // predicates
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ord(A X, B Y) { ";
+  Out << "return X == X && Y == Y; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_uno(A X, B Y) { ";
+  Out << "return X != X || Y != Y; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ueq(A X, B Y) { ";
+  Out << "return X == Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_une(A X, B Y) { ";
+  Out << "return X != Y; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ult(A X, B Y) { ";
+  Out << "return X <  Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ugt(A X, B Y) { ";
+  Out << "return X >  Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ule(A X, B Y) { ";
+  Out << "return X <= Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_uge(A X, B Y) { ";
+  Out << "return X >= Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_oeq(A X, B Y) { ";
+  Out << "return X == Y ; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_one(A X, B Y) { ";
+  Out << "return X != Y && llvm_fcmp_ord(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_olt(A X, B Y) { ";
+  Out << "return X <  Y ; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ogt(A X, B Y) { ";
+  Out << "return X >  Y ; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ole(A X, B Y) { ";
+  Out << "return X <= Y ; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_oge(A X, B Y) { ";
+  Out << "return X >= Y ; }\n";
+  Out << "template <typename A> A *Memset(A *ptr, int count, size_t len) { ";
+  Out << "return (A *)memset(ptr, count, len); }\n";
+
+  // Emit definitions of the intrinsics.
+  for (SmallVector<const Function*, 8>::const_iterator
+       I = intrinsicsToDefine.begin(),
+       E = intrinsicsToDefine.end(); I != E; ++I) {
+    printIntrinsicDefinition(**I, Out);
+  }
+
   // Output the global variable definitions and contents...
   if (!M.global_empty()) {
     Out << "\n\n/* Global Variable Definitions and Initialization */\n";
@@ -2303,49 +2425,6 @@ bool CWriter::doInitialization(Module &M) {
       }
   }
 
-  if (!M.empty())
-    Out << "\n\n/* Function Bodies */\n";
-
-  // Emit some helper functions for dealing with FCMP instruction's
-  // predicates
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ord(A X, B Y) { ";
-  Out << "return X == X && Y == Y; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_uno(A X, B Y) { ";
-  Out << "return X != X || Y != Y; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ueq(A X, B Y) { ";
-  Out << "return X == Y || llvm_fcmp_uno(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_une(A X, B Y) { ";
-  Out << "return X != Y; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ult(A X, B Y) { ";
-  Out << "return X <  Y || llvm_fcmp_uno(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ugt(A X, B Y) { ";
-  Out << "return X >  Y || llvm_fcmp_uno(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ule(A X, B Y) { ";
-  Out << "return X <= Y || llvm_fcmp_uno(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_uge(A X, B Y) { ";
-  Out << "return X >= Y || llvm_fcmp_uno(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_oeq(A X, B Y) { ";
-  Out << "return X == Y ; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_one(A X, B Y) { ";
-  Out << "return X != Y && llvm_fcmp_ord(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_olt(A X, B Y) { ";
-  Out << "return X <  Y ; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ogt(A X, B Y) { ";
-  Out << "return X >  Y ; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ole(A X, B Y) { ";
-  Out << "return X <= Y ; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_oge(A X, B Y) { ";
-  Out << "return X >= Y ; }\n";
-  Out << "template <typename A> A *Memset(A *ptr, int count, size_t len) { ";
-  Out << "return (A *)memset(ptr, count, len); }\n";
-
-  // Emit definitions of the intrinsics.
-  for (SmallVector<const Function*, 8>::const_iterator
-       I = intrinsicsToDefine.begin(),
-       E = intrinsicsToDefine.end(); I != E; ++I) {
-    printIntrinsicDefinition(**I, Out);
-  }
-
   return false;
 }
 
@@ -2823,17 +2902,17 @@ void CWriter::visitSwitchInst(SwitchInst &SI) {
   printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
   Out << ";\n";
 
-#ifdef LLVM_3_1svn
-  for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
-    ConstantInt* CaseVal = i.getCaseValue();
-    BasicBlock* Succ = i.getCaseSuccessor();
-#else
+#ifdef LLVM_3_0
   // Skip the first item since that's the default case.
   unsigned NumCases = SI.getNumCases();
   for (unsigned i = 1; i < NumCases; ++i) {
     ConstantInt* CaseVal = SI.getCaseValue(i);
     BasicBlock* Succ = SI.getSuccessor(i);
-#endif // LLVM_3_1svn
+#else
+  for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
+    ConstantInt* CaseVal = i.getCaseValue();
+    BasicBlock* Succ = i.getCaseSuccessor();
+#endif // !LLVM_3_0
     Out << "  case ";
     writeOperand(CaseVal);
     Out << ":\n";
@@ -3401,6 +3480,7 @@ void CWriter::lowerIntrinsics(Function &F) {
           case Intrinsic::ppc_altivec_lvsl:
           case Intrinsic::uadd_with_overflow:
           case Intrinsic::sadd_with_overflow:
+          case Intrinsic::trap:
               // We directly implement these intrinsics
             break;
           default:
@@ -3568,7 +3648,9 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID,
     // If this is an intrinsic that directly corresponds to a GCC
     // builtin, we emit it here.
     const char *BuiltinName = "";
+#ifdef LLVM_3_0
     Function *F = I.getCalledFunction();
+#endif // LLVM_3_0
 #define GET_GCC_BUILTIN_NAME
 #include "llvm/Intrinsics.gen"
 #undef GET_GCC_BUILTIN_NAME
@@ -3711,184 +3793,17 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID,
     writeOperand(I.getArgOperand(1));
     Out << ")";
     return true;
+  case Intrinsic::trap:
+    Out << "abort()";
+    return true;
   }
 }
 
-//This converts the llvm constraint string to something gcc is expecting.
-//TODO: work out platform independent constraints and factor those out
-//      of the per target tables
-//      handle multiple constraint codes
-std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
-  assert(c.Codes.size() == 1 && "Too many asm constraint codes to handle");
-
-  // Grab the translation table from MCAsmInfo if it exists.
-  const MCAsmInfo *TargetAsm;
-  std::string Triple = TheModule->getTargetTriple();
-  if (Triple.empty())
-#if defined(LLVM_3_1) || defined(LLVM_3_1svn)
-    Triple = llvm::sys::getDefaultTargetTriple();
-#else
-    Triple = llvm::sys::getHostTriple();
-#endif
-
-  std::string E;
-  if (const llvm::Target *Match = TargetRegistry::lookupTarget(Triple, E))
-    TargetAsm = Match->createMCAsmInfo(Triple);
-  else
-    return c.Codes[0];
-
-  const char *const *table = TargetAsm->getAsmCBE();
-
-  // Search the translation table if it exists.
-  for (int i = 0; table && table[i]; i += 2)
-    if (c.Codes[0] == table[i]) {
-      delete TargetAsm;
-      return table[i+1];
-    }
-
-  // Default is identity.
-  delete TargetAsm;
-  return c.Codes[0];
-}
-
-//TODO: import logic from AsmPrinter.cpp
-static std::string gccifyAsm(std::string asmstr) {
-  for (std::string::size_type i = 0; i != asmstr.size(); ++i)
-    if (asmstr[i] == '\n')
-      asmstr.replace(i, 1, "\\n");
-    else if (asmstr[i] == '\t')
-      asmstr.replace(i, 1, "\\t");
-    else if (asmstr[i] == '$') {
-      if (asmstr[i + 1] == '{') {
-        std::string::size_type a = asmstr.find_first_of(':', i + 1);
-        std::string::size_type b = asmstr.find_first_of('}', i + 1);
-        std::string n = "%" +
-          asmstr.substr(a + 1, b - a - 1) +
-          asmstr.substr(i + 2, a - i - 2);
-        asmstr.replace(i, b - i + 1, n);
-        i += n.size() - 1;
-      } else
-        asmstr.replace(i, 1, "%");
-    }
-    else if (asmstr[i] == '%')//grr
-      { asmstr.replace(i, 1, "%%"); ++i;}
-
-  return asmstr;
-}
 
 //TODO: assumptions about what consume arguments from the call are likely wrong
 //      handle communitivity
 void CWriter::visitInlineAsm(CallInst &CI) {
-  InlineAsm* as = cast<InlineAsm>(CI.getCalledValue());
-  InlineAsm::ConstraintInfoVector Constraints = as->ParseConstraints();
-
-  std::vector<std::pair<Value*, int> > ResultVals;
-  if (CI.getType() == Type::getVoidTy(CI.getContext()))
-    ;
-  else if (StructType *ST = dyn_cast<StructType>(CI.getType())) {
-    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i)
-      ResultVals.push_back(std::make_pair(&CI, (int)i));
-  } else {
-    ResultVals.push_back(std::make_pair(&CI, -1));
-  }
-
-  // Fix up the asm string for gcc and emit it.
-  Out << "__asm__ volatile (\"" << gccifyAsm(as->getAsmString()) << "\"\n";
-  Out << "        :";
-
-  unsigned ValueCount = 0;
-  bool IsFirst = true;
-
-  // Convert over all the output constraints.
-  for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-       E = Constraints.end(); I != E; ++I) {
-
-    if (I->Type != InlineAsm::isOutput) {
-      ++ValueCount;
-      continue;  // Ignore non-output constraints.
-    }
-
-    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
-    std::string C = InterpretASMConstraint(*I);
-    if (C.empty()) continue;
-
-    if (!IsFirst) {
-      Out << ", ";
-      IsFirst = false;
-    }
-
-    // Unpack the dest.
-    Value *DestVal;
-    int DestValNo = -1;
-
-    if (ValueCount < ResultVals.size()) {
-      DestVal = ResultVals[ValueCount].first;
-      DestValNo = ResultVals[ValueCount].second;
-    } else
-      DestVal = CI.getArgOperand(ValueCount-ResultVals.size());
-
-    if (I->isEarlyClobber)
-      C = "&"+C;
-
-    Out << "\"=" << C << "\"(" << GetValueName(DestVal);
-    if (DestValNo != -1)
-      Out << ".field" << DestValNo; // Multiple retvals.
-    Out << ")";
-    ++ValueCount;
-  }
-
-
-  // Convert over all the input constraints.
-  Out << "\n        :";
-  IsFirst = true;
-  ValueCount = 0;
-  for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-       E = Constraints.end(); I != E; ++I) {
-    if (I->Type != InlineAsm::isInput) {
-      ++ValueCount;
-      continue;  // Ignore non-input constraints.
-    }
-
-    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
-    std::string C = InterpretASMConstraint(*I);
-    if (C.empty()) continue;
-
-    if (!IsFirst) {
-      Out << ", ";
-      IsFirst = false;
-    }
-
-    assert(ValueCount >= ResultVals.size() && "Input can't refer to result");
-    Value *SrcVal = CI.getArgOperand(ValueCount-ResultVals.size());
-
-    Out << "\"" << C << "\"(";
-    if (!I->isIndirect)
-      writeOperand(SrcVal);
-    else
-      writeOperandDeref(SrcVal);
-    Out << ")";
-  }
-
-  // Convert over the clobber constraints.
-  IsFirst = true;
-  for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-       E = Constraints.end(); I != E; ++I) {
-    if (I->Type != InlineAsm::isClobber)
-      continue;  // Ignore non-input constraints.
-
-    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
-    std::string C = InterpretASMConstraint(*I);
-    if (C.empty()) continue;
-
-    if (!IsFirst) {
-      Out << ", ";
-      IsFirst = false;
-    }
-
-    Out << '\"' << C << '"';
-  }
-
-  Out << ")";
+  assert(!"Inline assembly not supported");
 }
 
 void CWriter::visitAllocaInst(AllocaInst &I) {
@@ -4240,14 +4155,14 @@ void CWriter::visitAtomicCmpXchgInst(AtomicCmpXchgInst &ACXI) {
 
 class SmearCleanupPass : public llvm::BasicBlockPass {
 public:
-    SmearCleanupPass(llvm::Module *m, int width)
+    SmearCleanupPass(Module *m, int width)
         : BasicBlockPass(ID) { module = m; vectorWidth = width; }
 
     const char *getPassName() const { return "Smear Cleanup Pass"; }
     bool runOnBasicBlock(llvm::BasicBlock &BB);
 
     static char ID;
-    llvm::Module *module;
+    Module *module;
     int vectorWidth;
 };
 
@@ -4303,41 +4218,28 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         assert(toMatch != NULL);
 
         {
-        // FIXME: generalize this/make it not so hard-coded?
         Type *matchType = toMatch->getType();
-        const char *smearFuncName = NULL;
-
-        switch (matchType->getTypeID()) {
-        case Type::FloatTyID:  smearFuncName = "__smear_float"; break;
-        case Type::DoubleTyID: smearFuncName = "__smear_double"; break;
-        case Type::IntegerTyID: {
-            switch (cast<IntegerType>(matchType)->getBitWidth()) {
-            case 8:  smearFuncName = "__smear_i8";  break;
-            case 16: smearFuncName = "__smear_i16"; break;
-            case 32: smearFuncName = "__smear_i32"; break;
-            case 64: smearFuncName = "__smear_i64"; break;
-            }
-        }
-        default: break;
-        }
+        const char *smearFuncName = lGetSmearFunc(matchType);
 
         if (smearFuncName != NULL) {
             Function *smearFunc = module->getFunction(smearFuncName);
             if (smearFunc == NULL) {
                 Constant *sf = 
                     module->getOrInsertFunction(smearFuncName, iter->getType(), 
-                                                matchType, NULL);
+                                                iter->getType(), matchType, NULL);
                 smearFunc = dyn_cast<Function>(sf);
                 assert(smearFunc != NULL);
                 smearFunc->setDoesNotThrow(true);
                 smearFunc->setDoesNotAccessMemory(true);
             }
-                
+
+            llvm::Value *undefResult = llvm::UndefValue::get(vt);
             assert(smearFunc != NULL);
-            Value *args[1] = { toMatch };
-            ArrayRef<llvm::Value *> argArray(&args[0], &args[1]);
+            Value *args[2] = { undefResult, toMatch };
+            ArrayRef<llvm::Value *> argArray(&args[0], &args[2]);
             Instruction *smearCall = 
-                CallInst::Create(smearFunc, argArray, "smear", (Instruction *)NULL);
+                CallInst::Create(smearFunc, argArray, LLVMGetName(toMatch, "_smear"),
+                                 (Instruction *)NULL);
 
             ReplaceInstWithInst(iter, smearCall);
 
@@ -4401,6 +4303,155 @@ BitcastCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
     return modifiedAny;
 }
 
+///////////////////////////////////////////////////////////////////////////
+// MaskOpsCleanupPass
+
+/** This pass does various peephole improvements to mask modification
+    operations.  It converts mask XORs with "all true" into calls to
+    __not(), and rewrites and(not(a), b) as __and_not1(a, b) and
+    and(a, not(b)) as __and_not2(a, b).  The target is expected to end up
+    providing these three functions.
+ */
+class MaskOpsCleanupPass : public llvm::BasicBlockPass {
+public:
+    MaskOpsCleanupPass(Module *m)
+        : BasicBlockPass(ID) { 
+        // Declare the __not, __and_not1, and __and_not2 functions that we
+        // expect the target to end up providing.
+        notFunc        = lDeclareMaskFunc(m, "__not", 1);
+        andNotFuncs[0] = lDeclareMaskFunc(m, "__and_not1", 2);
+        andNotFuncs[1] = lDeclareMaskFunc(m, "__and_not2", 2);
+    }
+
+    const char *getPassName() const { return "MaskOps Cleanup Pass"; }
+    bool runOnBasicBlock(llvm::BasicBlock &BB);
+
+private:
+    /** Gets or inserts in m a function called name that takes numArgs
+        (1 or 2) mask-typed arguments and returns a mask, and gives it
+        the NoUnwind and ReadNone attributes. */
+    static Function *lDeclareMaskFunc(Module *m, const char *name,
+                                      int numArgs) {
+        Type *mt = LLVMTypes::MaskType;
+        Constant *decl = (numArgs == 1) ?
+            m->getOrInsertFunction(name, mt, mt, NULL) :
+            m->getOrInsertFunction(name, mt, mt, mt, NULL);
+
+        Function *func = dyn_cast<Function>(decl);
+        assert(func != NULL);
+        func->addFnAttr(Attribute::NoUnwind);
+        func->addFnAttr(Attribute::ReadNone);
+        return func;
+    }
+
+    Value *lGetNotOperand(Value *v) const;
+
+    // __not, and __and_not1 / __and_not2 at indices 0 / 1.
+    Function *notFunc, *andNotFuncs[2];
+
+    static char ID;
+};
+
+char MaskOpsCleanupPass::ID = 0;
+
+
+/** Returns true if the given value is a compile-time constant vector
+    whose elements are all one and the same ConstantInt, and that
+    ConstantInt's isOne() holds. */
+static bool
+lIsAllTrue(Value *v) {
+    // Fetch the splat (shared element) from whichever constant-vector
+    // representation v uses; it stays NULL for any other kind of value.
+    Constant *splat = NULL;
+    if (ConstantVector *cv = dyn_cast<ConstantVector>(v)) {
+        splat = cv->getSplatValue();
+    }
+#ifndef LLVM_3_0
+    else if (ConstantDataVector *cdv = dyn_cast<ConstantDataVector>(v)) {
+        splat = cdv->getSplatValue();
+    }
+#endif
+
+    if (splat == NULL)
+        return false;
+
+    ConstantInt *ci = dyn_cast<ConstantInt>(splat);
+    return ci != NULL && ci->isOne();
+}
+
+
+/** If v computes the NOT of some value--either through a direct call to
+    __not() or as an XOR whose second operand is an all-true constant
+    vector--returns the value being negated.  Returns NULL otherwise. */
+Value *
+MaskOpsCleanupPass::lGetNotOperand(Value *v) const {
+    CallInst *call = dyn_cast<CallInst>(v);
+    if (call != NULL && call->getCalledFunction() == notFunc)
+        // Direct call to __not()
+        return call->getArgOperand(0);
+
+    BinaryOperator *xorOp = dyn_cast<BinaryOperator>(v);
+    if (xorOp == NULL || xorOp->getOpcode() != Instruction::Xor)
+        return NULL;
+    // XOR of all-true vector.
+    if (lIsAllTrue(xorOp->getOperand(1)))
+        return xorOp->getOperand(0);
+    return NULL;
+}
+
+
+/** Rewrites mask XORs with all-true into __not() calls and ANDs of a NOT
+    into __and_not1()/__and_not2() calls; returns true if bb changed. */
+bool
+MaskOpsCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    bool modifiedAny = false;
+
+ restart:
+    for (BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
+        BinaryOperator *bop = dyn_cast<BinaryOperator>(&*iter);
+        if (bop == NULL || bop->getType() != LLVMTypes::MaskType)
+            continue;
+
+        if (bop->getOpcode() == Instruction::Xor) {
+            // Check for XOR with all-true values
+            if (lIsAllTrue(bop->getOperand(1))) {
+                // ArrayRef keeps a pointer to its element: use a named local.
+                Value *notArg = bop->getOperand(0);
+                ArrayRef<Value *> arg(notArg);
+                CallInst *notCall = CallInst::Create(notFunc, arg, bop->getName());
+                ReplaceInstWithInst(iter, notCall);
+                modifiedAny = true;
+                goto restart;
+            }
+        }
+        else if (bop->getOpcode() == Instruction::And) {
+            // Check each of the operands to see if they have NOT applied
+            // to them.
+            for (int i = 0; i < 2; ++i) {
+                if (Value *notOp = lGetNotOperand(bop->getOperand(i))) {
+                    // In notOp we have the target of the NOT operation;
+                    // put it in its appropriate spot in the operand array.
+                    // Copy in the other operand directly.
+                    Value *args[2];
+                    args[i]     = notOp;
+                    args[i ^ 1] = bop->getOperand(i ^ 1);
+                    ArrayRef<Value *> argsRef(&args[0], 2);
+
+                    // Call the appropriate __and_not* function.
+                    CallInst *andNotCall = 
+                        CallInst::Create(andNotFuncs[i], argsRef, bop->getName());
+
+                    ReplaceInstWithInst(iter, andNotCall);
+                    modifiedAny = true;
+                    goto restart;
+                }
+            }
+        }
+    }
+
+    return modifiedAny;
+}
+
 
 //===----------------------------------------------------------------------===//
 //                       External Interface declaration
@@ -4432,6 +4483,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
     pm.add(createCFGSimplificationPass());   // clean up after lower invoke.
     pm.add(new SmearCleanupPass(module, vectorWidth));
     pm.add(new BitcastCleanupPass);
+    pm.add(new MaskOpsCleanupPass(module));
     pm.add(createDeadCodeEliminationPass()); // clean up after smear pass
 //CO    pm.add(createPrintModulePass(&fos));
     pm.add(new CWriter(fos, includeName, vectorWidth));
@@ -4442,5 +4494,3 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
 
     return true;
 }
-
-#endif // LLVM_2_9
diff --git a/contrib/ispc.vim b/contrib/ispc.vim
index 5c178c0f..cc8493f0 100644
--- a/contrib/ispc.vim
+++ b/contrib/ispc.vim
@@ -17,7 +17,7 @@ syn keyword	ispcStatement	cbreak ccontinue creturn launch print reference soa sy
 syn keyword	ispcConditional	cif
 syn keyword	ispcRepeat	cdo cfor cwhile
 syn keyword	ispcBuiltin	programCount programIndex	
-syn keyword	ispcType	export int8 int16 int32 int64
+syn keyword	ispcType	export uniform varying int8 int16 int32 int64
 
 " Default highlighting
 command -nargs=+ HiLink hi def link <args>
diff --git a/contrib/ispc.vim.README b/contrib/ispc.vim.README
new file mode 100644
index 00000000..fd33df09
--- /dev/null
+++ b/contrib/ispc.vim.README
@@ -0,0 +1,8 @@
+To install vim syntax highlighting for ispc files:
+
+1) Copy ispc.vim to ~/.vim/syntax/ispc.vim (create the directory if necessary).
+2) Create a filetype for ispc files that corresponds to that syntax file.
+   To do this, append the following line to ~/.vim/ftdetect/ispc.vim (creating the file if necessary):
+
+au BufRead,BufNewFile *.ispc set filetype=ispc
+
diff --git a/ctx.cpp b/ctx.cpp
index 5f5258e8..11957ae2 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -251,7 +251,7 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
     if (!returnType || Type::Equal(returnType, AtomicType::Void))
         returnValuePtr = NULL;
     else {
-        LLVM_TYPE_CONST llvm::Type *ftype = returnType->LLVMType(g->ctx);
+        llvm::Type *ftype = returnType->LLVMType(g->ctx);
         returnValuePtr = AllocaInst(ftype, "return_value_memory");
     }
 
@@ -284,7 +284,7 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
             llvm::Constant *offFunc = 
                 m->module->getOrInsertFunction(buf, LLVMTypes::VoidType,
                                                NULL);
-            Assert(llvm::isa<llvm::Function>(offFunc));
+            AssertPos(currentPos, llvm::isa<llvm::Function>(offFunc));
             llvm::BasicBlock *offBB = 
                    llvm::BasicBlock::Create(*g->ctx, "entry", 
                                             (llvm::Function *)offFunc, 0);
@@ -297,50 +297,55 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
     }
 
     if (m->diBuilder) {
+        currentPos = funSym->pos;
+
         /* If debugging is enabled, tell the debug information emission
            code about this new function */
         diFile = funcStartPos.GetDIFile();
-        llvm::DIType retType = function->GetReturnType()->GetDIType(diFile);
-        int flags = llvm::DIDescriptor::FlagPrototyped; // ??
-        diFunction = m->diBuilder->createFunction(diFile, /* scope */
-                                                  llvmFunction->getName(), // mangled
-                                                  funSym->name,
-                                                  diFile,
-                                                  funcStartPos.first_line,
-                                                  retType,
-                                                  funSym->storageClass == SC_STATIC,
-                                                  true, /* is definition */
-                                                  flags,
-                                                  g->opt.level > 0,
-                                                  llvmFunction);
+        AssertPos(currentPos, diFile.Verify());
+
+        llvm::DIScope scope = llvm::DIScope(m->diBuilder->getCU());
+        AssertPos(currentPos, scope.Verify());
+
+        const FunctionType *functionType = function->GetType();
+        llvm::DIType diSubprogramType;
+        if (functionType == NULL)
+            AssertPos(currentPos, m->errorCount > 0);
+        else {
+            diSubprogramType = functionType->GetDIType(scope);
+            AssertPos(currentPos, diSubprogramType.Verify());
+        }
+
+        std::string mangledName = llvmFunction->getName();
+        if (mangledName == funSym->name)
+            mangledName = "";
+
+        bool isStatic = (funSym->storageClass == SC_STATIC);
+        bool isOptimized = (g->opt.level > 0);
+        int firstLine = funcStartPos.first_line;
+        int flags =  (llvm::DIDescriptor::FlagPrototyped);
+
+        diSubprogram = 
+            m->diBuilder->createFunction(diFile /* scope */, funSym->name,
+                                         mangledName,        diFile,
+                                         firstLine,          diSubprogramType,
+                                         isStatic,           true, /* is defn */
+#ifndef LLVM_3_0
+                                         firstLine,
+#endif // !LLVM_3_0
+                                         flags,
+                                         isOptimized,        llvmFunction);
+        AssertPos(currentPos, diSubprogram.Verify());
+
         /* And start a scope representing the initial function scope */
         StartScope();
-
-        llvm::DIFile file = funcStartPos.GetDIFile();
-        Symbol *programIndexSymbol = m->symbolTable->LookupVariable("programIndex");
-        Assert(programIndexSymbol && programIndexSymbol->storagePtr);
-        m->diBuilder->createGlobalVariable(programIndexSymbol->name, 
-                                           file,
-                                           funcStartPos.first_line,
-                                           programIndexSymbol->type->GetDIType(file),
-                                           true /* static */,
-                                           programIndexSymbol->storagePtr);
-
-        Symbol *programCountSymbol = m->symbolTable->LookupVariable("programCount");
-        Assert(programCountSymbol);
-        m->diBuilder->createGlobalVariable(programCountSymbol->name, 
-                                           file,
-                                           funcStartPos.first_line,
-                                           programCountSymbol->type->GetDIType(file),
-                                           true /* static */,
-                                           programCountSymbol->storagePtr);
     }
 }
 
 
 FunctionEmitContext::~FunctionEmitContext() {
-    Assert(controlFlowInfo.size() == 0);
-    Assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
+    AssertPos(currentPos, controlFlowInfo.size() == 0);
+    AssertPos(currentPos, debugScopes.size() == (m->diBuilder ? 1 : 0));
 }
 
 
@@ -434,7 +439,7 @@ FunctionEmitContext::SetInternalMaskAndNot(llvm::Value *oldMask, llvm::Value *te
 
 void
 FunctionEmitContext::BranchIfMaskAny(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) {
-    Assert(bblock != NULL);
+    AssertPos(currentPos, bblock != NULL);
     llvm::Value *any = Any(GetFullMask());
     BranchInst(btrue, bfalse, any);
     // It's illegal to add any additional instructions to the basic block
@@ -445,7 +450,7 @@ FunctionEmitContext::BranchIfMaskAny(llvm::BasicBlock *btrue, llvm::BasicBlock *
 
 void
 FunctionEmitContext::BranchIfMaskAll(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) {
-    Assert(bblock != NULL);
+    AssertPos(currentPos, bblock != NULL);
     llvm::Value *all = All(GetFullMask());
     BranchInst(btrue, bfalse, all);
     // It's illegal to add any additional instructions to the basic block
@@ -456,7 +461,7 @@ FunctionEmitContext::BranchIfMaskAll(llvm::BasicBlock *btrue, llvm::BasicBlock *
 
 void
 FunctionEmitContext::BranchIfMaskNone(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) {
-    Assert(bblock != NULL);
+    AssertPos(currentPos, bblock != NULL);
     // switch sense of true/false bblocks
     BranchIfMaskAny(bfalse, btrue);
     // It's illegal to add any additional instructions to the basic block
@@ -481,7 +486,7 @@ void
 FunctionEmitContext::EndIf() {
     CFInfo *ci = popCFState();
     // Make sure we match up with a Start{Uniform,Varying}If().
-    Assert(ci->IsIf());
+    AssertPos(currentPos, ci->IsIf());
 
     // 'uniform' ifs don't change the mask so we only need to restore the
     // mask going into the if for 'varying' if statements
@@ -570,7 +575,7 @@ FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct,
 void
 FunctionEmitContext::EndLoop() {
     CFInfo *ci = popCFState();
-    Assert(ci->IsLoop());
+    AssertPos(currentPos, ci->IsLoop());
 
     if (!ci->IsUniform())
         // If the loop had a 'uniform' test, then it didn't make any
@@ -604,7 +609,7 @@ FunctionEmitContext::StartForeach() {
 void
 FunctionEmitContext::EndForeach() {
     CFInfo *ci = popCFState();
-    Assert(ci->IsForeach());
+    AssertPos(currentPos, ci->IsForeach());
 }
 
 
@@ -649,7 +654,7 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
               "for/while/do loops and \"switch\" statements.");
         return;
     }
-    Assert(controlFlowInfo.size() > 0);
+    AssertPos(currentPos, controlFlowInfo.size() > 0);
 
     if (bblock == NULL)
         return;
@@ -659,7 +664,7 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
         ifsInCFAllUniform(CFInfo::Switch)) {
         // We know that all program instances are executing the break, so
         // just jump to the block immediately after the switch.
-        Assert(breakTarget != NULL);
+        AssertPos(currentPos, breakTarget != NULL);
         BranchInst(breakTarget);
         bblock = NULL;
         return;
@@ -684,7 +689,7 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
         // break.  In these cases, we need to update the mask of the lanes
         // that have executed a 'break' statement: 
         // breakLanes = breakLanes | mask
-        Assert(breakLanesPtr != NULL);
+        AssertPos(currentPos, breakLanesPtr != NULL);
         llvm::Value *mask = GetInternalMask();
         llvm::Value *breakMask = LoadInst(breakLanesPtr,
                                           "break_mask");
@@ -723,7 +728,7 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
               "for/while/do/foreach loops.");
         return;
     }
-    Assert(controlFlowInfo.size() > 0);
+    AssertPos(currentPos, controlFlowInfo.size() > 0);
 
     if (ifsInCFAllUniform(CFInfo::Loop) || GetInternalMask() == LLVMMaskAllOn) {
         // Similarly to 'break' statements, we can immediately jump to the
@@ -739,7 +744,7 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
     else {
         // Otherwise update the stored value of which lanes have 'continue'd.
         // continueLanes = continueLanes | mask
-        Assert(continueLanesPtr);
+        AssertPos(currentPos, continueLanesPtr);
         llvm::Value *mask = GetInternalMask();
         llvm::Value *continueMask = 
             LoadInst(continueLanesPtr, "continue_mask");
@@ -767,7 +772,7 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
  */
 bool
 FunctionEmitContext::ifsInCFAllUniform(int type) const {
-    Assert(controlFlowInfo.size() > 0);
+    AssertPos(currentPos, controlFlowInfo.size() > 0);
     // Go backwards through controlFlowInfo, since we add new nested scopes
     // to the back.  Stop once we come to the first enclosing control flow
     // structure of the desired type.
@@ -778,7 +783,7 @@ FunctionEmitContext::ifsInCFAllUniform(int type) const {
             return false;
         --i;
     }
-    Assert(i >= 0); // else we didn't find the expected control flow type!
+    AssertPos(currentPos, i >= 0); // else we didn't find the expected control flow type!
     return true;
 }
 
@@ -786,7 +791,7 @@ FunctionEmitContext::ifsInCFAllUniform(int type) const {
 void
 FunctionEmitContext::jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target) {
     llvm::Value *allDone = NULL;
-    Assert(continueLanesPtr != NULL);
+    AssertPos(currentPos, continueLanesPtr != NULL);
     if (breakLanesPtr == NULL) {
         // In a foreach loop, break and return are illegal, and
         // breakLanesPtr is NULL.  In this case, the mask is guaranteed to
@@ -879,7 +884,7 @@ FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak) {
 
 void
 FunctionEmitContext::EndSwitch() {
-    Assert(bblock != NULL);
+    AssertPos(currentPos, bblock != NULL);
 
     CFInfo *ci = popCFState();
     if (ci->IsVarying() && bblock != NULL)
@@ -898,7 +903,7 @@ FunctionEmitContext::addSwitchMaskCheck(llvm::Value *mask) {
     // Find the basic block for the case or default label immediately after
     // the current one in the switch statement--that's where we want to
     // jump if the mask is all off at this label.
-    Assert(nextBlocks->find(bblock) != nextBlocks->end());
+    AssertPos(currentPos, nextBlocks->find(bblock) != nextBlocks->end());
     llvm::BasicBlock *bbNext = nextBlocks->find(bblock)->second;
 
     // Jump to the next one of the mask is all off; otherwise jump to the
@@ -912,11 +917,11 @@ FunctionEmitContext::addSwitchMaskCheck(llvm::Value *mask) {
     statement. */
 llvm::Value *
 FunctionEmitContext::getMaskAtSwitchEntry() {
-    Assert(controlFlowInfo.size() > 0);
+    AssertPos(currentPos, controlFlowInfo.size() > 0);
     int i = controlFlowInfo.size() - 1;
     while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Switch)
         --i;
-    Assert(i != -1);
+    AssertPos(currentPos, i != -1);
     return controlFlowInfo[i]->savedMask;
 }
 
@@ -931,7 +936,7 @@ FunctionEmitContext::EmitDefaultLabel(bool checkMask, SourcePos pos) {
 
     // If there's a default label in the switch, a basic block for it
     // should have been provided in the previous call to SwitchInst().
-    Assert(defaultBlock != NULL);
+    AssertPos(currentPos, defaultBlock != NULL);
 
     if (bblock != NULL)
         // The previous case in the switch fell through, or we're in a
@@ -993,13 +998,13 @@ FunctionEmitContext::EmitCaseLabel(int value, bool checkMask, SourcePos pos) {
 
     // Find the basic block for this case statement.
     llvm::BasicBlock *bbCase = NULL;
-    Assert(caseBlocks != NULL);
+    AssertPos(currentPos, caseBlocks != NULL);
     for (int i = 0; i < (int)caseBlocks->size(); ++i)
         if ((*caseBlocks)[i].first == value) {
             bbCase = (*caseBlocks)[i].second;
             break;
         }
-    Assert(bbCase != NULL);
+    AssertPos(currentPos, bbCase != NULL);
 
     if (bblock != NULL)
         // fall through from the previous case
@@ -1042,7 +1047,7 @@ FunctionEmitContext::SwitchInst(llvm::Value *expr, llvm::BasicBlock *bbDefault,
                 const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &bbNext) {
     // The calling code should have called StartSwitch() before calling
     // SwitchInst().
-    Assert(controlFlowInfo.size() &&
+    AssertPos(currentPos, controlFlowInfo.size() &&
            controlFlowInfo.back()->IsSwitch());
 
     switchExpr = expr;
@@ -1050,7 +1055,7 @@ FunctionEmitContext::SwitchInst(llvm::Value *expr, llvm::BasicBlock *bbDefault,
     caseBlocks = new std::vector<std::pair<int, llvm::BasicBlock *> >(bbCases);
     nextBlocks = new std::map<llvm::BasicBlock *, llvm::BasicBlock *>(bbNext);
     switchConditionWasUniform = 
-        (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(expr->getType()) == false);
+        (llvm::isa<llvm::VectorType>(expr->getType()) == false);
 
     if (switchConditionWasUniform == true) {
         // For a uniform switch condition, just wire things up to the LLVM
@@ -1061,7 +1066,7 @@ FunctionEmitContext::SwitchInst(llvm::Value *expr, llvm::BasicBlock *bbDefault,
             if (expr->getType() == LLVMTypes::Int32Type)
                 s->addCase(LLVMInt32(bbCases[i].first), bbCases[i].second);
             else {
-                Assert(expr->getType() == LLVMTypes::Int64Type);
+                AssertPos(currentPos, expr->getType() == LLVMTypes::Int64Type);
                 s->addCase(LLVMInt64(bbCases[i].first), bbCases[i].second);
             }
         }
@@ -1080,7 +1085,7 @@ FunctionEmitContext::SwitchInst(llvm::Value *expr, llvm::BasicBlock *bbDefault,
             // anyone.
             std::map<llvm::BasicBlock *, llvm::BasicBlock *>::const_iterator iter;
             iter = nextBlocks->find(NULL);
-            Assert(iter != nextBlocks->end());
+            AssertPos(currentPos, iter != nextBlocks->end());
             llvm::BasicBlock *bbFirst = iter->second;
             BranchInst(bbFirst);
             bblock = NULL;
@@ -1155,6 +1160,19 @@ FunctionEmitContext::GetLabeledBasicBlock(const std::string &label) {
         return NULL;
 }
 
+std::vector<std::string>
+FunctionEmitContext::GetLabels() {
+    // Returns the keys of labelMap (the label names), in the map's
+    // iteration order.  Only reserve capacity here: size-constructing the
+    // vector would leave empty strings ahead of the real label names.
+    std::vector<std::string> labels;
+    labels.reserve(labelMap.size());
+    std::map<std::string, llvm::BasicBlock*>::iterator iter;
+    for (iter = labelMap.begin(); iter != labelMap.end(); ++iter)
+        labels.push_back(iter->first);
+    return labels;
+}
+
 
 void
 FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
@@ -1176,7 +1194,7 @@ FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
             llvm::Value *retVal = expr->GetValue(this);
             if (retVal != NULL) {
                 if (returnType->IsUniformType() ||
-                    dynamic_cast<const ReferenceType *>(returnType) != NULL)
+                    CastType<ReferenceType>(returnType) != NULL)
                     StoreInst(retVal, returnValuePtr);
                 else {
                     // Use a masked store to store the value of the expression
@@ -1236,15 +1254,19 @@ llvm::Value *
 FunctionEmitContext::Any(llvm::Value *mask) {
     llvm::Value *mmval = LaneMask(mask);
     return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, mmval,
-                   LLVMInt32(0), "any_mm_cmp");
+                   LLVMInt64(0), LLVMGetName(mask, "_any"));
 }
 
 
 llvm::Value *
 FunctionEmitContext::All(llvm::Value *mask) {
     llvm::Value *mmval = LaneMask(mask);
+    llvm::Value *allOnMaskValue = (g->target.vectorWidth == 64) ?
+        LLVMInt64(~0ull) :
+        LLVMInt64((1ull << g->target.vectorWidth) - 1);
+
     return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
-                   LLVMInt32((1<<g->target.vectorWidth)-1), "all_mm_cmp");
+                   allOnMaskValue, LLVMGetName(mask, "_all"));
 }
 
 
@@ -1252,25 +1274,25 @@ llvm::Value *
 FunctionEmitContext::None(llvm::Value *mask) {
     llvm::Value *mmval = LaneMask(mask);
     return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
-                   LLVMInt32(0), "none_mm_cmp");
+                   LLVMInt64(0), LLVMGetName(mask, "_none"));
 }
 
 
 llvm::Value *
 FunctionEmitContext::LaneMask(llvm::Value *v) {
     // Call the target-dependent movmsk function to turn the vector mask
-    // into an i32 value
+    // into an i64 value
     std::vector<Symbol *> mm;
     m->symbolTable->LookupFunction("__movmsk", &mm);
     if (g->target.maskBitCount == 1)
-        Assert(mm.size() == 1);
+        AssertPos(currentPos, mm.size() == 1);
     else
         // There should be one with signed int signature, one unsigned int.
-        Assert(mm.size() == 2); 
+        AssertPos(currentPos, mm.size() == 2); 
     // We can actually call either one, since both are i32s as far as
     // LLVM's type system is concerned...
     llvm::Function *fmm = mm[0]->function;
-    return CallInst(fmm, NULL, v, "val_movmsk");
+    return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
 }
 
 
@@ -1288,17 +1310,17 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
     llvm::Value *mm1 = LaneMask(v1);
     llvm::Value *mm2 = LaneMask(v2);
     return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
-                   "v1==v2");
+                   LLVMGetName("equal", v1, v2));
 #endif
 }
 
 
 llvm::Value *
 FunctionEmitContext::GetStringPtr(const std::string &str) {
-#ifdef LLVM_3_1svn
-    llvm::Constant *lstr = llvm::ConstantDataArray::getString(*g->ctx, str);
-#else
+#ifdef LLVM_3_0
     llvm::Constant *lstr = llvm::ConstantArray::get(*g->ctx, str);
+#else
+    llvm::Constant *lstr = llvm::ConstantDataArray::getString(*g->ctx, str);
 #endif
     llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::InternalLinkage;
     llvm::Value *lstrPtr = new llvm::GlobalVariable(*m->module, lstr->getType(),
@@ -1318,26 +1340,26 @@ FunctionEmitContext::CreateBasicBlock(const char *name) {
 llvm::Value *
 FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
     if (b == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
     if (g->target.maskBitCount == 1)
         return b;
 
-    LLVM_TYPE_CONST llvm::ArrayType *at = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(b->getType());
+    llvm::ArrayType *at = 
+        llvm::dyn_cast<llvm::ArrayType>(b->getType());
     if (at) {
         // If we're given an array of vectors of i1s, then do the
         // conversion for each of the elements
-        LLVM_TYPE_CONST llvm::Type *boolArrayType = 
+        llvm::Type *boolArrayType = 
             llvm::ArrayType::get(LLVMTypes::BoolVectorType, at->getNumElements());
         llvm::Value *ret = llvm::UndefValue::get(boolArrayType);
 
         for (unsigned int i = 0; i < at->getNumElements(); ++i) {
             llvm::Value *elt = ExtractInst(b, i);
             llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, 
-                                         "val_to_boolvec32");
+                                         LLVMGetName(elt, "_to_boolvec32"));
             ret = InsertInst(ret, sext, i);
         }
         return ret;
@@ -1349,29 +1371,24 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
 
 static llvm::Value *
 lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) {
-#ifdef LLVM_3_1svn
-    llvm::Constant *sConstant = llvm::ConstantDataArray::getString(*g->ctx, s);
-#else
+#ifdef LLVM_3_0
     llvm::Constant *sConstant = llvm::ConstantArray::get(*g->ctx, s);
+#else
+    llvm::Constant *sConstant = llvm::ConstantDataArray::getString(*g->ctx, s);
 #endif
     llvm::Value *sPtr = new llvm::GlobalVariable(*m->module, sConstant->getType(), 
                                                  true /* const */,
                                                  llvm::GlobalValue::InternalLinkage,
                                                  sConstant, s);
     llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(0) };
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
     return llvm::GetElementPtrInst::Create(sPtr, arrayRef, "sptr", bblock);
-#else
-    return llvm::GetElementPtrInst::Create(sPtr, &indices[0], &indices[2],
-                                           "sptr", bblock);
-#endif
 }
 
 
 void
 FunctionEmitContext::AddInstrumentationPoint(const char *note) {
-    Assert(note != NULL);
+    AssertPos(currentPos, note != NULL);
     if (!g->emitInstrumentation)
         return;
 
@@ -1382,7 +1399,7 @@ FunctionEmitContext::AddInstrumentationPoint(const char *note) {
     args.push_back(lGetStringAsValue(bblock, note));
     // arg 3: line number
     args.push_back(LLVMInt32(currentPos.first_line));
-    // arg 4: current mask, movmsk'ed down to an int32
+    // arg 4: current mask, movmsk'ed down to an int64
     args.push_back(LaneMask(GetFullMask()));
 
     llvm::Function *finst = m->module->getFunction("ISPCInstrument");
@@ -1425,12 +1442,13 @@ FunctionEmitContext::StartScope() {
         if (debugScopes.size() > 0)
             parentScope = debugScopes.back();
         else
-            parentScope = diFunction;
+            parentScope = diSubprogram;
 
         llvm::DILexicalBlock lexicalBlock = 
             m->diBuilder->createLexicalBlock(parentScope, diFile,
                                              currentPos.first_line,
                                              currentPos.first_column);
+        AssertPos(currentPos, lexicalBlock.Verify());
         debugScopes.push_back(lexicalBlock);
     }
 }
@@ -1439,7 +1457,7 @@ FunctionEmitContext::StartScope() {
 void
 FunctionEmitContext::EndScope() {
     if (m->diBuilder != NULL) {
-        Assert(debugScopes.size() > 0);
+        AssertPos(currentPos, debugScopes.size() > 0);
         debugScopes.pop_back();
     }
 }
@@ -1447,7 +1465,7 @@ FunctionEmitContext::EndScope() {
 
 llvm::DIScope 
 FunctionEmitContext::GetDIScope() const {
-    Assert(debugScopes.size() > 0);
+    AssertPos(currentPos, debugScopes.size() > 0);
     return debugScopes.back();
 }
 
@@ -1458,14 +1476,17 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
         return;
 
     llvm::DIScope scope = GetDIScope();
+    llvm::DIType diType = sym->type->GetDIType(scope);
+    AssertPos(currentPos, diType.Verify());
     llvm::DIVariable var = 
         m->diBuilder->createLocalVariable(llvm::dwarf::DW_TAG_auto_variable,
                                           scope,
                                           sym->name,
                                           sym->pos.GetDIFile(),
                                           sym->pos.first_line,
-                                          sym->type->GetDIType(scope),
+                                          diType,
                                           true /* preserve through opts */);
+    AssertPos(currentPos, var.Verify());
     llvm::Instruction *declareInst = 
         m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
     AddDebugPos(declareInst, &sym->pos, &scope);
@@ -1473,19 +1494,26 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
 
 
 void
-FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
+FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym, int argNum) {
     if (m->diBuilder == NULL)
         return;
 
-    llvm::DIScope scope = diFunction;
+    llvm::DIScope scope = diSubprogram;
+    llvm::DIType diType = sym->type->GetDIType(scope);
+    AssertPos(currentPos, diType.Verify());
+    int flags = 0;
+
     llvm::DIVariable var = 
         m->diBuilder->createLocalVariable(llvm::dwarf::DW_TAG_arg_variable,
                                           scope,
                                           sym->name,
                                           sym->pos.GetDIFile(),
                                           sym->pos.first_line,
-                                          sym->type->GetDIType(scope),
-                                          true /* preserve through opts */);
+                                          diType,
+                                          true /* preserve through opts */,
+                                          flags,
+                                          argNum+1);
+    AssertPos(currentPos, var.Verify());
     llvm::Instruction *declareInst = 
         m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
     AddDebugPos(declareInst, &sym->pos, &scope);
@@ -1498,16 +1526,16 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
     Otherwise return zero.
  */
 static int
-lArrayVectorWidth(LLVM_TYPE_CONST llvm::Type *t) {
-    LLVM_TYPE_CONST llvm::ArrayType *arrayType = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(t);
+lArrayVectorWidth(llvm::Type *t) {
+    llvm::ArrayType *arrayType = 
+        llvm::dyn_cast<llvm::ArrayType>(t);
     if (arrayType == NULL)
         return 0;
 
     // We shouldn't be seeing arrays of anything but vectors being passed
     // to things like FunctionEmitContext::BinaryOperator() as operands.
-    LLVM_TYPE_CONST llvm::VectorType *vectorElementType = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
+    llvm::VectorType *vectorElementType = 
+        llvm::dyn_cast<llvm::VectorType>(arrayType->getElementType());
     Assert((vectorElementType != NULL &&
             (int)vectorElementType->getNumElements() == g->target.vectorWidth));
            
@@ -1520,12 +1548,12 @@ FunctionEmitContext::BinaryOperator(llvm::Instruction::BinaryOps inst,
                                     llvm::Value *v0, llvm::Value *v1, 
                                     const char *name) {
     if (v0 == NULL || v1 == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
-    Assert(v0->getType() == v1->getType());
-    LLVM_TYPE_CONST llvm::Type *type = v0->getType();
+    AssertPos(currentPos, v0->getType() == v1->getType());
+    llvm::Type *type = v0->getType();
     int arraySize = lArrayVectorWidth(type);
     if (arraySize == 0) {
         llvm::Instruction *bop = 
@@ -1552,14 +1580,14 @@ FunctionEmitContext::BinaryOperator(llvm::Instruction::BinaryOps inst,
 llvm::Value *
 FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
     if (v == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
     // Similarly to BinaryOperator, do the operation on all the elements of
     // the array if we're given an array type; otherwise just do the
     // regular llvm operation.
-    LLVM_TYPE_CONST llvm::Type *type = v->getType();
+    llvm::Type *type = v->getType();
     int arraySize = lArrayVectorWidth(type);
     if (arraySize == 0) {
         llvm::Instruction *binst = 
@@ -1584,18 +1612,18 @@ FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
 // Given the llvm Type that represents an ispc VectorType, return an
 // equally-shaped type with boolean elements.  (This is the type that will
 // be returned from CmpInst with ispc VectorTypes).
-static LLVM_TYPE_CONST llvm::Type *
-lGetMatchingBoolVectorType(LLVM_TYPE_CONST llvm::Type *type) {
-    LLVM_TYPE_CONST llvm::ArrayType *arrayType = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
+static llvm::Type *
+lGetMatchingBoolVectorType(llvm::Type *type) {
+    llvm::ArrayType *arrayType = 
+        llvm::dyn_cast<llvm::ArrayType>(type);
     Assert(arrayType != NULL);
 
-    LLVM_TYPE_CONST llvm::VectorType *vectorElementType = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
+    llvm::VectorType *vectorElementType = 
+        llvm::dyn_cast<llvm::VectorType>(arrayType->getElementType());
     Assert(vectorElementType != NULL);
     Assert((int)vectorElementType->getNumElements() == g->target.vectorWidth);
 
-    LLVM_TYPE_CONST llvm::Type *base = 
+    llvm::Type *base = 
         llvm::VectorType::get(LLVMTypes::BoolType, g->target.vectorWidth);
     return llvm::ArrayType::get(base, arrayType->getNumElements());
 }
@@ -1607,12 +1635,12 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
                              llvm::Value *v0, llvm::Value *v1, 
                              const char *name) {
     if (v0 == NULL || v1 == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
-    Assert(v0->getType() == v1->getType());
-    LLVM_TYPE_CONST llvm::Type *type = v0->getType();
+    AssertPos(currentPos, v0->getType() == v1->getType());
+    llvm::Type *type = v0->getType();
     int arraySize = lArrayVectorWidth(type);
     if (arraySize == 0) {
         llvm::Instruction *ci = 
@@ -1622,7 +1650,7 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
         return ci;
     }
     else {
-        LLVM_TYPE_CONST llvm::Type *boolType = lGetMatchingBoolVectorType(type);
+        llvm::Type *boolType = lGetMatchingBoolVectorType(type);
         llvm::Value *ret = llvm::UndefValue::get(boolType);
         for (int i = 0; i < arraySize; ++i) {
             llvm::Value *a = ExtractInst(v0, i);
@@ -1638,15 +1666,15 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
 llvm::Value *
 FunctionEmitContext::SmearUniform(llvm::Value *value, const char *name) {
     if (value == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
     llvm::Value *ret = NULL;
-    LLVM_TYPE_CONST llvm::Type *eltType = value->getType();
+    llvm::Type *eltType = value->getType();
 
-    LLVM_TYPE_CONST llvm::PointerType *pt = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(eltType);
+    llvm::PointerType *pt = 
+        llvm::dyn_cast<llvm::PointerType>(eltType);
     if (pt != NULL) {
         // Varying pointers are represented as vectors of i32/i64s
         ret = llvm::UndefValue::get(LLVMTypes::VoidPointerVectorType);
@@ -1669,16 +1697,17 @@ FunctionEmitContext::SmearUniform(llvm::Value *value, const char *name) {
                                     
 
 llvm::Value *
-FunctionEmitContext::BitCastInst(llvm::Value *value, 
-                                 LLVM_TYPE_CONST llvm::Type *type, 
+FunctionEmitContext::BitCastInst(llvm::Value *value, llvm::Type *type, 
                                  const char *name) {
     if (value == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
-    llvm::Instruction *inst = 
-        new llvm::BitCastInst(value, type, name ? name : "bitcast", bblock);
+    if (name == NULL)
+        name = LLVMGetName(value, "_bitcast");
+
+    llvm::Instruction *inst = new llvm::BitCastInst(value, type, name, bblock);
     AddDebugPos(inst);
     return inst;
 }
@@ -1687,96 +1716,102 @@ FunctionEmitContext::BitCastInst(llvm::Value *value,
 llvm::Value *
 FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {
     if (value == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(value->getType()))
+    if (llvm::isa<llvm::VectorType>(value->getType()))
         // no-op for varying pointers; they're already vectors of ints
         return value;
 
-    LLVM_TYPE_CONST llvm::Type *type = LLVMTypes::PointerIntType;
-    llvm::Instruction *inst = 
-        new llvm::PtrToIntInst(value, type, name ? name : "ptr2int", bblock);
+    if (name == NULL)
+        name = LLVMGetName(value, "_ptr2int");
+    llvm::Type *type = LLVMTypes::PointerIntType;
+    llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock);
     AddDebugPos(inst);
     return inst;
 }
 
 
 llvm::Value *
-FunctionEmitContext::PtrToIntInst(llvm::Value *value, 
-                                  LLVM_TYPE_CONST llvm::Type *toType,
+FunctionEmitContext::PtrToIntInst(llvm::Value *value, llvm::Type *toType,
                                   const char *name) {
     if (value == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
-    LLVM_TYPE_CONST llvm::Type *fromType = value->getType();
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(fromType)) {
+    if (name == NULL)
+        name = LLVMGetName(value, "_ptr2int");
+
+    llvm::Type *fromType = value->getType();
+    if (llvm::isa<llvm::VectorType>(fromType)) {
         // varying pointer
         if (fromType == toType)
             // already the right type--done
             return value;
         else if (fromType->getScalarSizeInBits() > toType->getScalarSizeInBits())
-            return TruncInst(value, toType, "ptr_to_int");
+            return TruncInst(value, toType, name);
         else {
-            Assert(fromType->getScalarSizeInBits() <
+            AssertPos(currentPos, fromType->getScalarSizeInBits() <
                    toType->getScalarSizeInBits());
-            return ZExtInst(value, toType, "ptr_to_int");
+            return ZExtInst(value, toType, name);
         }
     }
 
-    llvm::Instruction *inst = 
-        new llvm::PtrToIntInst(value, toType, name ? name : "ptr2int", bblock);
+    llvm::Instruction *inst = new llvm::PtrToIntInst(value, toType, name, bblock);
     AddDebugPos(inst);
     return inst;
 }
 
 
 llvm::Value *
-FunctionEmitContext::IntToPtrInst(llvm::Value *value, 
-                                  LLVM_TYPE_CONST llvm::Type *toType,
+FunctionEmitContext::IntToPtrInst(llvm::Value *value, llvm::Type *toType,
                                   const char *name) {
     if (value == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
-    LLVM_TYPE_CONST llvm::Type *fromType = value->getType();
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(fromType)) {
+    if (name == NULL)
+        name = LLVMGetName(value, "_int2ptr");
+
+    llvm::Type *fromType = value->getType();
+    if (llvm::isa<llvm::VectorType>(fromType)) {
         // varying pointer
         if (fromType == toType)
             // done
             return value;
         else if (fromType->getScalarSizeInBits() > toType->getScalarSizeInBits())
-            return TruncInst(value, toType, "int_to_ptr");
+            return TruncInst(value, toType, name);
         else {
-            Assert(fromType->getScalarSizeInBits() <
+            AssertPos(currentPos, fromType->getScalarSizeInBits() <
                    toType->getScalarSizeInBits());
-            return ZExtInst(value, toType, "int_to_ptr");
+            return ZExtInst(value, toType, name);
         }
     }
 
-    llvm::Instruction *inst = 
-        new llvm::IntToPtrInst(value, toType, name ? name : "int2ptr", bblock);
+    llvm::Instruction *inst = new llvm::IntToPtrInst(value, toType, name, 
+                                                     bblock);
     AddDebugPos(inst);
     return inst;
 }
 
 
 llvm::Instruction *
-FunctionEmitContext::TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+FunctionEmitContext::TruncInst(llvm::Value *value, llvm::Type *type,
                                const char *name) {
     if (value == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
+    if (name == NULL)
+        name = LLVMGetName(value, "_trunc");
+
     // TODO: we should probably handle the array case as in
     // e.g. BitCastInst(), but we don't currently need that functionality
-    llvm::Instruction *inst = 
-        new llvm::TruncInst(value, type, name ? name : "trunc", bblock);
+    llvm::Instruction *inst = new llvm::TruncInst(value, type, name, bblock);
     AddDebugPos(inst);
     return inst;
 }
@@ -1784,67 +1819,76 @@ FunctionEmitContext::TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *t
 
 llvm::Instruction *
 FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
-                              LLVM_TYPE_CONST llvm::Type *type, const char *name) {
+                              llvm::Type *type, const char *name) {
     if (value == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
+    if (name == NULL)
+        name = LLVMGetName(value, "_cast");
+
     // TODO: we should probably handle the array case as in
     // e.g. BitCastInst(), but we don't currently need that functionality
-    llvm::Instruction *inst = 
-        llvm::CastInst::Create(op, value, type, name ? name : "cast", bblock);
+    llvm::Instruction *inst = llvm::CastInst::Create(op, value, type, name,
+                                                     bblock);
     AddDebugPos(inst);
     return inst;
 }
 
 
 llvm::Instruction *
-FunctionEmitContext::FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
+FunctionEmitContext::FPCastInst(llvm::Value *value, llvm::Type *type, 
                                 const char *name) {
     if (value == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
+    if (name == NULL)
+        name = LLVMGetName(value, "_cast");
+
     // TODO: we should probably handle the array case as in
     // e.g. BitCastInst(), but we don't currently need that functionality
-    llvm::Instruction *inst = 
-        llvm::CastInst::CreateFPCast(value, type, name ? name : "fpcast", bblock);
+    llvm::Instruction *inst = llvm::CastInst::CreateFPCast(value, type, name, bblock);
     AddDebugPos(inst);
     return inst;
 }
 
 
 llvm::Instruction *
-FunctionEmitContext::SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
+FunctionEmitContext::SExtInst(llvm::Value *value, llvm::Type *type, 
                               const char *name) {
     if (value == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
+    if (name == NULL)
+        name = LLVMGetName(value, "_sext");
+
     // TODO: we should probably handle the array case as in
     // e.g. BitCastInst(), but we don't currently need that functionality
-    llvm::Instruction *inst = 
-        new llvm::SExtInst(value, type, name ? name : "sext", bblock);
+    llvm::Instruction *inst = new llvm::SExtInst(value, type, name, bblock);
     AddDebugPos(inst);
     return inst;
 }
 
 
 llvm::Instruction *
-FunctionEmitContext::ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
+FunctionEmitContext::ZExtInst(llvm::Value *value, llvm::Type *type, 
                               const char *name) {
     if (value == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
+    if (name == NULL)
+        name = LLVMGetName(value, "_zext");
+
     // TODO: we should probably handle the array case as in
     // e.g. BitCastInst(), but we don't currently need that functionality
-    llvm::Instruction *inst = 
-        new llvm::ZExtInst(value, type, name ? name : "zext", bblock);
+    llvm::Instruction *inst = new llvm::ZExtInst(value, type, name, bblock);
     AddDebugPos(inst);
     return inst;
 }
@@ -1865,74 +1909,76 @@ FunctionEmitContext::applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
     llvm::Value *scale = g->target.SizeOf(scaleType->LLVMType(g->ctx), bblock);
 
     bool indexIsVarying = 
-        llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index->getType());
+        llvm::isa<llvm::VectorType>(index->getType());
     llvm::Value *offset = NULL;
     if (indexIsVarying == false) {
         // Truncate or sign extend the index as appropriate to a 32 or
         // 64-bit type.
         if ((g->target.is32Bit || g->opt.force32BitAddressing) && 
             index->getType() == LLVMTypes::Int64Type)
-            index = TruncInst(index, LLVMTypes::Int32Type, "trunc_index");
+            index = TruncInst(index, LLVMTypes::Int32Type);
         else if ((!g->target.is32Bit && !g->opt.force32BitAddressing) &&
                  index->getType() == LLVMTypes::Int32Type)
-            index = SExtInst(index, LLVMTypes::Int64Type, "sext_index");
+            index = SExtInst(index, LLVMTypes::Int64Type);
 
         // do a scalar multiply to get the offset as index * scale and then
         // smear the result out to be a vector; this is more efficient than
         // first promoting both the scale and the index to vectors and then
         // multiplying.
         offset = BinaryOperator(llvm::Instruction::Mul, scale, index);
-        offset = SmearUniform(offset, "offset_smear");
+        offset = SmearUniform(offset);
     }
     else {
         // Similarly, truncate or sign extend the index to be a 32 or 64
         // bit vector type
         if ((g->target.is32Bit || g->opt.force32BitAddressing) && 
             index->getType() == LLVMTypes::Int64VectorType)
-            index = TruncInst(index, LLVMTypes::Int32VectorType, "trunc_index");
+            index = TruncInst(index, LLVMTypes::Int32VectorType); 
         else if ((!g->target.is32Bit && !g->opt.force32BitAddressing) &&
                  index->getType() == LLVMTypes::Int32VectorType)
-            index = SExtInst(index, LLVMTypes::Int64VectorType, "sext_index");
+            index = SExtInst(index, LLVMTypes::Int64VectorType);
 
-        scale = SmearUniform(scale, "scale_smear");
+        scale = SmearUniform(scale);
 
         // offset = index * scale
-        offset = BinaryOperator(llvm::Instruction::Mul, scale, index, "offset");
+        offset = BinaryOperator(llvm::Instruction::Mul, scale, index, 
+                                LLVMGetName("mul", scale, index));
     }
 
     // For 64-bit targets, if we've been doing our offset calculations in
     // 32 bits, we still have to convert to a 64-bit value before we
     // actually add the offset to the pointer.
     if (g->target.is32Bit == false && g->opt.force32BitAddressing == true)
-        offset = SExtInst(offset, LLVMTypes::Int64VectorType, "offset_to_64");
+        offset = SExtInst(offset, LLVMTypes::Int64VectorType, 
+                          LLVMGetName(offset, "_to_64"));
 
     // Smear out the pointer to be varying; either the base pointer or the
     // index must be varying for this method to be called.
     bool baseIsUniform = 
-        (llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(basePtr->getType()));
-    Assert(baseIsUniform == false || indexIsVarying == true);
-    llvm::Value *varyingPtr = baseIsUniform ? 
-        SmearUniform(basePtr, "ptr_smear") : basePtr;
+        (llvm::isa<llvm::PointerType>(basePtr->getType()));
+    AssertPos(currentPos, baseIsUniform == false || indexIsVarying == true);
+    llvm::Value *varyingPtr = baseIsUniform ? SmearUniform(basePtr) : basePtr;
 
     // newPtr = ptr + offset
-    return BinaryOperator(llvm::Instruction::Add, varyingPtr, offset, "new_ptr");
+    return BinaryOperator(llvm::Instruction::Add, varyingPtr, offset, 
+                          LLVMGetName(basePtr, "_offset"));
 }
 
 
 void
 FunctionEmitContext::MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1) {
-    LLVM_TYPE_CONST llvm::Type *type0 = (*v0)->getType();
-    LLVM_TYPE_CONST llvm::Type *type1 = (*v1)->getType();
+    llvm::Type *type0 = (*v0)->getType();
+    llvm::Type *type1 = (*v1)->getType();
 
     // First, promote to a vector type if one of the two values is a vector
     // type
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type0) &&
-        !llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type1)) {
+    if (llvm::isa<llvm::VectorType>(type0) &&
+        !llvm::isa<llvm::VectorType>(type1)) {
         *v1 = SmearUniform(*v1, "smear_v1");
         type1 = (*v1)->getType();
     }
-    if (!llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type0) &&
-        llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type1)) {
+    if (!llvm::isa<llvm::VectorType>(type0) &&
+        llvm::isa<llvm::VectorType>(type1)) {
         *v0 = SmearUniform(*v0, "smear_v0");
         type0 = (*v0)->getType();
     }
@@ -1969,7 +2015,7 @@ lComputeSliceIndex(FunctionEmitContext *ctx, int soaWidth,
 
     ctx->MatchIntegerTypes(&indexValue, &ptrSliceOffset);
 
-    LLVM_TYPE_CONST llvm::Type *indexType = indexValue->getType();
+    llvm::Type *indexType = indexValue->getType();
     llvm::Value *shift = LLVMIntAsType(logWidth, indexType);
     llvm::Value *mask = LLVMIntAsType(soaWidth-1, indexType);
 
@@ -1997,15 +2043,15 @@ FunctionEmitContext::MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset) {
     // Create a small struct where the first element is the type of the
     // given pointer and the second element is the type of the offset
     // value.
-    std::vector<LLVM_TYPE_CONST llvm::Type *> eltTypes;
+    std::vector<llvm::Type *> eltTypes;
     eltTypes.push_back(ptr->getType());
     eltTypes.push_back(offset->getType());
-    LLVM_TYPE_CONST llvm::StructType *st = 
+    llvm::StructType *st = 
         llvm::StructType::get(*g->ctx, eltTypes);
 
     llvm::Value *ret = llvm::UndefValue::get(st);
-    ret = InsertInst(ret, ptr, 0);
-    ret = InsertInst(ret, offset, 1);
+    ret = InsertInst(ret, ptr, 0, LLVMGetName(ret, "_slice_ptr"));
+    ret = InsertInst(ret, offset, 1, LLVMGetName(ret, "_slice_offset"));
     return ret;
 }
 
@@ -2014,21 +2060,21 @@ llvm::Value *
 FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index, 
                                        const Type *ptrRefType, const char *name) {
     if (basePtr == NULL || index == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
     // Regularize to a standard pointer type for basePtr's type
     const PointerType *ptrType;
-    if (dynamic_cast<const ReferenceType *>(ptrRefType) != NULL)
+    if (CastType<ReferenceType>(ptrRefType) != NULL)
         ptrType = PointerType::GetUniform(ptrRefType->GetReferenceTarget());
     else {
-        ptrType = dynamic_cast<const PointerType *>(ptrRefType);
-        Assert(ptrType != NULL);
+        ptrType = CastType<PointerType>(ptrRefType);
+        AssertPos(currentPos, ptrType != NULL);
     }
 
     if (ptrType->IsSlice()) {
-        Assert(llvm::isa<LLVM_TYPE_CONST llvm::StructType>(basePtr->getType()));
+        AssertPos(currentPos, llvm::isa<llvm::StructType>(basePtr->getType()));
 
         llvm::Value *ptrSliceOffset = ExtractInst(basePtr, 1);
         if (ptrType->IsFrozenSlice() == false) {
@@ -2056,27 +2102,21 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
     // Double-check consistency between the given pointer type and its LLVM
     // type.
     if (ptrType->IsUniformType())
-        Assert(llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(basePtr->getType()));
+        AssertPos(currentPos, llvm::isa<llvm::PointerType>(basePtr->getType()));
     else if (ptrType->IsVaryingType())
-        Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(basePtr->getType()));
+        AssertPos(currentPos, llvm::isa<llvm::VectorType>(basePtr->getType()));
 
     bool indexIsVaryingType = 
-        llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index->getType());
+        llvm::isa<llvm::VectorType>(index->getType());
 
     if (indexIsVaryingType == false && ptrType->IsUniformType() == true) {
         // The easy case: both the base pointer and the indices are
         // uniform, so just emit the regular LLVM GEP instruction
         llvm::Value *ind[1] = { index };
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
         llvm::ArrayRef<llvm::Value *> arrayRef(&ind[0], &ind[1]);
         llvm::Instruction *inst = 
             llvm::GetElementPtrInst::Create(basePtr, arrayRef,
                                             name ? name : "gep", bblock);
-#else
-        llvm::Instruction *inst = 
-            llvm::GetElementPtrInst::Create(basePtr, &ind[0], &ind[1], 
-                                            name ? name : "gep", bblock);
-#endif
         AddDebugPos(inst);
         return inst;
     }
@@ -2090,24 +2130,24 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
                                        llvm::Value *index1, const Type *ptrRefType,
                                        const char *name) {
     if (basePtr == NULL || index0 == NULL || index1 == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
     // Regaularize the pointer type for basePtr
     const PointerType *ptrType = NULL;
-    if (dynamic_cast<const ReferenceType *>(ptrRefType) != NULL)
+    if (CastType<ReferenceType>(ptrRefType) != NULL)
         ptrType = PointerType::GetUniform(ptrRefType->GetReferenceTarget());
     else {
-        ptrType = dynamic_cast<const PointerType *>(ptrRefType);
-        Assert(ptrType != NULL);
+        ptrType = CastType<PointerType>(ptrRefType);
+        AssertPos(currentPos, ptrType != NULL);
     }
 
     if (ptrType->IsSlice()) {
         // Similar to the 1D GEP implementation above, for non-frozen slice
         // pointers we do the two-step indexing calculation and then pass
         // the new major index on to a recursive GEP call.
-        Assert(llvm::isa<LLVM_TYPE_CONST llvm::StructType>(basePtr->getType()));
+        AssertPos(currentPos, llvm::isa<llvm::StructType>(basePtr->getType()));
         llvm::Value *ptrSliceOffset = ExtractInst(basePtr, 1);
         if (ptrType->IsFrozenSlice() == false) {
             llvm::Value *newSliceOffset;
@@ -2124,25 +2164,19 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
     }
 
     bool index0IsVaryingType = 
-        llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index0->getType());
+        llvm::isa<llvm::VectorType>(index0->getType());
     bool index1IsVaryingType = 
-        llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index1->getType());
+        llvm::isa<llvm::VectorType>(index1->getType());
 
     if (index0IsVaryingType == false && index1IsVaryingType == false && 
         ptrType->IsUniformType() == true) {
         // The easy case: both the base pointer and the indices are
         // uniform, so just emit the regular LLVM GEP instruction
         llvm::Value *indices[2] = { index0, index1 };
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
         llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
         llvm::Instruction *inst = 
             llvm::GetElementPtrInst::Create(basePtr, arrayRef,
                                             name ? name : "gep", bblock);
-#else
-        llvm::Instruction *inst = 
-            llvm::GetElementPtrInst::Create(basePtr, &indices[0], &indices[2], 
-                                            name ? name : "gep", bblock);
-#endif
         AddDebugPos(inst);
         return inst;
     }
@@ -2153,11 +2187,11 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
         // Now index into the second dimension with index1.  First figure
         // out the type of ptr0.
         const Type *baseType = ptrType->GetBaseType();
-        const SequentialType *st = dynamic_cast<const SequentialType *>(baseType);
-        Assert(st != NULL);
+        const SequentialType *st = CastType<SequentialType>(baseType);
+        AssertPos(currentPos, st != NULL);
 
         bool ptr0IsUniform = 
-            llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(ptr0->getType());
+            llvm::isa<llvm::PointerType>(ptr0->getType());
         const Type *ptr0BaseType = st->GetElementType();
         const Type *ptr0Type = ptr0IsUniform ?
             PointerType::GetUniform(ptr0BaseType) : 
@@ -2173,18 +2207,29 @@ FunctionEmitContext::AddElementOffset(llvm::Value *fullBasePtr, int elementNum,
                                       const Type *ptrRefType, const char *name,
                                       const PointerType **resultPtrType) {
     if (resultPtrType != NULL)
-        Assert(ptrRefType != NULL);
+        AssertPos(currentPos, ptrRefType != NULL);
+
+    llvm::PointerType *llvmPtrType = 
+        llvm::dyn_cast<llvm::PointerType>(fullBasePtr->getType());
+    if (llvmPtrType != NULL) {
+        llvm::StructType *llvmStructType = 
+            llvm::dyn_cast<llvm::StructType>(llvmPtrType->getElementType());
+        if (llvmStructType != NULL && llvmStructType->isSized() == false) {
+            AssertPos(currentPos, m->errorCount > 0);
+            return NULL;
+        }
+    }
 
     // (Unfortunately) it's not required to pass a non-NULL ptrRefType, but
     // if we have one, regularize into a pointer type.
     const PointerType *ptrType = NULL;
     if (ptrRefType != NULL) {
         // Normalize references to uniform pointers
-        if (dynamic_cast<const ReferenceType *>(ptrRefType) != NULL)
+        if (CastType<ReferenceType>(ptrRefType) != NULL)
             ptrType = PointerType::GetUniform(ptrRefType->GetReferenceTarget());
         else
-            ptrType = dynamic_cast<const PointerType *>(ptrRefType);
-        Assert(ptrType != NULL);
+            ptrType = CastType<PointerType>(ptrRefType);
+        AssertPos(currentPos, ptrType != NULL);
     }
 
     // Similarly, we have to see if the pointer type is a struct to see if
@@ -2192,10 +2237,10 @@ FunctionEmitContext::AddElementOffset(llvm::Value *fullBasePtr, int elementNum,
     // unfortunate...
     llvm::Value *basePtr = fullBasePtr;
     bool baseIsSlicePtr = 
-        llvm::isa<LLVM_TYPE_CONST llvm::StructType>(fullBasePtr->getType());
+        llvm::isa<llvm::StructType>(fullBasePtr->getType());
     const PointerType *rpt;
     if (baseIsSlicePtr) {
-        Assert(ptrType != NULL);
+        AssertPos(currentPos, ptrType != NULL);
         // Update basePtr to just be the part that actually points to the
         // start of an soa<> struct for now; the element offset computation
         // doesn't change the slice offset, so we'll incorporate that into
@@ -2208,10 +2253,10 @@ FunctionEmitContext::AddElementOffset(llvm::Value *fullBasePtr, int elementNum,
     // Return the pointer type of the result of this call, for callers that
     // want it.
     if (resultPtrType != NULL) {
-        Assert(ptrType != NULL);
-        const CollectionType *ct = 
-            dynamic_cast<const CollectionType *>(ptrType->GetBaseType());
-        Assert(ct != NULL);
+        AssertPos(currentPos, ptrType != NULL);
+        const CollectionType *ct =
+            CastType<CollectionType>(ptrType->GetBaseType());
+        AssertPos(currentPos, ct != NULL);
         *resultPtrType = new PointerType(ct->GetElementType(elementNum),
                                          ptrType->GetVariability(),
                                          ptrType->IsConstType(),
@@ -2222,22 +2267,15 @@ FunctionEmitContext::AddElementOffset(llvm::Value *fullBasePtr, int elementNum,
     if (ptrType == NULL || ptrType->IsUniformType()) {
         // If the pointer is uniform, we can use the regular LLVM GEP.
         llvm::Value *offsets[2] = { LLVMInt32(0), LLVMInt32(elementNum) };
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
         llvm::ArrayRef<llvm::Value *> arrayRef(&offsets[0], &offsets[2]);
         resultPtr = 
             llvm::GetElementPtrInst::Create(basePtr, arrayRef,
                                             name ? name : "struct_offset", bblock);
-#else
-        resultPtr =
-            llvm::GetElementPtrInst::Create(basePtr, &offsets[0], &offsets[2],
-                                            name ? name : "struct_offset", bblock);
-#endif
     }
     else {
         // Otherwise do the math to find the offset and add it to the given
         // varying pointers
-        const StructType *st = 
-            dynamic_cast<const StructType *>(ptrType->GetBaseType());
+        const StructType *st = CastType<StructType>(ptrType->GetBaseType());
         llvm::Value *offset = NULL;
         if (st != NULL)
             // If the pointer is to a structure, Target::StructOffset() gives
@@ -2248,9 +2286,9 @@ FunctionEmitContext::AddElementOffset(llvm::Value *fullBasePtr, int elementNum,
             // Otherwise we should have a vector or array here and the offset
             // is given by the element number times the size of the element
             // type of the vector.
-            const SequentialType *st = 
-                dynamic_cast<const SequentialType *>(ptrType->GetBaseType());
-            Assert(st != NULL);
+            const SequentialType *st =
+                CastType<SequentialType>(ptrType->GetBaseType());
+            AssertPos(currentPos, st != NULL);
             llvm::Value *size = 
                 g->target.SizeOf(st->GetElementType()->LLVMType(g->ctx), bblock);
             llvm::Value *scale = (g->target.is32Bit || g->opt.force32BitAddressing) ?
@@ -2282,22 +2320,25 @@ FunctionEmitContext::AddElementOffset(llvm::Value *fullBasePtr, int elementNum,
 llvm::Value *
 FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
     if (ptr == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
-    LLVM_TYPE_CONST llvm::PointerType *pt = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(ptr->getType());
-    Assert(pt != NULL);
+    llvm::PointerType *pt = 
+        llvm::dyn_cast<llvm::PointerType>(ptr->getType());
+    AssertPos(currentPos, pt != NULL);
+
+    if (name == NULL)
+        name = LLVMGetName(ptr, "_load");
 
     // FIXME: it's not clear to me that we generate unaligned vector loads
     // of varying stuff out of the front-end any more.  (Only by the
     // optimization passes that lower gathers to vector loads, I think..)
     // So remove this??
     int align = 0;
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(pt->getElementType()))
+    if (llvm::isa<llvm::VectorType>(pt->getElementType()))
         align = 1;
-    llvm::Instruction *inst = new llvm::LoadInst(ptr, name ? name : "load",
+    llvm::Instruction *inst = new llvm::LoadInst(ptr, name, 
                                                  false /* not volatile */,
                                                  align, bblock);
     AddDebugPos(inst);
@@ -2312,10 +2353,10 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
 static llvm::Value *
 lFinalSliceOffset(FunctionEmitContext *ctx, llvm::Value *ptr,
                   const PointerType **ptrType) {
-    Assert(dynamic_cast<const PointerType *>(*ptrType) != NULL);
+    Assert(CastType<PointerType>(*ptrType) != NULL);
 
-    llvm::Value *slicePtr = ctx->ExtractInst(ptr, 0, "slice_ptr");
-    llvm::Value *sliceOffset = ctx->ExtractInst(ptr, 1, "slice_offset");
+    llvm::Value *slicePtr = ctx->ExtractInst(ptr, 0, LLVMGetName(ptr, "_ptr"));
+    llvm::Value *sliceOffset = ctx->ExtractInst(ptr, 1, LLVMGetName(ptr, "_offset"));
 
     // slicePtr should be a pointer to an soa-width wide array of the
     // final atomic/enum/pointer type
@@ -2336,7 +2377,7 @@ lFinalSliceOffset(FunctionEmitContext *ctx, llvm::Value *ptr,
 
     // And finally index based on the slice offset
     return ctx->GetElementPtrInst(slicePtr, sliceOffset, *ptrType,
-                                  "final_slice_gep");
+                                  LLVMGetName(slicePtr, "_final_gep"));
 }
 
 
@@ -2349,13 +2390,12 @@ FunctionEmitContext::loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask,
                                         const char *name) {
     const Type *unifType = ptrType->GetBaseType()->GetAsUniformType();
 
-    const CollectionType *ct = 
-        dynamic_cast<const CollectionType *>(ptrType->GetBaseType());
+    const CollectionType *ct = CastType<CollectionType>(ptrType->GetBaseType());
     if (ct != NULL) {
         // If we have a struct/array, we need to decompose it into
         // individual element loads to fill in the result structure since
         // the SOA slice of values we need isn't contiguous in memory...
-        LLVM_TYPE_CONST llvm::Type *llvmReturnType = unifType->LLVMType(g->ctx);
+        llvm::Type *llvmReturnType = unifType->LLVMType(g->ctx);
         llvm::Value *retValue = llvm::UndefValue::get(llvmReturnType);
 
         for (int i = 0; i < ct->GetElementCount(); ++i) {
@@ -2382,18 +2422,21 @@ llvm::Value *
 FunctionEmitContext::LoadInst(llvm::Value *ptr, llvm::Value *mask,
                               const Type *ptrRefType, const char *name) {
     if (ptr == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
-    Assert(ptrRefType != NULL && mask != NULL);
+    AssertPos(currentPos, ptrRefType != NULL && mask != NULL);
+
+    if (name == NULL)
+        name = LLVMGetName(ptr, "_load");
 
     const PointerType *ptrType;
-    if (dynamic_cast<const ReferenceType *>(ptrRefType) != NULL)
+    if (CastType<ReferenceType>(ptrRefType) != NULL)
         ptrType = PointerType::GetUniform(ptrRefType->GetReferenceTarget());
     else {
-        ptrType = dynamic_cast<const PointerType *>(ptrRefType);
-        Assert(ptrType != NULL);
+        ptrType = CastType<PointerType>(ptrRefType);
+        AssertPos(currentPos, ptrType != NULL);
     }
 
     if (ptrType->IsUniformType()) {
@@ -2409,15 +2452,15 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, llvm::Value *mask,
             // atomic types, we need to make sure that the compiler emits
             // unaligned vector loads, so we specify a reduced alignment here.
             int align = 0;
-            const AtomicType *atomicType = 
-                dynamic_cast<const AtomicType *>(ptrType->GetBaseType());
+            const AtomicType *atomicType =
+                CastType<AtomicType>(ptrType->GetBaseType());
             if (atomicType != NULL && atomicType->IsVaryingType())
                 // We actually just want to align to the vector element
                 // alignment, but can't easily get that here, so just tell LLVM
                 // it's totally unaligned.  (This shouldn't make any difference
                 // vs the proper alignment in practice.)
                 align = 1;
-            llvm::Instruction *inst = new llvm::LoadInst(ptr, name ? name : "load",
+            llvm::Instruction *inst = new llvm::LoadInst(ptr, name,
                                                          false /* not volatile */,
                                                          align, bblock);
             AddDebugPos(inst);
@@ -2436,13 +2479,13 @@ llvm::Value *
 FunctionEmitContext::gather(llvm::Value *ptr, const PointerType *ptrType, 
                             llvm::Value *mask, const char *name) {
     // We should have a varying pointer if we get here...
-    Assert(ptrType->IsVaryingType());
+    AssertPos(currentPos, ptrType->IsVaryingType());
 
     const Type *returnType = ptrType->GetBaseType()->GetAsVaryingType();
-    LLVM_TYPE_CONST llvm::Type *llvmReturnType = returnType->LLVMType(g->ctx);
+    llvm::Type *llvmReturnType = returnType->LLVMType(g->ctx);
 
     const CollectionType *collectionType = 
-        dynamic_cast<const CollectionType *>(ptrType->GetBaseType());
+        CastType<CollectionType>(ptrType->GetBaseType());
     if (collectionType != NULL) {
         // For collections, recursively gather element wise to find the
         // result.
@@ -2477,7 +2520,7 @@ FunctionEmitContext::gather(llvm::Value *ptr, const PointerType *ptrType,
 
     // Figure out which gather function to call based on the size of
     // the elements.
-    const PointerType *pt = dynamic_cast<const PointerType *>(returnType);
+    const PointerType *pt = CastType<PointerType>(returnType);
     const char *funcName = NULL;
     if (pt != NULL)
         funcName = g->target.is32Bit ? "__pseudo_gather32_32" : 
@@ -2494,13 +2537,13 @@ FunctionEmitContext::gather(llvm::Value *ptr, const PointerType *ptrType,
         funcName = g->target.is32Bit ? "__pseudo_gather32_16" : 
             "__pseudo_gather64_16";
     else {
-        Assert(llvmReturnType == LLVMTypes::Int8VectorType);
+        AssertPos(currentPos, llvmReturnType == LLVMTypes::Int8VectorType);
         funcName = g->target.is32Bit ? "__pseudo_gather32_8" : 
             "__pseudo_gather64_8";
     }
 
     llvm::Function *gatherFunc = m->module->getFunction(funcName);
-    Assert(gatherFunc != NULL);
+    AssertPos(currentPos, gatherFunc != NULL);
 
     llvm::Value *call = CallInst(gatherFunc, NULL, ptr, mask, name);
 
@@ -2510,7 +2553,7 @@ FunctionEmitContext::gather(llvm::Value *ptr, const PointerType *ptrType,
     if (disableGSWarningCount == 0)
         addGSMetadata(call, currentPos);
 
-    return BitCastInst(call, llvmReturnType, "gather_bitcast");
+    return BitCastInst(call, llvmReturnType, LLVMGetName(call, "_gather_bitcast"));
 }
 
 
@@ -2547,11 +2590,11 @@ FunctionEmitContext::addGSMetadata(llvm::Value *v, SourcePos pos) {
 
 
 llvm::Value *
-FunctionEmitContext::AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, 
+FunctionEmitContext::AllocaInst(llvm::Type *llvmType, 
                                 const char *name, int align, 
                                 bool atEntryBlock) {
     if (llvmType == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
@@ -2560,7 +2603,7 @@ FunctionEmitContext::AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
         // We usually insert it right before the jump instruction at the
         // end of allocaBlock
         llvm::Instruction *retInst = allocaBlock->getTerminator();
-        Assert(retInst);
+        AssertPos(currentPos, retInst);
         inst = new llvm::AllocaInst(llvmType, name ? name : "", retInst);
     }
     else
@@ -2573,10 +2616,10 @@ FunctionEmitContext::AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
     // unlikely that this array will be loaded into varying variables with
     // what will be aligned accesses if the uniform -> varying load is done
     // in regular chunks.
-    LLVM_TYPE_CONST llvm::ArrayType *arrayType = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(llvmType);
+    llvm::ArrayType *arrayType = 
+        llvm::dyn_cast<llvm::ArrayType>(llvmType);
     if (align == 0 && arrayType != NULL && 
-        !llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType()))
+        !llvm::isa<llvm::VectorType>(arrayType->getElementType()))
         align = 4 * g->target.nativeVectorWidth;
 
     if (align != 0)
@@ -2596,16 +2639,15 @@ void
 FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
                                  const Type *ptrType, llvm::Value *mask) {
     if (value == NULL || ptr == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return;
     }
 
-    Assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
-    Assert(ptrType->IsUniformType());
+    AssertPos(currentPos, CastType<PointerType>(ptrType) != NULL);
+    AssertPos(currentPos, ptrType->IsUniformType());
 
     const Type *valueType = ptrType->GetBaseType();
-    const CollectionType *collectionType = 
-        dynamic_cast<const CollectionType *>(valueType);
+    const CollectionType *collectionType = CastType<CollectionType>(valueType);
     if (collectionType != NULL) {
         // Assigning a structure / array / vector. Handle each element
         // individually with what turns into a recursive call to
@@ -2623,23 +2665,31 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
 
     // We must have a regular atomic, enumerator, or pointer type at this
     // point.
-    Assert(Type::IsBasicType(valueType));
+    AssertPos(currentPos, Type::IsBasicType(valueType));
     valueType = valueType->GetAsNonConstType();
 
     // Figure out if we need a 8, 16, 32 or 64-bit masked store.
     llvm::Function *maskedStoreFunc = NULL;
 
-    const PointerType *pt = dynamic_cast<const PointerType *>(valueType);
+    const PointerType *pt = CastType<PointerType>(valueType);
     if (pt != NULL) {
         if (pt->IsSlice()) {
-            // For masked stores of (varying) slice pointers to memory, we
-            // grab the equivalent StructType and make a recursive call to
-            // maskedStore, giving it that type for the pointer type; that
-            // in turn will lead to the base pointer and offset index being
-            // mask stored to memory..
-            const StructType *sliceStructType = pt->GetSliceStructType();
-            ptrType = PointerType::GetUniform(sliceStructType);
-            maskedStore(value, ptr, ptrType, mask);
+            // Masked store of (varying) slice pointer.
+            AssertPos(currentPos, pt->IsVaryingType());
+                    
+            // First, extract the pointer from the slice struct and masked
+            // store that.
+            llvm::Value *v0 = ExtractInst(value, 0);
+            llvm::Value *p0 = AddElementOffset(ptr, 0, ptrType);
+            maskedStore(v0, p0, PointerType::GetUniform(pt->GetAsNonSlice()),
+                        mask);
+
+            // And then do the same for the integer offset
+            llvm::Value *v1 = ExtractInst(value, 1);
+            llvm::Value *p1 = AddElementOffset(ptr, 1, ptrType);
+            const Type *offsetType = AtomicType::VaryingInt32;
+            maskedStore(v1, p1, PointerType::GetUniform(offsetType), mask);
+
             return;
         }
 
@@ -2667,35 +2717,35 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
              Type::Equal(valueType, AtomicType::VaryingUInt64)) {
         maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64");
         ptr = BitCastInst(ptr, LLVMTypes::Int64VectorPointerType, 
-                             "ptr_to_int64vecptr");
+                          LLVMGetName(ptr, "_to_int64vecptr"));
         value = BitCastInst(value, LLVMTypes::Int64VectorType, 
-                             "value_to_int64");
+                            LLVMGetName(value, "_to_int64"));
     }
     else if (Type::Equal(valueType, AtomicType::VaryingFloat) ||
              Type::Equal(valueType, AtomicType::VaryingBool) ||
              Type::Equal(valueType, AtomicType::VaryingInt32) ||
              Type::Equal(valueType, AtomicType::VaryingUInt32) ||
-             dynamic_cast<const EnumType *>(valueType) != NULL) {
+             CastType<EnumType>(valueType) != NULL) {
         maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32");
         ptr = BitCastInst(ptr, LLVMTypes::Int32VectorPointerType, 
-                             "ptr_to_int32vecptr");
+                          LLVMGetName(ptr, "_to_int32vecptr"));
         if (Type::Equal(valueType, AtomicType::VaryingFloat))
             value = BitCastInst(value, LLVMTypes::Int32VectorType, 
-                                 "value_to_int32");
+                                LLVMGetName(value, "_to_int32"));
     }
     else if (Type::Equal(valueType, AtomicType::VaryingInt16) ||
              Type::Equal(valueType, AtomicType::VaryingUInt16)) {
         maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_16");
         ptr = BitCastInst(ptr, LLVMTypes::Int16VectorPointerType, 
-                             "ptr_to_int16vecptr");
+                          LLVMGetName(ptr, "_to_int16vecptr"));
     }
     else if (Type::Equal(valueType, AtomicType::VaryingInt8) ||
              Type::Equal(valueType, AtomicType::VaryingUInt8)) {
         maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_8");
         ptr = BitCastInst(ptr, LLVMTypes::Int8VectorPointerType, 
-                             "ptr_to_int8vecptr");
+                          LLVMGetName(ptr, "_to_int8vecptr"));
     }
-    Assert(maskedStoreFunc != NULL);
+    AssertPos(currentPos, maskedStoreFunc != NULL);
 
     std::vector<llvm::Value *> args;
     args.push_back(ptr);
@@ -2716,12 +2766,12 @@ void
 FunctionEmitContext::scatter(llvm::Value *value, llvm::Value *ptr, 
                              const Type *valueType, const Type *origPt,
                              llvm::Value *mask) {
-    const PointerType *ptrType = dynamic_cast<const PointerType *>(origPt);
-    Assert(ptrType != NULL);
-    Assert(ptrType->IsVaryingType());
+    const PointerType *ptrType = CastType<PointerType>(origPt);
+    AssertPos(currentPos, ptrType != NULL);
+    AssertPos(currentPos, ptrType->IsVaryingType());
 
     const CollectionType *srcCollectionType = 
-        dynamic_cast<const CollectionType *>(valueType);
+        CastType<CollectionType>(valueType);
     if (srcCollectionType != NULL) {
         // We're scattering a collection type--we need to keep track of the
         // source type (the type of the data values to be stored) and the
@@ -2732,8 +2782,8 @@ FunctionEmitContext::scatter(llvm::Value *value, llvm::Value *ptr,
         // same struct type, versus scattering into an array of varying
         // instances of the struct type, etc.
         const CollectionType *dstCollectionType =
-            dynamic_cast<const CollectionType *>(ptrType->GetBaseType());
-        Assert(dstCollectionType != NULL);
+            CastType<CollectionType>(ptrType->GetBaseType());
+        AssertPos(currentPos, dstCollectionType != NULL);
             
         // Scatter the collection elements individually
         for (int i = 0; i < srcCollectionType->GetElementCount(); ++i) {
@@ -2777,13 +2827,12 @@ FunctionEmitContext::scatter(llvm::Value *value, llvm::Value *ptr,
         ptr = lFinalSliceOffset(this, ptr, &ptrType);
     }
 
-    const PointerType *pt = dynamic_cast<const PointerType *>(valueType);
+    const PointerType *pt = CastType<PointerType>(valueType);
 
     // And everything should be a pointer or atomic from here on out...
-    Assert(pt != NULL || 
-           dynamic_cast<const AtomicType *>(valueType) != NULL);
+    AssertPos(currentPos, pt != NULL || CastType<AtomicType>(valueType) != NULL);
 
-    LLVM_TYPE_CONST llvm::Type *type = value->getType();
+    llvm::Type *type = value->getType();
     const char *funcName = NULL;
     if (pt != NULL)
         funcName = g->target.is32Bit ? "__pseudo_scatter32_32" :
@@ -2808,7 +2857,7 @@ FunctionEmitContext::scatter(llvm::Value *value, llvm::Value *ptr,
             "__pseudo_scatter64_8";
 
     llvm::Function *scatterFunc = m->module->getFunction(funcName);
-    Assert(scatterFunc != NULL);
+    AssertPos(currentPos, scatterFunc != NULL);
     
     AddInstrumentationPoint("scatter");
 
@@ -2827,7 +2876,7 @@ void
 FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
     if (value == NULL || ptr == NULL) {
         // may happen due to error elsewhere
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return;
     }
 
@@ -2852,16 +2901,16 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr,
                                const Type *ptrRefType) {
     if (value == NULL || ptr == NULL) {
         // may happen due to error elsewhere
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return;
     }
 
     const PointerType *ptrType;
-    if (dynamic_cast<const ReferenceType *>(ptrRefType) != NULL)
+    if (CastType<ReferenceType>(ptrRefType) != NULL)
         ptrType = PointerType::GetUniform(ptrRefType->GetReferenceTarget());
     else {
-        ptrType = dynamic_cast<const PointerType *>(ptrRefType);
-        Assert(ptrType != NULL);
+        ptrType = CastType<PointerType>(ptrRefType);
+        AssertPos(currentPos, ptrType != NULL);
     }
 
     // Figure out what kind of store we're doing here
@@ -2880,7 +2929,7 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr,
             maskedStore(value, ptr, ptrType, mask);
     }
     else {
-        Assert(ptrType->IsVaryingType());
+        AssertPos(currentPos, ptrType->IsVaryingType());
         // We have a varying ptr (an array of pointers), so it's time to
         // scatter
         scatter(value, ptr, valueType, ptrType, GetFullMask());
@@ -2894,10 +2943,10 @@ void
 FunctionEmitContext::storeUniformToSOA(llvm::Value *value, llvm::Value *ptr,
                                        llvm::Value *mask, const Type *valueType,
                                        const PointerType *ptrType) {
-    Assert(Type::EqualIgnoringConst(ptrType->GetBaseType()->GetAsUniformType(), 
+    AssertPos(currentPos, Type::EqualIgnoringConst(ptrType->GetBaseType()->GetAsUniformType(), 
                                     valueType));
 
-    const CollectionType *ct = dynamic_cast<const CollectionType *>(valueType);
+    const CollectionType *ct = CastType<CollectionType>(valueType);
     if (ct != NULL) {
         // Handle collections element wise...
         for (int i = 0; i < ct->GetElementCount(); ++i) {
@@ -2913,7 +2962,7 @@ FunctionEmitContext::storeUniformToSOA(llvm::Value *value, llvm::Value *ptr,
     else {
         // We're finally at a leaf SOA array; apply the slice offset and
         // then we can do a final regular store
-        Assert(Type::IsBasicType(valueType));
+        AssertPos(currentPos, Type::IsBasicType(valueType));
         ptr = lFinalSliceOffset(this, ptr, &ptrType);
         StoreInst(value, ptr);
     }
@@ -2926,7 +2975,7 @@ FunctionEmitContext::MemcpyInst(llvm::Value *dest, llvm::Value *src,
     dest = BitCastInst(dest, LLVMTypes::VoidPointerType);
     src = BitCastInst(src, LLVMTypes::VoidPointerType);
     if (count->getType() != LLVMTypes::Int64Type) {
-        Assert(count->getType() == LLVMTypes::Int32Type);
+        AssertPos(currentPos, count->getType() == LLVMTypes::Int32Type);
         count = ZExtInst(count, LLVMTypes::Int64Type, "count_to_64");
     }
     if (align == NULL)
@@ -2937,8 +2986,8 @@ FunctionEmitContext::MemcpyInst(llvm::Value *dest, llvm::Value *src,
                                        LLVMTypes::VoidType, LLVMTypes::VoidPointerType,
                                        LLVMTypes::VoidPointerType, LLVMTypes::Int64Type,
                                        LLVMTypes::Int32Type, LLVMTypes::BoolType, NULL);
-    Assert(mcFunc != NULL);
-    Assert(llvm::isa<llvm::Function>(mcFunc));
+    AssertPos(currentPos, mcFunc != NULL);
+    AssertPos(currentPos, llvm::isa<llvm::Function>(mcFunc));
 
     std::vector<llvm::Value *> args;
     args.push_back(dest);
@@ -2962,7 +3011,7 @@ FunctionEmitContext::BranchInst(llvm::BasicBlock *trueBlock,
                                 llvm::BasicBlock *falseBlock,
                                 llvm::Value *test) {
     if (test == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return;
     }
 
@@ -2975,17 +3024,21 @@ FunctionEmitContext::BranchInst(llvm::BasicBlock *trueBlock,
 llvm::Value *
 FunctionEmitContext::ExtractInst(llvm::Value *v, int elt, const char *name) {
     if (v == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
+    if (name == NULL) {
+        char buf[32];
+        sprintf(buf, "_extract_%d", elt);
+        name = LLVMGetName(v, buf);
+    }
+
     llvm::Instruction *ei = NULL;
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(v->getType()))
-        ei = llvm::ExtractElementInst::Create(v, LLVMInt32(elt), 
-                                              name ? name : "extract", bblock);
+    if (llvm::isa<llvm::VectorType>(v->getType()))
+        ei = llvm::ExtractElementInst::Create(v, LLVMInt32(elt), name, bblock);
     else
-        ei = llvm::ExtractValueInst::Create(v, elt, name ? name : "extract",
-                                            bblock);
+        ei = llvm::ExtractValueInst::Create(v, elt, name, bblock);
     AddDebugPos(ei);
     return ei;
 }
@@ -2995,29 +3048,31 @@ llvm::Value *
 FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, 
                                 const char *name) {
     if (v == NULL || eltVal == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
+    if (name == NULL) {
+        char buf[32];
+        sprintf(buf, "_insert_%d", elt);
+        name = LLVMGetName(v, buf);
+    }
+
     llvm::Instruction *ii = NULL;
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(v->getType()))
+    if (llvm::isa<llvm::VectorType>(v->getType()))
         ii = llvm::InsertElementInst::Create(v, eltVal, LLVMInt32(elt), 
-                                             name ? name : "insert", bblock);
+                                             name, bblock);
     else
-        ii = llvm::InsertValueInst::Create(v, eltVal, elt, 
-                                           name ? name : "insert", bblock);
+        ii = llvm::InsertValueInst::Create(v, eltVal, elt, name, bblock);
     AddDebugPos(ii);
     return ii;
 }
 
 
 llvm::PHINode *
-FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count, 
+FunctionEmitContext::PhiNode(llvm::Type *type, int count, 
                              const char *name) {
-    llvm::PHINode *pn = llvm::PHINode::Create(type, 
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-                                              count, 
-#endif // LLVM_3_0
+    llvm::PHINode *pn = llvm::PHINode::Create(type, count,
                                               name ? name : "phi", bblock);
     AddDebugPos(pn);
     return pn;
@@ -3028,13 +3083,15 @@ llvm::Instruction *
 FunctionEmitContext::SelectInst(llvm::Value *test, llvm::Value *val0,
                                 llvm::Value *val1, const char *name) {
     if (test == NULL || val0 == NULL || val1 == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
-    llvm::Instruction *inst = 
-        llvm::SelectInst::Create(test, val0, val1, name ? name : "select", 
-                                 bblock);
+    if (name == NULL)
+        name = LLVMGetName(test, "_select");
+
+    llvm::Instruction *inst = llvm::SelectInst::Create(test, val0, val1, name,
+                                                       bblock);
     AddDebugPos(inst);
     return inst;
 }
@@ -3045,18 +3102,18 @@ FunctionEmitContext::SelectInst(llvm::Value *test, llvm::Value *val0,
     function has. */
 static unsigned int
 lCalleeArgCount(llvm::Value *callee, const FunctionType *funcType) {
-    LLVM_TYPE_CONST llvm::FunctionType *ft = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::FunctionType>(callee->getType());
+    llvm::FunctionType *ft = 
+        llvm::dyn_cast<llvm::FunctionType>(callee->getType());
 
     if (ft == NULL) {
-        LLVM_TYPE_CONST llvm::PointerType *pt =
-            llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(callee->getType());
+        llvm::PointerType *pt =
+            llvm::dyn_cast<llvm::PointerType>(callee->getType());
         if (pt == NULL) {
             // varying--in this case, it must be the version of the
             // function that takes a mask
             return funcType->GetNumParameters() + 1;
         }
-        ft = llvm::dyn_cast<LLVM_TYPE_CONST llvm::FunctionType>(pt->getElementType());
+        ft = llvm::dyn_cast<llvm::FunctionType>(pt->getElementType());
     }
 
     Assert(ft != NULL);
@@ -3069,7 +3126,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
                               const std::vector<llvm::Value *> &args,
                               const char *name) {
     if (func == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
@@ -3078,22 +3135,16 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
     // isn't the case for things like intrinsics, builtins, and extern "C"
     // functions from the application.  Add the mask if it's needed.
     unsigned int calleeArgCount = lCalleeArgCount(func, funcType);
-    Assert(argVals.size() + 1 == calleeArgCount ||
+    AssertPos(currentPos, argVals.size() + 1 == calleeArgCount ||
            argVals.size() == calleeArgCount);
     if (argVals.size() + 1 == calleeArgCount)
         argVals.push_back(GetFullMask());
 
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(func->getType()) == false) {
+    if (llvm::isa<llvm::VectorType>(func->getType()) == false) {
         // Regular 'uniform' function call--just one function or function
         // pointer, so just emit the IR directly.
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
         llvm::Instruction *ci = 
             llvm::CallInst::Create(func, argVals, name ? name : "", bblock);
-#else
-        llvm::Instruction *ci = 
-            llvm::CallInst::Create(func, argVals.begin(), argVals.end(), 
-                                   name ? name : "", bblock);
-#endif
         AddDebugPos(ci);
         return ci;
     }
@@ -3117,7 +3168,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
         // First allocate memory to accumulate the various program
         // instances' return values...
         const Type *returnType = funcType->GetReturnType();
-        LLVM_TYPE_CONST llvm::Type *llvmReturnType = returnType->LLVMType(g->ctx);
+        llvm::Type *llvmReturnType = returnType->LLVMType(g->ctx);
         llvm::Value *resultPtr = NULL;
         if (llvmReturnType->isVoidTy() == false)
             resultPtr = AllocaInst(llvmReturnType);
@@ -3148,10 +3199,12 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
             // pointer to be called.
             llvm::Value *currentMask = LoadInst(maskPtr);
             llvm::Function *cttz = 
-                m->module->getFunction("__count_trailing_zeros_i32");
-            Assert(cttz != NULL);
-            llvm::Value *firstLane = CallInst(cttz, NULL, LaneMask(currentMask),
-                                              "first_lane");
+                m->module->getFunction("__count_trailing_zeros_i64");
+            AssertPos(currentPos, cttz != NULL);
+            llvm::Value *firstLane64 = CallInst(cttz, NULL, LaneMask(currentMask),
+                                                "first_lane64");
+            llvm::Value *firstLane = 
+                TruncInst(firstLane64, LLVMTypes::Int32Type, "first_lane32");
 
             // Get the pointer to the function we're going to call this
             // time through: ftpr = func[firstLane]
@@ -3184,9 +3237,9 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
 
             // bitcast the i32/64 function pointer to the actual function
             // pointer type (the variant that includes a mask).
-            LLVM_TYPE_CONST llvm::Type *llvmFuncType =
+            llvm::Type *llvmFuncType =
                 funcType->LLVMFunctionType(g->ctx, true);
-            LLVM_TYPE_CONST llvm::Type *llvmFPtrType = 
+            llvm::Type *llvmFPtrType = 
                 llvm::PointerType::get(llvmFuncType, 0);
             llvm::Value *fptrCast = IntToPtrInst(fptr, llvmFPtrType);
 
@@ -3195,13 +3248,14 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
 
             // Now, do a masked store into the memory allocated to
             // accumulate the result using the call mask.
-            if (callResult != NULL) {
-                Assert(resultPtr != NULL);
+            if (callResult != NULL && 
+                callResult->getType() != LLVMTypes::VoidType) {
+                AssertPos(currentPos, resultPtr != NULL);
                 StoreInst(callResult, resultPtr, callMask, returnType,
                           PointerType::GetUniform(returnType));
             }
             else
-                Assert(resultPtr == NULL);
+                AssertPos(currentPos, resultPtr == NULL);
 
             // Update the mask to turn off the program instances for which
             // we just called the function.
@@ -3261,7 +3315,7 @@ FunctionEmitContext::ReturnInst() {
         rinst = llvm::ReturnInst::Create(*g->ctx, retVal, bblock);
     }
     else {
-        Assert(Type::Equal(function->GetReturnType(), AtomicType::Void));
+        AssertPos(currentPos, Type::Equal(function->GetReturnType(), AtomicType::Void));
         rinst = llvm::ReturnInst::Create(*g->ctx, bblock);
     }
 
@@ -3276,25 +3330,25 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
                                 std::vector<llvm::Value *> &argVals,
                                 llvm::Value *launchCount) {
     if (callee == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(currentPos, m->errorCount > 0);
         return NULL;
     }
 
     launchedTasks = true;
 
-    Assert(llvm::isa<llvm::Function>(callee));
-    LLVM_TYPE_CONST llvm::Type *argType = 
+    AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
+    llvm::Type *argType = 
         (llvm::dyn_cast<llvm::Function>(callee))->arg_begin()->getType();
-    Assert(llvm::PointerType::classof(argType));
-    LLVM_TYPE_CONST llvm::PointerType *pt = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(argType);
-    Assert(llvm::StructType::classof(pt->getElementType()));
-    LLVM_TYPE_CONST llvm::StructType *argStructType = 
-        static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
-    Assert(argStructType->getNumElements() == argVals.size() + 1);
+    AssertPos(currentPos, llvm::PointerType::classof(argType));
+    llvm::PointerType *pt = 
+        llvm::dyn_cast<llvm::PointerType>(argType);
+    AssertPos(currentPos, llvm::StructType::classof(pt->getElementType()));
+    llvm::StructType *argStructType = 
+        static_cast<llvm::StructType *>(pt->getElementType());
+    AssertPos(currentPos, argStructType->getNumElements() == argVals.size() + 1);
 
     llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
-    Assert(falloc != NULL);
+    AssertPos(currentPos, falloc != NULL);
     llvm::Value *structSize = g->target.SizeOf(argStructType, bblock);
     if (structSize->getType() != LLVMTypes::Int64Type)
         // ISPCAlloc expects the size as an uint64_t, but on 32-bit
@@ -3329,7 +3383,7 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
     // argument block we just filled in
     llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
     llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
-    Assert(flaunch != NULL);
+    AssertPos(currentPos, flaunch != NULL);
     std::vector<llvm::Value *> args;
     args.push_back(launchGroupHandlePtr);
     args.push_back(fptr);
@@ -3377,8 +3431,8 @@ llvm::Value *
 FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr, 
                                                const Type *ptrType) {
     // This should only be called for varying pointers
-    const PointerType *pt = dynamic_cast<const PointerType *>(ptrType);
-    Assert(pt && pt->IsVaryingType());
+    const PointerType *pt = CastType<PointerType>(ptrType);
+    AssertPos(currentPos, pt && pt->IsVaryingType());
 
     const Type *baseType = ptrType->GetBaseType();
     if (Type::IsBasicType(baseType) == false)
@@ -3388,7 +3442,7 @@ FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,
         return ptr;
     
     // Find the size of a uniform element of the varying type
-    LLVM_TYPE_CONST llvm::Type *llvmBaseUniformType = 
+    llvm::Type *llvmBaseUniformType = 
         baseType->GetAsUniformType()->LLVMType(g->ctx);
     llvm::Value *unifSize = g->target.SizeOf(llvmBaseUniformType, bblock);
     unifSize = SmearUniform(unifSize);
@@ -3415,7 +3469,7 @@ FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,
 
 CFInfo *
 FunctionEmitContext::popCFState() {
-    Assert(controlFlowInfo.size() > 0);
+    AssertPos(currentPos, controlFlowInfo.size() > 0);
     CFInfo *ci = controlFlowInfo.back();
     controlFlowInfo.pop_back();
 
@@ -3439,7 +3493,7 @@ FunctionEmitContext::popCFState() {
         loopMask = ci->savedLoopMask;
     }
     else {
-        Assert(ci->IsIf());
+        AssertPos(currentPos, ci->IsIf());
         // nothing to do
     }
 
diff --git a/ctx.h b/ctx.h
index 0b1ccffa..10a22115 100644
--- a/ctx.h
+++ b/ctx.h
@@ -248,6 +248,10 @@ public:
         new basic block that it starts. */
     llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
 
+    /** Returns the names of all labels in the context, i.e. the key
+        set of labelMap. */
+    std::vector<std::string> GetLabels();
+
     /** Called to generate code for 'return' statement; value is the
         expression in the return statement (if non-NULL), and
         doCoherenceCheck indicates whether instructions should be generated
@@ -272,7 +276,7 @@ public:
     llvm::Value *None(llvm::Value *mask);
 
     /** Given a boolean mask value of type LLVMTypes::MaskType, return an
-        i32 value wherein the i'th bit is on if and only if the i'th lane
+        i64 value wherein the i'th bit is on if and only if the i'th lane
         of the mask is on. */
     llvm::Value *LaneMask(llvm::Value *mask);
 
@@ -338,7 +342,7 @@ public:
 
     /** Emits debugging information for the function parameter represented
         by sym.  */
-    void EmitFunctionParameterDebugInfo(Symbol *sym);
+    void EmitFunctionParameterDebugInfo(Symbol *sym, int parameterNum);
     /** @} */
 
     /** @name IR instruction emission
@@ -380,23 +384,23 @@ public:
         array, for pointer types). */
     llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
 
-    llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *BitCastInst(llvm::Value *value, llvm::Type *type,
                              const char *name = NULL);
     llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
-    llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *PtrToIntInst(llvm::Value *value, llvm::Type *type,
                               const char *name = NULL);
-    llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *IntToPtrInst(llvm::Value *value, llvm::Type *type,
                               const char *name = NULL);
 
-    llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Instruction *TruncInst(llvm::Value *value, llvm::Type *type,
                                  const char *name = NULL);
     llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
-                                LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
-    llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
+                                llvm::Type *type, const char *name = NULL);
+    llvm::Instruction *FPCastInst(llvm::Value *value, llvm::Type *type, 
                                   const char *name = NULL);
-    llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
+    llvm::Instruction *SExtInst(llvm::Value *value, llvm::Type *type, 
                                 const char *name = NULL);
-    llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
+    llvm::Instruction *ZExtInst(llvm::Value *value, llvm::Type *type, 
                                 const char *name = NULL);
 
     /** Given two integer-typed values (but possibly one vector and the
@@ -448,7 +452,7 @@ public:
         instruction is added at the start of the function in the entry
         basic block; if it should be added to the current basic block, then
         the atEntryBlock parameter should be false. */ 
-    llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, 
+    llvm::Value *AllocaInst(llvm::Type *llvmType, 
                             const char *name = NULL, int align = 0, 
                             bool atEntryBlock = true);
 
@@ -485,7 +489,7 @@ public:
     llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, 
                             const char *name = NULL);
 
-    llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count, 
+    llvm::PHINode *PhiNode(llvm::Type *type, int count, 
                            const char *name = NULL);
     llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
                                   llvm::Value *val1, const char *name = NULL);
@@ -632,12 +636,12 @@ private:
     std::vector<CFInfo *> controlFlowInfo;
 
     /** DIFile object corresponding to the source file where the current
-        function was defined (used for debugging info0. */
+        function was defined (used for debugging info). */
     llvm::DIFile diFile;
 
     /** DISubprogram corresponding to this function (used for debugging
         info). */
-    llvm::DISubprogram diFunction;
+    llvm::DISubprogram diSubprogram;
 
     /** These correspond to the current set of nested scopes in the
         function. */
diff --git a/decl.cpp b/decl.cpp
index f3eb701e..7cf2b5fb 100644
--- a/decl.cpp
+++ b/decl.cpp
@@ -33,7 +33,7 @@
 
 /** @file decl.cpp
     @brief Implementations of classes related to turning declarations into 
-           symbols and types.
+           symbol names and types.
 */
 
 #include "decl.h"
@@ -44,6 +44,7 @@
 #include "stmt.h"
 #include "expr.h"
 #include <stdio.h>
+#include <string.h>
 #include <set>
 
 static void
@@ -55,6 +56,7 @@ lPrintTypeQualifiers(int typeQualifiers) {
     if (typeQualifiers & TYPEQUAL_TASK)      printf("task ");
     if (typeQualifiers & TYPEQUAL_SIGNED)    printf("signed ");
     if (typeQualifiers & TYPEQUAL_UNSIGNED)  printf("unsigned ");
+    if (typeQualifiers & TYPEQUAL_EXPORT)    printf("export ");
 }
 
 
@@ -134,7 +136,7 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
     }
 
     if (vectorSize > 0) {
-        const AtomicType *atomicType = dynamic_cast<const AtomicType *>(retType);
+        const AtomicType *atomicType = CastType<AtomicType>(retType);
         if (atomicType == NULL) {
             Error(pos, "Only atomic types (int, float, ...) are legal for vector "
                   "types.");
@@ -146,7 +148,7 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
     retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
     
     if (soaWidth > 0) {
-        const StructType *st = dynamic_cast<const StructType *>(retType);
+        const StructType *st = CastType<StructType>(retType);
 
         if (st == NULL) {
             Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
@@ -188,7 +190,6 @@ lGetStorageClassName(StorageClass storageClass) {
     case SC_NONE:     return "";
     case SC_EXTERN:   return "extern";
     case SC_EXTERN_C: return "extern \"C\"";
-    case SC_EXPORT:   return "export";
     case SC_STATIC:   return "static";
     case SC_TYPEDEF:  return "typedef";
     default:          FATAL("Unhandled storage class in lGetStorageClassName");
@@ -217,50 +218,44 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p)
     : pos(p), kind(dk) { 
     child = NULL;
     typeQualifiers = 0;
+    storageClass = SC_NONE;
     arraySize = -1;
-    sym = NULL;
+    type = NULL;
     initExpr = NULL;
 }
 
 
 void
 Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
-    const Type *t = GetType(ds);
-    if (t == NULL) {
-        Assert(m->errorCount > 0);
+    const Type *baseType = ds->GetBaseType(pos);
+    InitFromType(baseType, ds);
+
+    if (type == NULL) {
+        AssertPos(pos, m->errorCount > 0);
         return;
     }
 
-    Symbol *sym = GetSymbol();
-    if (sym != NULL) {
-        sym->type = t;
-        sym->storageClass = ds->storageClass;
+    storageClass = ds->storageClass;
+
+    if (ds->declSpecList.size() > 0 && 
+        CastType<FunctionType>(type) == NULL) {
+        Error(pos, "__declspec specifiers for non-function type \"%s\" are "
+              "not used.", type->GetString().c_str());
     }
 }
 
 
-Symbol *
-Declarator::GetSymbol() const {
-    // The symbol lives at the last child in the chain, so walk down there
-    // and return the one there.
-    const Declarator *d = this;
-    while (d->child != NULL)
-        d = d->child;
-    return d->sym;
-}
-
-
 void
 Declarator::Print(int indent) const {
     printf("%*cdeclarator: [", indent, ' ');
     pos.Print();
 
     lPrintTypeQualifiers(typeQualifiers);
-    Symbol *sym = GetSymbol();
-    if (sym != NULL)
-        printf("%s", sym->name.c_str());
+    printf("%s ", lGetStorageClassName(storageClass));
+    if (name.size() > 0)
+        printf("%s", name.c_str());
     else
-        printf("(null symbol)");
+        printf("(unnamed)");
 
     printf(", array size = %d", arraySize);
 
@@ -294,66 +289,26 @@ Declarator::Print(int indent) const {
 }
 
 
-Symbol *
-Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
-    const FunctionType *type = 
-        dynamic_cast<const FunctionType *>(GetType(ds));
-    if (type == NULL)
-        return NULL;
-
-    Symbol *declSym = GetSymbol();
-    Assert(declSym != NULL);
-
-    // Get the symbol for the function from the symbol table.  (It should
-    // already have been added to the symbol table by AddGlobal() by the
-    // time we get here.)
-    Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
-    if (funSym == NULL)
-        // May be NULL due to error earlier in compilation
-        Assert(m->errorCount > 0);
-    else
-        funSym->pos = pos;
-
-    // Walk down to the declarator for the function.  (We have to get past
-    // the stuff that specifies the function's return type before we get to
-    // the function's declarator.)
-    Declarator *d = this;
-    while (d != NULL && d->kind != DK_FUNCTION)
-        d = d->child;
-    Assert(d != NULL);
-
-    for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
-        Symbol *sym = d->GetSymbolForFunctionParameter(i);
-        if (sym->type == NULL) {
-            Assert(m->errorCount > 0);
-            continue;
-        }
-        else
-            sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
-
-        funArgs->push_back(sym);
-    }
-
-    if (funSym != NULL)
-        funSym->type = funSym->type->ResolveUnboundVariability(Variability::Varying);
-
-    return funSym;
-}
-
-
-const Type *
-Declarator::GetType(const Type *base, DeclSpecs *ds) const {
+void
+Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
     bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
     bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
     bool isTask =         ((typeQualifiers & TYPEQUAL_TASK) != 0);
+    bool isExported =     ((typeQualifiers & TYPEQUAL_EXPORT) != 0);
     bool isConst =        ((typeQualifiers & TYPEQUAL_CONST) != 0);
 
     if (hasUniformQual && hasVaryingQual) {
         Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
-        return NULL;
+        return;
     }
-    if (kind != DK_FUNCTION && isTask)
+    if (kind != DK_FUNCTION && isTask) {
         Error(pos, "\"task\" qualifier illegal in variable declaration.");
+        return;
+    }
+    if (kind != DK_FUNCTION && isExported) {
+        Error(pos, "\"export\" qualifier illegal in variable declaration.");
+        return;
+    }
 
     Variability variability(Variability::Unbound);
     if (hasUniformQual)
@@ -361,91 +316,125 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
     else if (hasVaryingQual)
         variability = Variability::Varying;
 
-    const Type *type = base;
-    switch (kind) {
-    case DK_BASE:
+    if (kind == DK_BASE) {
         // All of the type qualifiers should be in the DeclSpecs for the
         // base declarator
-        Assert(typeQualifiers == 0);
-        Assert(child == NULL);
-        return type;
-
-    case DK_POINTER:
+        AssertPos(pos, typeQualifiers == 0);
+        AssertPos(pos, child == NULL);
+        type = baseType;
+    }
+    else if (kind == DK_POINTER) {
         /* For now, any pointer to an SOA type gets the slice property; if
            we add the capability to declare pointers as slices or not,
            we'll want to set this based on a type qualifier here. */
-        type = new PointerType(type, variability, isConst, type->IsSOAType());
-        if (child != NULL)
-            return child->GetType(type, ds);
+        const Type *ptrType = new PointerType(baseType, variability, isConst,
+                                              baseType->IsSOAType());
+        if (child != NULL) {
+            child->InitFromType(ptrType, ds);
+            type = child->type;
+            name = child->name;
+        }
         else
-            return type;
-        break;
-
-    case DK_REFERENCE:
-        if (hasUniformQual)
+            type = ptrType;
+    }
+    else if (kind == DK_REFERENCE) {
+        if (hasUniformQual) {
             Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
-        if (hasVaryingQual)
+            return;
+        }
+        if (hasVaryingQual) {
             Error(pos, "\"varying\" qualifier is illegal to apply to references.");
-        if (isConst)
+            return;
+        }
+        if (isConst) {
             Error(pos, "\"const\" qualifier is to illegal apply to references.");
-
+            return;
+        }
         // The parser should disallow this already, but double check.
-        if (dynamic_cast<const ReferenceType *>(type) != NULL) {
+        if (CastType<ReferenceType>(baseType) != NULL) {
             Error(pos, "References to references are illegal.");
-            return NULL;
+            return;
         }
 
-        type = new ReferenceType(type);
-        if (child != NULL)
-            return child->GetType(type, ds);
+        const Type *refType = new ReferenceType(baseType);
+        if (child != NULL) {
+            child->InitFromType(refType, ds);
+            type = child->type;
+            name = child->name;
+        }
         else
-            return type;
-        break;
-
-    case DK_ARRAY:
-        if (Type::Equal(type, AtomicType::Void)) {
+            type = refType;
+    }
+    else if (kind == DK_ARRAY) {
+        if (Type::Equal(baseType, AtomicType::Void)) {
             Error(pos, "Arrays of \"void\" type are illegal.");
-            return NULL;
+            return;
         }
-        if (dynamic_cast<const ReferenceType *>(type)) {
+        if (CastType<ReferenceType>(baseType)) {
             Error(pos, "Arrays of references (type \"%s\") are illegal.",
-                  type->GetString().c_str());
-            return NULL;
+                  baseType->GetString().c_str());
+            return;
         }
 
-        type = new ArrayType(type, arraySize);
-        if (child)
-            return child->GetType(type, ds);
+        const Type *arrayType = new ArrayType(baseType, arraySize);
+        if (child != NULL) {
+            child->InitFromType(arrayType, ds);
+            type = child->type;
+            name = child->name;
+        }
         else
-            return type;
-        break;
-
-    case DK_FUNCTION: {
-        std::vector<const Type *> args;
-        std::vector<std::string> argNames;
-        std::vector<ConstExpr *> argDefaults;
-        std::vector<SourcePos> argPos;
-
+            type = arrayType;
+    }
+    else if (kind == DK_FUNCTION) {
+        llvm::SmallVector<const Type *, 8> args;
+        llvm::SmallVector<std::string, 8> argNames;
+        llvm::SmallVector<Expr *, 8> argDefaults;
+        llvm::SmallVector<SourcePos, 8> argPos;
+        
         // Loop over the function arguments and store the names, types,
         // default values (if any), and source file positions each one in
         // the corresponding vector.
         for (unsigned int i = 0; i < functionParams.size(); ++i) {
             Declaration *d = functionParams[i];
 
-            Symbol *sym = GetSymbolForFunctionParameter(i);
-
-            if (d->declSpecs->storageClass != SC_NONE)
-                Error(sym->pos, "Storage class \"%s\" is illegal in "
-                      "function parameter declaration for parameter \"%s\".", 
-                      lGetStorageClassName(d->declSpecs->storageClass),
-                      sym->name.c_str());
-            if (Type::Equal(sym->type, AtomicType::Void)) {
-                Error(sym->pos, "Parameter with type \"void\" illegal in function "
-                      "parameter list.");
-                sym->type = NULL;
+            if (d == NULL) {
+                AssertPos(pos, m->errorCount > 0);
+                continue;
+            }
+            if (d->declarators.size() == 0) {
+                // function declaration like foo(float), w/o a name for the
+                // parameter; wire up a placeholder Declarator for it
+                d->declarators.push_back(new Declarator(DK_BASE, pos));
+                d->declarators[0]->InitFromDeclSpecs(d->declSpecs);
             }
 
-            const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
+            AssertPos(pos, d->declarators.size() == 1);
+            Declarator *decl = d->declarators[0];
+            if (decl == NULL || decl->type == NULL) {
+                AssertPos(pos, m->errorCount > 0);
+                continue;
+            }
+
+            if (decl->name == "") {
+                // Give a name to any anonymous parameter declarations
+                char buf[32];
+                sprintf(buf, "__anon_parameter_%u", i);
+                decl->name = buf;
+            }
+            decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
+
+            if (d->declSpecs->storageClass != SC_NONE)
+                Error(decl->pos, "Storage class \"%s\" is illegal in "
+                      "function parameter declaration for parameter \"%s\".", 
+                      lGetStorageClassName(d->declSpecs->storageClass),
+                      decl->name.c_str());
+            if (Type::Equal(decl->type, AtomicType::Void)) {
+                Error(decl->pos, "Parameter with type \"void\" illegal in function "
+                      "parameter list.");
+                decl->type = NULL;
+            }
+
+            const ArrayType *at = CastType<ArrayType>(decl->type);
             if (at != NULL) {
                 // As in C, arrays are passed to functions as pointers to
                 // their element type.  We'll just immediately make this
@@ -455,93 +444,94 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
                 // report this differently than it was originally declared
                 // in the function, but it's not clear that this is a
                 // significant problem.)
-                if (at->GetElementType() == NULL) {
-                    Assert(m->errorCount > 0);
-                    return NULL;
+                const Type *targetType = at->GetElementType();
+                if (targetType == NULL) {
+                    AssertPos(pos, m->errorCount > 0);
+                    return;
                 }
 
-                const Type *targetType = at->GetElementType();
-                targetType = 
-                    targetType->ResolveUnboundVariability(Variability::Varying);
-                sym->type = PointerType::GetUniform(targetType);
+                decl->type = PointerType::GetUniform(targetType);
 
                 // Make sure there are no unsized arrays (other than the
                 // first dimension) in function parameter lists.
-                at = dynamic_cast<const ArrayType *>(at->GetElementType());
+                at = CastType<ArrayType>(targetType);
                 while (at != NULL) {
                     if (at->GetElementCount() == 0)
-                        Error(sym->pos, "Arrays with unsized dimensions in "
+                        Error(decl->pos, "Arrays with unsized dimensions in "
                               "dimensions after the first one are illegal in "
                               "function parameter lists.");
-                    at = dynamic_cast<const ArrayType *>(at->GetElementType());
+                    at = CastType<ArrayType>(at->GetElementType());
                 }
             }
 
-            args.push_back(sym->type);
-            argNames.push_back(sym->name);
-            argPos.push_back(sym->pos);
+            args.push_back(decl->type);
+            argNames.push_back(decl->name);
+            argPos.push_back(decl->pos);
 
-            ConstExpr *init = NULL;
-            if (d->declarators.size()) {
-                // Try to find an initializer expression; if there is one,
-                // it lives down to the base declarator.
-                Declarator *decl = d->declarators[0];
-                while (decl->child != NULL) {
-                    Assert(decl->initExpr == NULL);
+            Expr *init = NULL;
+            // Walk the declarator chain looking for a default-value initializer.
+            while (decl != NULL) {
+                if (decl->initExpr != NULL) {
+                    decl->initExpr = TypeCheck(decl->initExpr);
+                    decl->initExpr = Optimize(decl->initExpr);
+                    if (decl->initExpr != NULL) {
+                        init = dynamic_cast<ConstExpr *>(decl->initExpr);
+                        if (init == NULL)
+                            init = dynamic_cast<NullPointerExpr *>(decl->initExpr);
+                        if (init == NULL)
+                            Error(decl->initExpr->pos, "Default value for parameter "
+                                  "\"%s\" must be a compile-time constant.", 
+                                  decl->name.c_str());
+                    }
+                    break;
+                }
+                else
                     decl = decl->child;
-                }
-
-                if (decl->initExpr != NULL &&
-                    (decl->initExpr = TypeCheck(decl->initExpr)) != NULL &&
-                    (decl->initExpr = Optimize(decl->initExpr)) != NULL &&
-                    (init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
-                    Error(decl->initExpr->pos, "Default value for parameter "
-                          "\"%s\" must be a compile-time constant.", 
-                          sym->name.c_str());
-                }
             }
             argDefaults.push_back(init);
         }
 
-        const Type *returnType = type;
+        const Type *returnType = baseType;
         if (returnType == NULL) {
             Error(pos, "No return type provided in function declaration.");
-            return NULL;
+            return;
         }
-        if (dynamic_cast<const FunctionType *>(returnType) != NULL) {
+
+        if (CastType<FunctionType>(returnType) != NULL) {
             Error(pos, "Illegal to return function type from function.");
-            return NULL;
+            return;
         }
         
-        bool isExported = ds && (ds->storageClass == SC_EXPORT);
+        returnType = returnType->ResolveUnboundVariability(Variability::Varying);
+
         bool isExternC =  ds && (ds->storageClass == SC_EXTERN_C);
+        bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
         bool isTask =     ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
 
         if (isExported && isTask) {
             Error(pos, "Function can't have both \"task\" and \"export\" "
                   "qualifiers");
-            return NULL;
+            return;
         }
         if (isExternC && isTask) {
             Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
                   "qualifiers");
-            return NULL;
+            return;
         }
         if (isExternC && isExported) {
             Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
                   "qualifiers");
-            return NULL;
+            return;
         }
 
         if (child == NULL) {
-            Assert(m->errorCount > 0);
-            return NULL;
+            AssertPos(pos, m->errorCount > 0);
+            return;
         }
 
         const FunctionType *functionType = 
             new FunctionType(returnType, args, argNames, argDefaults,
                              argPos, isTask, isExported, isExternC);
-        functionType = functionType->ResolveUnboundVariability(Variability::Varying);
 
         // handle any explicit __declspecs on the function
         if (ds != NULL) {
@@ -563,60 +553,12 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
             }
         }
 
-        return child->GetType(functionType, ds);
-    }
-    default:
-        FATAL("Unexpected decl kind");
-        return NULL;
+        child->InitFromType(functionType, ds);
+        type = child->type;
+        name = child->name;
     }
 }
 
-
-const Type *
-Declarator::GetType(DeclSpecs *ds) const {
-    const Type *baseType = ds->GetBaseType(pos);
-    const Type *type = GetType(baseType, ds);
-
-    if (ds->declSpecList.size() > 0 && 
-        type != NULL &&
-        dynamic_cast<const FunctionType *>(type) == NULL) {
-        Error(pos, "__declspec specifiers for non-function type \"%s\" are "
-              "not used.", type->GetString().c_str());
-    }
-
-    return type;
-}
-
-
-Symbol *
-Declarator::GetSymbolForFunctionParameter(int paramNum) const {
-    Assert(paramNum < (int)functionParams.size());
-    Declaration *d = functionParams[paramNum];
-
-    char buf[32];
-    Symbol *sym;
-    if (d->declarators.size() == 0) {
-        // function declaration like foo(float), w/o a name for
-        // the parameter
-        sprintf(buf, "__anon_parameter_%d", paramNum);
-        sym = new Symbol(buf, pos);
-        sym->type = d->declSpecs->GetBaseType(pos);
-    }
-    else {
-        Assert(d->declarators.size() == 1);
-        sym = d->declarators[0]->GetSymbol();
-        if (sym == NULL) {
-            // Handle more complex anonymous declarations like
-            // float (float **).
-            sprintf(buf, "__anon_parameter_%d", paramNum);
-            sym = new Symbol(buf, d->declarators[0]->pos);
-            sym->type = d->declarators[0]->GetType(d->declSpecs);
-        }
-    }
-    return sym;
-}
-
-
 ///////////////////////////////////////////////////////////////////////////
 // Declaration
 
@@ -646,27 +588,23 @@ Declaration::GetVariableDeclarations() const {
 
     for (unsigned int i = 0; i < declarators.size(); ++i) {
         Declarator *decl = declarators[i];
-        if (decl == NULL) {
+        if (decl == NULL || decl->type == NULL) {
             // Ignore earlier errors
             Assert(m->errorCount > 0);
             continue;
         }
 
-        Symbol *sym = decl->GetSymbol();
-        if (sym == NULL || sym->type == NULL) {
-            // Ignore errors
-            Assert(m->errorCount > 0);
-            continue;
-        }
-        sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
-
-        if (Type::Equal(sym->type, AtomicType::Void))
-            Error(sym->pos, "\"void\" type variable illegal in declaration.");
-        else if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
+        if (Type::Equal(decl->type, AtomicType::Void))
+            Error(decl->pos, "\"void\" type variable illegal in declaration.");
+        else if (CastType<FunctionType>(decl->type) == NULL) {
+            decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
+            Symbol *sym = new Symbol(decl->name, decl->pos, decl->type,
+                                     decl->storageClass);
             m->symbolTable->AddVariable(sym);
             vars.push_back(VariableDeclaration(sym, decl->initExpr));
         }
     }
+
     return vars;
 }
 
@@ -677,25 +615,19 @@ Declaration::DeclareFunctions() {
 
     for (unsigned int i = 0; i < declarators.size(); ++i) {
         Declarator *decl = declarators[i];
-        if (decl == NULL) {
+        if (decl == NULL || decl->type == NULL) {
             // Ignore earlier errors
             Assert(m->errorCount > 0);
             continue;
         }
 
-        Symbol *sym = decl->GetSymbol();
-        if (sym == NULL || sym->type == NULL) {
-            // Ignore errors
-            Assert(m->errorCount > 0);
-            continue;
-        }
-        sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
-
-        if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
+        const FunctionType *ftype = CastType<FunctionType>(decl->type);
+        if (ftype == NULL)
             continue;
 
         bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
-        m->AddFunctionDeclaration(sym, isInline);
+        m->AddFunctionDeclaration(decl->name, ftype, decl->storageClass,
+                                  isInline, decl->pos);
     }
 }
 
@@ -709,13 +641,14 @@ Declaration::Print(int indent) const {
         declarators[i]->Print(indent+4);
 }
 
+
 ///////////////////////////////////////////////////////////////////////////
 
 void
 GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
-                             std::vector<const Type *> *elementTypes,
-                             std::vector<std::string> *elementNames,
-                             std::vector<SourcePos> *elementPositions) {
+                             llvm::SmallVector<const Type *, 8> *elementTypes,
+                             llvm::SmallVector<std::string, 8> *elementNames,
+                             llvm::SmallVector<SourcePos, 8> *elementPositions) {
     std::set<std::string> seenNames;
     for (unsigned int i = 0; i < sd.size(); ++i) {
         const Type *type = sd[i]->type;
@@ -725,38 +658,41 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
         // FIXME: making this fake little DeclSpecs here is really
         // disgusting
         DeclSpecs ds(type);
-        if (type->IsUniformType()) 
-            ds.typeQualifiers |= TYPEQUAL_UNIFORM;
-        else if (type->IsVaryingType())
-            ds.typeQualifiers |= TYPEQUAL_VARYING;
+        if (Type::Equal(type, AtomicType::Void) == false) {
+            if (type->IsUniformType()) 
+                ds.typeQualifiers |= TYPEQUAL_UNIFORM;
+            else if (type->IsVaryingType())
+                ds.typeQualifiers |= TYPEQUAL_VARYING;
+            else if (type->GetSOAWidth() != 0)
+                ds.soaWidth = type->GetSOAWidth();
+            // FIXME: ds.vectorSize?
+        }
 
         for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
             Declarator *d = (*sd[i]->declarators)[j];
             d->InitFromDeclSpecs(&ds);
 
-            Symbol *sym = d->GetSymbol();
-
-            if (Type::Equal(sym->type, AtomicType::Void))
+            if (Type::Equal(d->type, AtomicType::Void))
                 Error(d->pos, "\"void\" type illegal for struct member.");
 
-            const ArrayType *arrayType = 
-                dynamic_cast<const ArrayType *>(sym->type);
-            if (arrayType != NULL && arrayType->GetElementCount() == 0) {
-                Error(d->pos, "Unsized arrays aren't allowed in struct "
-                      "definitions.");
-                elementTypes->push_back(NULL);
-            }
-            else
-                elementTypes->push_back(sym->type);
+            elementTypes->push_back(d->type);
 
-            if (seenNames.find(sym->name) != seenNames.end())
+            if (seenNames.find(d->name) != seenNames.end())
                 Error(d->pos, "Struct member \"%s\" has same name as a "
-                      "previously-declared member.", sym->name.c_str());
+                      "previously-declared member.", d->name.c_str());
             else
-                seenNames.insert(sym->name);
+                seenNames.insert(d->name);
 
-            elementNames->push_back(sym->name);
-            elementPositions->push_back(sym->pos);
+            elementNames->push_back(d->name);
+            elementPositions->push_back(d->pos);
         }
     }
+
+    for (int i = 0; i < (int)elementTypes->size() - 1; ++i) {
+        const ArrayType *arrayType = CastType<ArrayType>((*elementTypes)[i]);
+
+        if (arrayType != NULL && arrayType->GetElementCount() == 0)
+            Error((*elementPositions)[i], "Unsized arrays aren't allowed except "
+                  "for the last member in a struct definition.");
+    }
 }
diff --git a/decl.h b/decl.h
index 0bae20b8..f8b5f3d4 100644
--- a/decl.h
+++ b/decl.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -47,30 +47,21 @@
     variables--here, that the declaration has the 'static' and 'uniform'
     qualifiers, and that it's basic type is 'int'.  Then for each variable
     declaration, the Declaraiton class holds an instance of a Declarator,
-    which in turn records the per-variable information like the symbol
-    name, array size (if any), initializer expression, etc.
+    which in turn records the per-variable information like the name, array
+    size (if any), initializer expression, etc.  
 */
 
 #ifndef ISPC_DECL_H
 #define ISPC_DECL_H
 
 #include "ispc.h"
+#include <llvm/ADT/SmallVector.h>
 
 struct VariableDeclaration;
 
 class Declaration;
 class Declarator;
 
-enum StorageClass {
-    SC_NONE,
-    SC_EXTERN,
-    SC_EXPORT,
-    SC_STATIC,
-    SC_TYPEDEF,
-    SC_EXTERN_C
-};
-
-
 /* Multiple qualifiers can be provided with types in declarations;
    therefore, they are set up so that they can be ANDed together into an
    int. */
@@ -82,6 +73,7 @@ enum StorageClass {
 #define TYPEQUAL_SIGNED     (1<<4)
 #define TYPEQUAL_UNSIGNED   (1<<5)
 #define TYPEQUAL_INLINE     (1<<6)
+#define TYPEQUAL_EXPORT     (1<<7)
 
 /** @brief Representation of the declaration specifiers in a declaration.
 
@@ -141,25 +133,11 @@ public:
     Declarator(DeclaratorKind dk, SourcePos p);
 
     /** Once a DeclSpecs instance is available, this method completes the
-        initialization of the Symbol, setting its Type accordingly.
+        initialization of the type member.
      */
     void InitFromDeclSpecs(DeclSpecs *ds);
 
-    /** Get the actual type of the combination of Declarator and the given
-        DeclSpecs.  If an explicit base type is provided, the declarator is
-        applied to that type; otherwise the base type from the DeclSpecs is
-        used. */
-    const Type *GetType(DeclSpecs *ds) const;
-    const Type *GetType(const Type *base, DeclSpecs *ds) const;
-
-    /** Returns the symbol corresponding to the function declared by this
-        declarator and symbols for its arguments in *args. */
-    Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
-
-    Symbol *GetSymbolForFunctionParameter(int paramNum) const;
-
-    /** Returns the symbol associated with the declarator. */
-    Symbol *GetSymbol() const;
+    void InitFromType(const Type *base, DeclSpecs *ds);
 
     void Print(int indent) const;
 
@@ -180,18 +158,24 @@ public:
     /** Type qualifiers provided with the declarator. */
     int typeQualifiers;
 
+    StorageClass storageClass;
+
     /** For array declarators, this gives the declared size of the array.
         Unsized arrays have arraySize == 0. */ 
     int arraySize;
 
-    /** Symbol associated with the declarator. */
-    Symbol *sym;
+    /** Name associated with the declarator. */
+    std::string name;
 
     /** Initialization expression for the variable.  May be NULL. */
     Expr *initExpr;
 
+    /** Type of the declarator.  This is NULL until InitFromDeclSpecs() or
+        InitFromType() is called. */
+    const Type *type;
+
     /** For function declarations, this holds the Declaration *s for the
-        funciton's parameters. */
+        function's parameters. */
     std::vector<Declaration *> functionParams;
 };
 
@@ -236,8 +220,8 @@ struct StructDeclaration {
 /** Given a set of StructDeclaration instances, this returns the types of
     the elements of the corresponding struct and their names. */
 extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
-                                         std::vector<const Type *> *elementTypes,
-                                         std::vector<std::string> *elementNames,
-                                         std::vector<SourcePos> *elementPositions);
+                                         llvm::SmallVector<const Type *, 8> *elementTypes,
+                                         llvm::SmallVector<std::string, 8> *elementNames,
+                                         llvm::SmallVector<SourcePos, 8> *elementPositions);
 
 #endif // ISPC_DECL_H
diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index 62f46289..e381c017 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,81 @@
+=== v1.2.2 === (20 April 2012)
+
+This release includes a number of small additions to functionality and a
+number of bugfixes.  New functionality includes:
+
+* It's now possible to forward declare structures as in C/C++: "struct
+  Foo;".  After such a declaration, structs with pointers to "Foo" and
+  functions that take pointers or references to Foo structs can be declared
+  without the entire definition of Foo being available.
+
+* New built-in types size_t, ptrdiff_t, and [u]intptr_t are now available,
+  corresponding to the equivalent types in C.
+
+* The standard library now provides atomic_swap*() and
+  atomic_compare_exchange*() functions for void * types.
+
+* The C++ backend has seen a number of improvements to the quality and
+  readability of generated code.
+
+A number of bugs have been fixed in this release as well.  The most
+significant are:
+
+* Fixed a bug where nested loops could cause a compiler crash in some
+  circumstances (issues #240, and #229)
+
+* Gathers could access invalid memory (and cause the program to crash) in
+  some circumstances (#235)
+
+* References to temporary values are now handled properly when passed to a
+  function that takes a reference typed parameter.
+
+* A case where incorrect code could be generated for compile-time-constant
+  initializers has been fixed (#234).
+
+=== v1.2.1 === (6 April 2012)
+
+This release contains only minor new functionality and is mostly for many
+small bugfixes and improvements to error handling and error reporting.
+The new functionality that is present is:
+
+* Significantly more efficient versions of the float / half conversion
+  routines are now available in the standard library, thanks to Fabian
+  Giesen.
+
+* The last member of a struct can now be a zero-length array; this allows
+  the trick of dynamically allocating enough storage for the struct and
+  some number of array elements at the end of it.
+
+Significant bugs fixed include:
+
+* Issue #205: When a target ISA isn't specified, use the host system's
+  capabilities to choose a target for which it will be able to run the
+  generated code.
+
+* Issues #215 and #217: Don't allocate storage for global variables that
+  are declared "extern".
+
+* Issue #197: Allow NULL as a default argument value in a function
+  declaration.
+
+* Issue #223: Fix bugs where taking the address of a function wouldn't work
+  as expected.
+
+* Issue #224: When there are overloaded variants of a function that take
+  both reference and const reference parameters, give the non-const
+  reference preference when matching values of that underlying type.
+
+* Issue #225: An error is issued when a varying lvalue is assigned to a
+  reference type (rather than crashing).
+
+* Issue #193: Permit conversions from array types to void *, not just the
+  pointer type of the underlying array element.
+
+* Issue #199: Still evaluate expressions that are cast to (void).
+
+The documentation has also been improved, with FAQs added to clarify some
+aspects of the ispc pointer model.
+
 === v1.2.0 === (20 March 2012)
 
 This is a major new release of ispc, with a number of significant
diff --git a/docs/faq.rst b/docs/faq.rst
index 2cdca136..ff959085 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -14,12 +14,19 @@ distribution.
   + `Why are there multiple versions of exported ispc functions in the assembly output?`_
   + `How can I more easily see gathers and scatters in generated assembly?`_
 
+* Language Details
+
+  + `What is the difference between "int *foo" and "int foo[]"?`_
+  + `Why are pointed-to types "uniform" by default?`_
+  + `What am I getting an error about assigning a varying lvalue to a reference type?`_ 
+  
 * Interoperability
 
   + `How can I supply an initial execution mask in the call from the application?`_
   + `How can I generate a single binary executable with support for multiple instruction sets?`_
   + `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
   + `Is it possible to inline ispc functions in C/C++ code?`_
+  + `Why is it illegal to pass "varying" values from C/C++ to ispc functions?`_ 
 
 * Programming Techniques
 
@@ -27,6 +34,7 @@ distribution.
   + `How can a gang of program instances generate variable amounts of output efficiently?`_
   + `Is it possible to use ispc for explicit vector programming?`_
   + `How can I debug my ispc programs using Valgrind?`_
+  + `foreach statements generate more complex assembly than I'd expect; what's going on?`_
 
 Understanding ispc's Output
 ===========================
@@ -213,6 +221,125 @@ easier to understand:
             jmp        ___pseudo_scatter_base_offsets32_32 ## TAILCALL
 
 
+Language Details
+================
+
+What is the difference between "int \*foo" and "int foo[]"?
+-----------------------------------------------------------
+
+In C and C++, declaring a function to take a parameter ``int *foo`` or
+``int foo[]`` results in the same type for the parameter.  Both are
+pointers to integers.  In ``ispc``, these are different types.  The first
+one is a varying pointer to a uniform integer value in memory, while the
+second results in a uniform pointer to the start of an array of varying
+integer values in memory.
+
+To understand why the first is a varying pointer to a uniform integer,
+first recall that types without explicit rate qualifiers (``uniform``,
+``varying``, or ``soa<>``) are ``varying`` by default.  Second, recall from
+the `discussion of pointer types in the ispc User's Guide`_ that pointed-to
+types without rate qualifiers are ``uniform`` by default.  (This second
+rule is discussed further below, in `Why are pointed-to types "uniform" by
+default?`_.)  The type of ``int *foo`` follows from these.
+
+.. _discussion of pointer types in the ispc User's Guide: ispc.html#pointer-types 
+
+Conversely, in a function body, ``int foo[10]`` represents a declaration of
+a 10-element array of varying ``int`` values.  In that we'd certainly like
+to be able to pass such an array to a function that takes a ``int []``
+parameter, the natural type for an ``int []`` parameter is a uniform
+pointer to varying integer values.
+
+In terms of compatibility with C/C++, it's unfortunate that this
+distinction exists, though any other set of rules seems to introduce more
+awkwardness than this one.  (Though we're interested to hear ideas to
+improve these rules!).
+
+Why are pointed-to types "uniform" by default?
+----------------------------------------------
+
+In ``ispc``, types without rate qualifiers are "varying" by default, but
+types pointed to by pointers without rate qualifiers are "uniform" by
+default.  Why this difference?
+
+::
+
+    int foo;  // no rate qualifier, "varying int".
+    uniform int *foo;  // pointer type has no rate qualifier, pointed-to does.
+                       // "varying pointer to uniform int".
+    int *foo;  // neither pointer type nor pointed-to type ("int") have
+               // rate qualifiers. Pointer type is varying by default,
+               // pointed-to is uniform. "varying pointer to uniform int".
+    varying int *foo;   // varying pointer to varying int
+
+The first rule, having types without rate qualifiers be varying by default,
+is a default that keeps the number of "uniform" or "varying" qualifiers in
+``ispc`` programs low.  Most ``ispc`` programs use mostly "varying"
+variables, so this rule allows most variables to be declared without also
+requiring rate qualifiers.
+
+On a related note, this rule allows many C/C++ functions to be used to
+define equivalent functions in the SPMD execution model that ``ispc``
+provides with little or no modification:
+
+::
+
+    // scalar add in C/C++, SPMD/vector add in ispc
+    int add(int a, int b) { return a + b; }
+
+This motivation also explains why ``uniform int *foo`` represents a varying
+pointer; having pointers be varying by default if they don't have rate
+qualifiers similarly helps with porting code from C/C++ to ``ispc``.
+
+The trickier issue is why pointed-to types are "uniform" by default.  In our
+experience, data in memory that is accessed via pointers is most often
+uniform; this generally includes all data that has been allocated and
+initialized by the C/C++ application code. In practice, "varying" types are
+more generally (but not exclusively) used for local data in ``ispc``
+functions.  Thus, making the pointed-to type uniform by default leads to
+more concise code for the most common cases.
+
+
+What am I getting an error about assigning a varying lvalue to a reference type?
+--------------------------------------------------------------------------------
+
+Given code like the following:
+
+::
+
+    uniform float a[...];
+    int index = ...;
+    float &r = a[index];
+
+``ispc`` issues the error "Initializer for reference-type variable "r" must
+have a uniform lvalue type.".  The underlying issue stems from how
+references are represented in the code generated by ``ispc``.  Recall that
+``ispc`` supports both uniform and varying pointer types--a uniform pointer
+points to the same location in memory for all program instances in the
+gang, while a varying pointer allows each program instance to have its own
+pointer value.
+
+References are represented as pointers in the code generated by ``ispc``,
+though this is generally opaque to the user; in ``ispc``, they are
+specifically uniform pointers.  This design decision was made so that given
+code like this:
+
+::
+
+    extern void func(float &val);
+    float foo = ...;
+    func(foo);
+
+Then the reference would be handled efficiently as a single pointer, rather
+than unnecessarily being turned into a gang-size of pointers.
+
+However, an implication of this decision is that it's not possible for
+references to refer to completely different things for each of the program
+instances.  (And hence the error that is issued).  In cases where a unique
+per-program-instance pointer is needed, a varying pointer should be used
+instead of a reference.
+
+
 Interoperability
 ================
 
@@ -391,6 +518,48 @@ linking your applicaiton.
 ``-mattr=+avx`` flag to ``llc``.)
     
 
+Why is it illegal to pass "varying" values from C/C++ to ispc functions?
+------------------------------------------------------------------------
+
+If any of the types in the parameter list to an exported function is
+"varying" (including recursively, and members of structure types, etc.),
+then ``ispc`` will issue an error and refuse to compile the function:
+
+::
+
+    % echo "export int add(int x) { return ++x; }" | ispc
+    <stdin>:1:12: Error: Illegal to return a "varying" type from exported function "add" 
+    <stdin>:1:20: Error: Varying parameter "x" is illegal in an exported function. 
+
+While there's no fundamental reason why this isn't possible, recall the
+definition of "varying" variables: they have one value for each program
+instance in the gang.  As such, the number of values and amount of storage
+required to represent a varying variable depends on the gang size
+(i.e. ``programCount``), which can have different values depending on the
+compilation target.
+
+``ispc`` therefore prohibits passing "varying" values between the
+application and the ``ispc`` program.  This prevents application-side
+code from depending on a particular gang size and so encourages
+portability to different gang sizes.  (A generally desirable
+programming practice.)
+
+For cases where the size of data is actually fixed from the application
+side, the value can be passed via a pointer to a short ``uniform`` array,
+as follows:
+
+::
+
+    export void add4(uniform int ptr[4]) {
+        foreach (i = 0 ... 4)
+            ptr[i]++;
+    }
+
+On the 4-wide SSE instruction set, this compiles to a single vector add
+instruction (and associated move instructions), while it still also
+efficiently computes the correct result on 8-wide AVX targets.
+
+
 Programming Techniques
 ======================
 
@@ -525,3 +694,79 @@ you can use ``--target=sse4`` when compiling to run with ``valgrind``.
 Note that ``valgrind`` does not yet support programs that use the AVX
 instruction set.
 
+foreach statements generate more complex assembly than I'd expect; what's going on?
+-----------------------------------------------------------------------------------
+
+Given a simple ``foreach`` loop like the following:
+
+::
+
+    void foo(uniform float a[], uniform int count) {
+        foreach (i = 0 ... count)
+            a[i] *= 2;
+    }
+
+
+the ``ispc`` compiler generates approximately 40 instructions--why isn't
+the generated code simpler?
+
+There are two main components to the code: one handles
+``programCount``-sized chunks of elements of the array, and the other
+handles any excess elements at the end of the array that don't completely
+fill a gang.  The code for the main loop is essentially what one would
+expect: a vector of values is loaded from the array, the multiply is done,
+and the result is stored.
+
+::
+
+    LBB0_2:                                 ## %foreach_full_body
+	movslq	%edx, %rdx
+	vmovups	(%rdi,%rdx), %ymm1
+	vmulps	%ymm0, %ymm1, %ymm1
+	vmovups	%ymm1, (%rdi,%rdx)
+	addl	$32, %edx
+	addl	$8, %eax
+	cmpl	%ecx, %eax
+	jl	LBB0_2
+
+
+Then, there is a sequence of instructions that handles any additional
+elements at the end of the array.  (These instructions don't execute if
+there aren't any left-over values to process, but they do lengthen the
+amount of generated code.)
+
+::
+
+  ## BB#4:                                ## %partial_inner_only
+	vmovd	%eax, %xmm0
+	vinsertf128	$1, %xmm0, %ymm0, %ymm0
+	vpermilps	$0, %ymm0, %ymm0 ## ymm0 = ymm0[0,0,0,0,4,4,4,4]
+	vextractf128	$1, %ymm0, %xmm3
+	vmovd	%esi, %xmm2
+	vmovaps	LCPI0_1(%rip), %ymm1
+	vextractf128	$1, %ymm1, %xmm4
+	vpaddd	%xmm4, %xmm3, %xmm3
+        # ....
+	vmulps	LCPI0_0(%rip), %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm0, (%rdi,%rax)
+
+
+If you know that the number of elements to be processed will always be an
+exact multiple of 8, 16, etc., then adding a simple assignment to
+``count`` like the one below gives the compiler enough information to be
+able to eliminate the code for the additional array elements.
+
+::
+
+    void foo(uniform float a[], uniform int count) {
+        // This assignment doesn't change the value of count
+        // if it's a multiple of 16, but it gives the compiler
+        // insight into this fact, allowing for simpler code to
+        // be generated for the foreach loop.
+        count = (count & ~(16-1));
+        foreach (i = 0 ... count)
+            a[i] *= 2;
+    }
+
+With this new version of ``foo()``, only the code for the first loop above
+is generated.
diff --git a/docs/ispc.rst b/docs/ispc.rst
index 4be80a18..5d26c93a 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -121,10 +121,14 @@ Contents:
 
 * `The ISPC Standard Library`_
 
+  + `Basic Operations On Data`_
+
+    * `Logical and Selection Operations`_
+    * `Bit Operations`_
+
   + `Math Functions`_
 
     * `Basic Math Functions`_
-    * `Bit-Level Operations`_
     * `Transcendental Functions`_
     * `Pseudo-Random Numbers`_
 
@@ -143,6 +147,7 @@ Contents:
 
     * `Converting Between Array-of-Structures and Structure-of-Arrays Layout`_
     * `Conversions To and From Half-Precision Floats`_
+    * `Converting to sRGB8`_
 
   + `Systems Programming Support`_
 
@@ -538,7 +543,7 @@ preprocessor runs:
   * - ISPC
     - 1
     - Detecting that the ``ispc`` compiler is processing the file
-  * - ISPC_TARGET_{SSE2,SSE4,AVX}
+  * - ISPC_TARGET_{SSE2,SSE4,AVX,AVX2}
     - 1
     - One of these will be set, depending on the compilation target.
   * - ISPC_POINTER_SIZE
@@ -1390,8 +1395,8 @@ Types
 Basic Types and Type Qualifiers
 -------------------------------
 
-``ispc`` is a statically-typed language.  It supports a variety of basic
-types.
+``ispc`` is a statically-typed language.  It supports a variety of core
+basic types:
 
 * ``void``: "empty" type representing no value.
 * ``bool``: boolean value; may be assigned ``true``, ``false``, or the
@@ -1408,6 +1413,15 @@ types.
 * ``unsigned int64``: 64-bit unsigned integer.
 * ``double``: 64-bit double-precision floating point value.
 
+There are also a few built-in types related to pointers and memory:
+
+* ``size_t``: integer type large enough to hold the size of any object (structure or array)
+* ``ptrdiff_t``: an integer type large enough to represent the difference
+  between two pointers
+* ``intptr_t``: signed integer type that is large enough to represent
+  a pointer value
+* ``uintptr_t``: unsigned integer type large enough to represent a pointer
+
 Implicit type conversion between values of different types is done
 automatically by the ``ispc`` compiler.  Thus, a value of ``float`` type
 can be assigned to a variable of ``int`` type directly.  In binary
@@ -1492,13 +1506,17 @@ Defining New Names For Types
 The ``typedef`` keyword can be used to name types:
 
 ::
+ 
+    typedef int64 BigInt;
+    typedef float Float3[3];
 
-  typedef Float3 float[3];
+Following C's syntax, the code above defines ``BigInt`` to have ``int64``
+type and ``Float3`` to have ``float[3]`` type.
 
-``typedef`` doesn't create a new type: it just provides an alternative name
-for an existing type.  Thus, in the above example, it is legal to pass a
-value with ``float[3]`` type to a function that has been declared to take a
-``Float3`` parameter.
+Also as in C, ``typedef`` doesn't create a new type: it just provides an
+alternative name for an existing type.  Thus, in the above example, it is
+legal to pass a value with ``float[3]`` type to a function that has been
+declared to take a ``Float3`` parameter.
 
 
 Pointer Types
@@ -2150,6 +2168,12 @@ greater than or equal to ``NUM_ITEMS``.
         // ...
     }
 
+Short-circuiting may impose some overhead in the generated code; for cases
+where short-circuiting is undesirable due to performance impact, see
+the section `Logical and Selection Operations`_, which introduces helper
+functions in the standard library that provide these operations without
+short-circuiting.
+
 
 Dynamic Memory Allocation
 -------------------------
@@ -2827,6 +2851,123 @@ The ISPC Standard Library
 compiling ``ispc`` programs.  (To disable the standard library, pass the
 ``--nostdlib`` command-line flag to the compiler.)
 
+Basic Operations On Data
+------------------------
+
+Logical and Selection Operations
+--------------------------------
+
+Recall from `Expressions`_ that ``ispc`` short-circuits the evaluation of
+logical and selection operators: given an expression like ``(index < count
+&& array[index] == 0)``, then ``array[index] == 0`` is only evaluated if
+``index < count`` is true.  This property is useful for writing expressions
+like the preceding one, where the second expression may not be safe to
+evaluate in some cases.
+
+This short-circuiting can impose overhead in the generated code; additional
+operations are required to test the first value and to conditionally jump
+over the code that evaluates the second value.  The ``ispc`` compiler does
+try to mitigate this cost by detecting cases where it is both safe and
+inexpensive to evaluate both expressions, and skips short-circuiting in the
+generated code in this case (without there being any programmer-visible
+change in program behavior.)
+
+For cases where the compiler can't detect this case but the programmer
+wants to avoid short-circuiting behavior, the standard library provides a
+few helper functions.  First, ``and()`` and ``or()`` provide
+non-short-circuiting logical AND and OR operations.
+
+::
+
+    bool and(bool a, bool b)
+    bool or(bool a, bool b)
+    uniform bool and(uniform bool a, uniform bool b)
+    uniform bool or(uniform bool a, uniform bool b)
+
+And there are three variants of ``select()`` that select between two values
+based on a boolean condition.  These are the variants of ``select()`` for
+the ``int8`` type:
+
+::
+
+    int8 select(bool v, int8 a, int8 b)
+    int8 select(uniform bool v, int8 a, int8 b)
+    uniform int8 select(uniform bool v, uniform int8 a, uniform int8 b)
+
+There are also variants for ``int16``, ``int32``, ``int64``, ``float``, and
+``double`` types.
+
+Bit Operations
+--------------
+
+The various variants of ``popcnt()`` return the population count--the
+number of bits set in the given value.
+
+::
+
+    uniform int popcnt(uniform int v)
+    int popcnt(int v)
+    uniform int popcnt(bool v)
+
+
+A few functions determine how many leading bits in the given value are zero
+and how many of the trailing bits are zero; there are also ``unsigned``
+variants of these functions and variants that take ``int64`` and ``unsigned
+int64`` types.
+
+::
+
+    int32 count_leading_zeros(int32 v)
+    uniform int32 count_leading_zeros(uniform int32 v)
+    int32 count_trailing_zeros(int32 v)
+    uniform int32 count_trailing_zeros(uniform int32 v)
+
+Sometimes it's useful to convert a ``bool`` value to an integer using sign
+extension so that the integer's bits are all on if the ``bool`` has the
+value ``true`` (rather than just having the value one).  The
+``sign_extend()`` functions provide this functionality:
+
+::
+
+    int sign_extend(bool value) 
+    uniform int sign_extend(uniform bool value) 
+
+The ``intbits()`` and ``floatbits()`` functions can be used to implement
+low-level floating-point bit twiddling.  For example, ``intbits()`` returns
+an ``unsigned int`` that is a bit-for-bit copy of the given ``float``
+value.  (Note: it is **not** the same as ``(int)a``, but corresponds to
+something like ``*((int *)&a)`` in C.)
+
+::
+
+    float floatbits(unsigned int a);
+    uniform float floatbits(uniform unsigned int a);
+    unsigned int intbits(float a);
+    uniform unsigned int intbits(uniform float a);
+
+
+The ``intbits()`` and ``floatbits()`` functions have no cost at runtime;
+they just let the compiler know how to interpret the bits of the given
+value.  They make it possible to efficiently write functions that take
+advantage of the low-level bit representation of floating-point values.
+
+For example, the ``abs()`` function in the standard library is implemented
+as follows:
+
+::
+
+    float abs(float a) {
+        unsigned int i = intbits(a);
+        i &= 0x7fffffff;
+        return floatbits(i);
+    }
+
+This code directly clears the high order bit to ensure that the given
+floating-point value is positive.  This compiles down to a single ``andps``
+instruction when used with an Intel® SSE target, for example.
+
+
+
 Math Functions
 --------------
 
@@ -2919,77 +3060,6 @@ quite efficient.)
                                uniform unsigned int low,
                                uniform unsigned int high)
 
-Bit-Level Operations
---------------------
-
-
-The various variants of ``popcnt()`` return the population count--the
-number of bits set in the given value.
-
-::
-
-    uniform int popcnt(uniform int v)
-    int popcnt(int v)
-    uniform int popcnt(bool v)
-
-
-A few functions determine how many leading bits in the given value are zero
-and how many of the trailing bits are zero; there are also ``unsigned``
-variants of these functions and variants that take ``int64`` and ``unsigned
-int64`` types.
-
-::
-
-    int32 count_leading_zeros(int32 v)
-    uniform int32 count_leading_zeros(uniform int32 v)
-    int32 count_trailing_zeros(int32 v)
-    uniform int32 count_trailing_zeros(uniform int32 v)
-
-Sometimes it's useful to convert a ``bool`` value to an integer using sign
-extension so that the integer's bits are all on if the ``bool`` has the
-value ``true`` (rather than just having the value one).  The
-``sign_extend()`` functions provide this functionality:
-
-::
-
-    int sign_extend(bool value) 
-    uniform int sign_extend(uniform bool value) 
-
-The ``intbits()`` and ``floatbits()`` functions can be used to implement
-low-level floating-point bit twiddling.  For example, ``intbits()`` returns
-an ``unsigned int`` that is a bit-for-bit copy of the given ``float``
-value.  (Note: it is **not** the same as ``(int)a``, but corresponds to
-something like ``*((int *)&a)`` in C.
-
-::
-
-    float floatbits(unsigned int a);
-    uniform float floatbits(uniform unsigned int a);
-    unsigned int intbits(float a);
-    uniform unsigned int intbits(uniform float a);
-
-
-The ``intbits()`` and ``floatbits()`` functions have no cost at runtime;
-they just let the compiler know how to interpret the bits of the given
-value.  They make it possible to efficiently write functions that take
-advantage of the low-level bit representation of floating-point values.
-
-For example, the ``abs()`` function in the standard library is implemented
-as follows:
-
-::
-
-    float abs(float a) {
-        unsigned int i = intbits(a);
-        i &= 0x7fffffff;
-        return floatbits(i);
-    }
-
-This code directly clears the high order bit to ensure that the given
-floating-point value is positive.  This compiles down to a single ``andps``
-instruction when used with an Intel® SSE target, for example.
-
-
 Transcendental Functions
 ------------------------
 
@@ -3027,8 +3097,8 @@ The corresponding inverse functions are also available:
    uniform float acos(uniform float x)
    float atan(float x)
    uniform float atan(uniform float x)
-   float atan2(float x, float y)
-   uniform float atan2(uniform float x, uniform float y)
+   float atan2(float y, float x)
+   uniform float atan2(uniform float y, uniform float x)
 
 If both sine and cosine are needed, then the ``sincos()`` call computes
 both more efficiently than two calls to the respective individual
@@ -3077,7 +3147,7 @@ library.  State for the RNG is maintained in an instance of the
 ::
 
     struct RNGState;
-    void seed_rng(varying RNGState * uniform state, uniform int seed)
+    void seed_rng(varying RNGState * uniform state, int seed)
     void seed_rng(uniform RNGState * uniform state, uniform int seed)
 
 After the RNG is seeded, the ``random()`` function can be used to get a
@@ -3622,6 +3692,22 @@ precise.
     uniform int16 float_to_half_fast(uniform float f)
 
 
+Converting to sRGB8
+-------------------
+
+The sRGB color space is used in many applications in graphics and imaging;
+see the `Wikipedia page on sRGB`_ for more information.  The ``ispc``
+standard library provides two functions for converting floating-point color
+values to 8-bit values in the sRGB space.
+
+.. _Wikipedia page on sRGB: http://en.wikipedia.org/wiki/SRGB
+
+::
+
+    int float_to_srgb8(float v)
+    uniform int float_to_srgb8(uniform float v)
+
+
 Systems Programming Support
 ---------------------------
 
@@ -3732,6 +3818,13 @@ For global atomics, only atomic swap is available for these types:
   float atomic_swap_global(uniform float * uniform ptr, float value)
   double atomic_swap_global(uniform double * uniform ptr, double value)
 
+Finally, "swap" (but none of these other atomics) is available for pointer
+types:
+
+::
+
+  void *atomic_swap_{local,global}(void * * uniform ptr, void * value)
+
 There are also variants of the atomic that take ``uniform`` values for the
 operand and return a ``uniform`` result.  These correspond to a single
 atomic operation being performed for the entire gang of program instances,
@@ -3756,6 +3849,13 @@ rather than one per program instance.
   uniform int32 atomic_swap_{local,global}(uniform int32 * uniform ptr,
                                            uniform int32 newval)
 
+And similarly for pointers:
+
+::
+
+  uniform void *atomic_swap_{local,global}(void * * uniform ptr,
+                                           void *newval)
+
 Be careful that you use the atomic function that you mean to; consider the
 following code:
 
@@ -3797,12 +3897,18 @@ the same location in memory!)
   int32 atomic_xor_{local,global}(uniform int32 * varying ptr, int32 value)
   int32 atomic_swap_{local,global}(uniform int32 * varying ptr, int32 value)
 
+And:
+
+::
+
+  void *atomic_swap_{local,global}(void * * ptr, void *value)
+
 There are also atomic "compare and exchange" functions.  Compare and
 exchange atomically compares the value in "val" to "compare"--if they
 match, it assigns "newval" to "val".  In either case, the old value of
 "val" is returned.  (As with the other atomic operations, there are also
 ``unsigned`` and 64-bit variants of this function.  Furthermore, there are
-``float`` and ``double`` variants as well.)
+``float``, ``double``, and ``void *`` variants as well.)
 
 ::
 
@@ -3824,6 +3930,11 @@ code.
 
     void memory_barrier();
 
+Note that this barrier is *not* needed for coordinating reads and writes
+among the program instances in a gang; it's only needed for coordinating
+between multiple hardware threads running on different cores.  See the
+section `Data Races Within a Gang`_ for the guarantees provided about
+memory read/write ordering across a gang.
 
 Prefetches
 ----------
diff --git a/docs/news.rst b/docs/news.rst
index e875b077..ad6c4bd5 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -2,6 +2,24 @@
 ispc News
 =========
 
+ispc 1.2.1 is Released
+----------------------
+
+This is a bugfix release, fixing approximately 20 bugs in the system and
+improving error handling and error reporting.  New functionality includes
+very efficient float/half conversion routines thanks to Fabian 
+Giesen.  See the `1.2.1 release notes`_ for details.
+
+.. _1.2.1 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
+
+ispc 1.2.0 is Released
+-----------------------
+
+A new major release was posted on March 20, 2012.  This release includes
+significant new functionality for cleanly handling "structure of arrays"
+(SoA) data layout and a new model for how uniform and varying are handled
+with structure types.  
+
 Paper on ispc To Appear in InPar 2012
 -------------------------------------
 
diff --git a/docs/perfguide.rst b/docs/perfguide.rst
index 6e8555bf..b8e65893 100644
--- a/docs/perfguide.rst
+++ b/docs/perfguide.rst
@@ -624,7 +624,7 @@ gathers happen.)
 
     extern "C" {
         void ISPCInstrument(const char *fn, const char *note, 
-                            int line, int mask);
+                            int line, uint64_t mask);
     }
 
 This function is passed the file name of the ``ispc`` file running, a short
@@ -637,7 +637,7 @@ as follows:
 
 ::
 
-   ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
+   ISPCInstrument("foo.ispc", "function entry", 55, 0xfull);
 
 This call indicates that at the currently executing program has just
 entered the function defined at line 55 of the file ``foo.ispc``, with a
diff --git a/doxygen.cfg b/doxygen.cfg
index f8637ddf..f02dac70 100644
--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
 
-PROJECT_NUMBER         = 1.2.0
+PROJECT_NUMBER         = 1.2.2
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
diff --git a/examples/aobench/ao.ispc b/examples/aobench/ao.ispc
index 692dc367..e768c8c7 100644
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -210,7 +210,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
         { { 1.0f, 0.0f, -2.2f }, 0.5f } };
     RNGState rngstate;
 
-    seed_rng(&rngstate, y0);
+    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
     float invSamples = 1.f / nsubsamples;
 
     foreach_tiled(y = y0 ... y1, x = 0 ... w, 
diff --git a/examples/aobench_instrumented/ao.ispc b/examples/aobench_instrumented/ao.ispc
index 06d9b505..4fd9bedb 100644
--- a/examples/aobench_instrumented/ao.ispc
+++ b/examples/aobench_instrumented/ao.ispc
@@ -211,7 +211,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
         { { 1.0f, 0.0f, -2.2f }, 0.5f } };
     RNGState rngstate;
 
-    seed_rng(&rngstate, y0);
+    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
 
     // Compute the mapping between the 'programCount'-wide program
     // instances running in parallel and samples in the image.  
diff --git a/examples/deferred/main.cpp b/examples/deferred/main.cpp
index 88dab2d6..17bd3f42 100644
--- a/examples/deferred/main.cpp
+++ b/examples/deferred/main.cpp
@@ -87,7 +87,7 @@ int main(int argc, char** argv) {
         framebuffer.clear();
         reset_and_start_timer();
         for (int j = 0; j < nframes; ++j)
-            ispc::RenderStatic(&input->header, &input->arrays, 
+            ispc::RenderStatic(input->header, input->arrays,
                                VISUALIZE_LIGHT_COUNT,
                                framebuffer.r, framebuffer.g, framebuffer.b);
         double mcycles = get_elapsed_mcycles() / nframes;
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index 861db2a4..80c2635c 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -259,13 +259,13 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {                   \
    return ret;                                                      \
 }
 
-#define SMEAR(VTYPE, NAME, STYPE)               \
-static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {        \
-    VTYPE ret;                                  \
-    for (int i = 0; i < 16; ++i)                \
-        ret.v[i] = v;                           \
-    return ret;                                 \
-}                                               \
+#define SMEAR(VTYPE, NAME, STYPE)                                  \
+static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) {  \
+    VTYPE ret;                                                     \
+    for (int i = 0; i < 16; ++i)                                   \
+        ret.v[i] = v;                                              \
+    return ret;                                                    \
+}
 
 #define BROADCAST(VTYPE, NAME, STYPE)                 \
 static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) {   \
@@ -311,8 +311,8 @@ INSERT_EXTRACT(__vec1_d, double)
 ///////////////////////////////////////////////////////////////////////////
 // mask ops
 
-static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) {
-    return mask.v;
+static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
+    return (uint64_t)mask.v;
 }
 
 static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
@@ -339,6 +339,24 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) {
     return r;
 }
 
+static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) {
+    __vec16_i1 r;
+    r.v = ~v.v;
+    return r;
+}
+
+static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) {
+    __vec16_i1 r;
+    r.v = ~a.v & b.v;
+    return r;
+}
+
+static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) {
+    __vec16_i1 r;
+    r.v = a.v & ~b.v;
+    return r;
+}
+
 static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, 
                                        __vec16_i1 b) {
     __vec16_i1 r;
@@ -374,6 +392,12 @@ static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v, int align) {
     *ptr = v.v;
 }
 
+static FORCEINLINE __vec16_i1 __smear_i1(__vec16_i1, int v) {
+    return __vec16_i1(v, v, v, v, v, v, v, v, 
+                      v, v, v, v, v, v, v, v);
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
 
@@ -581,6 +605,121 @@ ROTATE(__vec16_f, float, float)
 SHUFFLES(__vec16_f, float, float)
 LOAD_STORE(__vec16_f, float)
 
+static FORCEINLINE float __exp_uniform_float(float v) {
+    return expf(v);
+}
+
+static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
+    __vec16_f ret;
+    for (int i = 0; i < 16; ++i)
+        ret.v[i] = expf(v.v[i]);
+    return ret;
+}
+
+static FORCEINLINE float __log_uniform_float(float v) {
+    return logf(v);
+}
+
+static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) {
+    __vec16_f ret;
+    for (int i = 0; i < 16; ++i)
+        ret.v[i] = logf(v.v[i]);
+    return ret;
+}
+
+static FORCEINLINE float __pow_uniform_float(float a, float b) {
+    return powf(a, b);
+}
+
+static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) {
+    __vec16_f ret;
+    for (int i = 0; i < 16; ++i)
+        ret.v[i] = powf(a.v[i], b.v[i]);
+    return ret;
+}
+
+static FORCEINLINE int __intbits(float v) {
+    union {
+        float f;
+        int i;
+    } u;
+    u.f = v;
+    return u.i;
+}
+
+static FORCEINLINE float __floatbits(int v) {
+    union {
+        float f;
+        int i;
+    } u;
+    u.i = v;
+    return u.f;
+}
+
+static FORCEINLINE float __half_to_float_uniform(int16_t h) {
+    static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
+
+    int32_t o = ((int32_t)(h & 0x7fff)) << 13;     // exponent/mantissa bits
+    uint32_t exp = shifted_exp & o;   // just the exponent
+    o += (127 - 15) << 23;        // exponent adjust
+
+    // handle exponent special cases
+    if (exp == shifted_exp) // Inf/NaN?
+        o += (128 - 16) << 23;    // extra exp adjust
+    else if (exp == 0) { // Zero/Denormal?
+        o += 1 << 23;             // extra exp adjust
+        o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize
+    }
+
+    o |= ((int32_t)(h & 0x8000)) << 16;    // sign bit
+    return __floatbits(o);
+}
+
+
+static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) {
+    __vec16_f ret;
+    for (int i = 0; i < 16; ++i)
+        ret.v[i] = __half_to_float_uniform(v.v[i]);
+    return ret;
+}
+
+
+static FORCEINLINE int16_t __float_to_half_uniform(float f) {
+    uint32_t sign_mask = 0x80000000u;
+    int32_t o;
+
+    int32_t fint = __intbits(f);
+    int32_t sign = fint & sign_mask;
+    fint ^= sign;
+
+    int32_t f32infty = 255 << 23;
+    o = (fint > f32infty) ? 0x7e00 : 0x7c00; 
+
+    // (De)normalized number or zero
+    // update fint unconditionally to save the blending; we don't need it
+    // anymore for the Inf/NaN case anyway.
+    const uint32_t round_mask = ~0xfffu; 
+    const int32_t magic = 15 << 23;
+    const int32_t f16infty = 31 << 23;
+
+    int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask;
+    fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
+
+    if (fint < f32infty)
+        o = fint2 >> 13; // Take the bits!
+
+    return (o | (sign >> 16));
+}
+
+
+static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) {
+    __vec16_i16 ret;
+    for (int i = 0; i < 16; ++i)
+        ret.v[i] = __float_to_half_uniform(v.v[i]);
+    return ret;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // double
 
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
new file mode 100644
index 00000000..244453a1
--- /dev/null
+++ b/examples/intrinsics/generic-32.h
@@ -0,0 +1,1688 @@
+/*
+  Copyright (c) 2010-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <stdint.h>
+#include <math.h>
+
+#ifdef _MSC_VER
+#define FORCEINLINE __forceinline
+#define PRE_ALIGN(x)  /*__declspec(align(x))*/
+#define POST_ALIGN(x)  
+#define roundf(x) (floorf((x) + .5f))  /* argument parenthesized so any expression expands safely */
+#define round(x) (floor((x) + .5))     /* fallback for MSVC C runtimes that lack round()/roundf() */
+#else
+#define FORCEINLINE __attribute__((always_inline))
+#define PRE_ALIGN(x)
+#define POST_ALIGN(x)  __attribute__ ((aligned(x)))
+#endif
+
+typedef float __vec1_f;
+typedef double __vec1_d;
+typedef int8_t __vec1_i8;
+typedef int16_t __vec1_i16;
+typedef int32_t __vec1_i32;
+typedef int64_t __vec1_i64;
+
+struct __vec32_i1 {
+    __vec32_i1() { }
+    __vec32_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
+               uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
+               uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
+               uint32_t v12, uint32_t v13, uint32_t v14, uint32_t v15,
+               uint32_t v16, uint32_t v17, uint32_t v18, uint32_t v19,
+               uint32_t v20, uint32_t v21, uint32_t v22, uint32_t v23,
+               uint32_t v24, uint32_t v25, uint32_t v26, uint32_t v27,
+               uint32_t v28, uint32_t v29, uint32_t v30, uint32_t v31) {
+        v = ((v0 & 1) |
+             ((v1 & 1) << 1) |
+             ((v2 & 1) << 2) |
+             ((v3 & 1) << 3) |
+             ((v4 & 1) << 4) |
+             ((v5 & 1) << 5) |
+             ((v6 & 1) << 6) |
+             ((v7 & 1) << 7) |
+             ((v8 & 1) << 8) |
+             ((v9 & 1) << 9) |
+             ((v10 & 1) << 10) |
+             ((v11 & 1) << 11) |
+             ((v12 & 1) << 12) |
+             ((v13 & 1) << 13) |
+             ((v14 & 1) << 14) |
+             ((v15 & 1) << 15) |
+             ((v16 & 1) << 16) |
+             ((v17 & 1) << 17) |
+             ((v18 & 1) << 18) |
+             ((v19 & 1) << 19) |
+             ((v20 & 1) << 20) |
+             ((v21 & 1) << 21) |
+             ((v22 & 1) << 22) |
+             ((v23 & 1) << 23) |
+             ((v24 & 1) << 24) |
+             ((v25 & 1) << 25) |
+             ((v26 & 1) << 26) |
+             ((v27 & 1) << 27) |
+             ((v28 & 1) << 28) |
+             ((v29 & 1) << 29) |
+             ((v30 & 1) << 30) |
+             ((v31 & 1) << 31));
+    }
+             
+    uint32_t v;
+};
+
+
+template <typename T>
+struct vec32 {
+    vec32() { }
+    vec32(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+          T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+          T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+          T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) {
+        v[0] = v0;        v[1] = v1;        v[2] = v2;        v[3] = v3;
+        v[4] = v4;        v[5] = v5;        v[6] = v6;        v[7] = v7;
+        v[8] = v8;        v[9] = v9;        v[10] = v10;      v[11] = v11;
+        v[12] = v12;      v[13] = v13;      v[14] = v14;      v[15] = v15;
+        v[16] = v16;      v[17] = v17;      v[18] = v18;      v[19] = v19;
+        v[20] = v20;      v[21] = v21;      v[22] = v22;      v[23] = v23;
+        v[24] = v24;      v[25] = v25;      v[26] = v26;      v[27] = v27;
+        v[28] = v28;      v[29] = v29;      v[30] = v30;      v[31] = v31;
+    }
+    T v[32]; 
+};
+
+PRE_ALIGN(64) struct __vec32_f : public vec32<float> { 
+    __vec32_f() { }
+    __vec32_f(float v0, float v1, float v2, float v3, 
+              float v4, float v5, float v6, float v7,
+              float v8, float v9, float v10, float v11, 
+              float v12, float v13, float v14, float v15,
+              float v16, float v17, float v18, float v19,
+              float v20, float v21, float v22, float v23,
+              float v24, float v25, float v26, float v27,
+              float v28, float v29, float v30, float v31)
+        : vec32<float>(v0, v1, v2, v3, v4, v5, v6, v7,
+                       v8, v9, v10, v11, v12, v13, v14, v15,
+                       v16, v17, v18, v19, v20, v21, v22, v23,
+                       v24, v25, v26, v27, v28, v29, v30, v31) { }
+} POST_ALIGN(64);
+
+PRE_ALIGN(128) struct __vec32_d : public vec32<double> { 
+    __vec32_d() { }
+    __vec32_d(double v0, double v1, double v2, double v3, 
+              double v4, double v5, double v6, double v7,
+              double v8, double v9, double v10, double v11, 
+              double v12, double v13, double v14, double v15,
+              double v16, double v17, double v18, double v19,
+              double v20, double v21, double v22, double v23,
+              double v24, double v25, double v26, double v27,
+              double v28, double v29, double v30, double v31)
+        : vec32<double>(v0, v1, v2, v3, v4, v5, v6, v7,
+                       v8, v9, v10, v11, v12, v13, v14, v15,
+                       v16, v17, v18, v19, v20, v21, v22, v23,
+                       v24, v25, v26, v27, v28, v29, v30, v31) { }
+
+} POST_ALIGN(128);
+
+PRE_ALIGN(16) struct __vec32_i8   : public vec32<int8_t> { 
+    __vec32_i8() { }
+    __vec32_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, 
+               int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+               int8_t v8, int8_t v9, int8_t v10, int8_t v11, 
+               int8_t v12, int8_t v13, int8_t v14, int8_t v15,
+               int8_t v16, int8_t v17, int8_t v18, int8_t v19,
+               int8_t v20, int8_t v21, int8_t v22, int8_t v23,
+               int8_t v24, int8_t v25, int8_t v26, int8_t v27,
+               int8_t v28, int8_t v29, int8_t v30, int8_t v31)
+        : vec32<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7,
+                       v8, v9, v10, v11, v12, v13, v14, v15,
+                       v16, v17, v18, v19, v20, v21, v22, v23,
+                       v24, v25, v26, v27, v28, v29, v30, v31) { }
+
+} POST_ALIGN(16);
+
+PRE_ALIGN(32) struct __vec32_i16  : public vec32<int16_t> { 
+    __vec32_i16() { }
+    __vec32_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, 
+                int16_t v4, int16_t v5, int16_t v6, int16_t v7,
+                int16_t v8, int16_t v9, int16_t v10, int16_t v11, 
+                int16_t v12, int16_t v13, int16_t v14, int16_t v15,
+                int16_t v16, int16_t v17, int16_t v18, int16_t v19,
+                int16_t v20, int16_t v21, int16_t v22, int16_t v23,
+                int16_t v24, int16_t v25, int16_t v26, int16_t v27,
+                int16_t v28, int16_t v29, int16_t v30, int16_t v31)
+        : vec32<int16_t>(v0, v1, v2, v3, v4, v5, v6, v7,
+                         v8, v9, v10, v11, v12, v13, v14, v15,
+                         v16, v17, v18, v19, v20, v21, v22, v23,
+                         v24, v25, v26, v27, v28, v29, v30, v31) { }
+
+} POST_ALIGN(32);
+
+PRE_ALIGN(64) struct __vec32_i32  : public vec32<int32_t> { 
+    __vec32_i32() { }
+    __vec32_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, 
+                int32_t v4, int32_t v5, int32_t v6, int32_t v7,
+                int32_t v8, int32_t v9, int32_t v10, int32_t v11, 
+                int32_t v12, int32_t v13, int32_t v14, int32_t v15,
+                int32_t v16, int32_t v17, int32_t v18, int32_t v19,
+                int32_t v20, int32_t v21, int32_t v22, int32_t v23,
+                int32_t v24, int32_t v25, int32_t v26, int32_t v27,
+                int32_t v28, int32_t v29, int32_t v30, int32_t v31)
+        : vec32<int32_t>(v0, v1, v2, v3, v4, v5, v6, v7,
+                         v8, v9, v10, v11, v12, v13, v14, v15,
+                         v16, v17, v18, v19, v20, v21, v22, v23,
+                         v24, v25, v26, v27, v28, v29, v30, v31) { }
+
+} POST_ALIGN(64);
+
+static inline int32_t __extract_element(__vec32_i32, int);
+
+PRE_ALIGN(128) struct __vec32_i64  : public vec32<int64_t> { 
+    __vec32_i64() { }
+    __vec32_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, 
+                int64_t v4, int64_t v5, int64_t v6, int64_t v7,
+                int64_t v8, int64_t v9, int64_t v10, int64_t v11, 
+                int64_t v12, int64_t v13, int64_t v14, int64_t v15,
+                int64_t v16, int64_t v17, int64_t v18, int64_t v19,
+                int64_t v20, int64_t v21, int64_t v22, int64_t v23,
+                int64_t v24, int64_t v25, int64_t v26, int64_t v27,
+                int64_t v28, int64_t v29, int64_t v30, int64_t v31)
+        : vec32<int64_t>(v0, v1, v2, v3, v4, v5, v6, v7,
+                         v8, v9, v10, v11, v12, v13, v14, v15,
+                         v16, v17, v18, v19, v20, v21, v22, v23,
+                         v24, v25, v26, v27, v28, v29, v30, v31) { }
+
+} POST_ALIGN(128);
+
+///////////////////////////////////////////////////////////////////////////
+// macros...
+
+#define UNARY_OP(TYPE, NAME, OP)            \
+static FORCEINLINE TYPE NAME(TYPE v) {      \
+    TYPE ret;                               \
+    for (int i = 0; i < 32; ++i)            \
+        ret.v[i] = OP(v.v[i]);              \
+    return ret;                             \
+}
+
+#define BINARY_OP(TYPE, NAME, OP)                               \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {                  \
+    TYPE ret;                                                   \
+   for (int i = 0; i < 32; ++i)                                 \
+       ret.v[i] = a.v[i] OP b.v[i];                             \
+   return ret;                                                   \
+}
+
+#define BINARY_OP_CAST(TYPE, CAST, NAME, OP)                        \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {                      \
+   TYPE ret;                                                        \
+   for (int i = 0; i < 32; ++i)                                     \
+       ret.v[i] = (CAST)(a.v[i]) OP (CAST)(b.v[i]);                 \
+   return ret;                                                      \
+}
+
+#define BINARY_OP_FUNC(TYPE, NAME, FUNC)                            \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {                      \
+   TYPE ret;                                                        \
+   for (int i = 0; i < 32; ++i)                                     \
+       ret.v[i] = FUNC(a.v[i], b.v[i]);                             \
+   return ret;                                                      \
+}
+
+#define CMP_OP(TYPE, CAST, NAME, OP)                                \
+static FORCEINLINE __vec32_i1 NAME(TYPE a, TYPE b) {                \
+   __vec32_i1 ret;                                                  \
+   ret.v = 0;  /* bit i of the result is (a[i] OP b[i]) */          \
+   for (int i = 0; i < 32; ++i)  /* unsigned shift: lane 31 ok */   \
+       ret.v |= (uint32_t)((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i;  \
+   return ret;                                                      \
+}
+
+#define INSERT_EXTRACT(VTYPE, STYPE)                                  \
+static FORCEINLINE STYPE __extract_element(VTYPE v, int index) {      \
+    return ((STYPE *)&v)[index];                                      \
+}                                                                     \
+static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
+    ((STYPE *)v)[index] = val;                                        \
+}
+
+#define LOAD_STORE(VTYPE, STYPE)                       \
+static FORCEINLINE VTYPE __load(VTYPE *p, int align) { \
+    STYPE *ptr = (STYPE *)p;                           \
+    VTYPE ret;                                         \
+    for (int i = 0; i < 32; ++i)                       \
+        ret.v[i] = ptr[i];                             \
+    return ret;                                        \
+}                                                      \
+static FORCEINLINE void __store(VTYPE *p, VTYPE v, int align) {    \
+    STYPE *ptr = (STYPE *)p;                           \
+    for (int i = 0; i < 32; ++i)                       \
+        ptr[i] = v.v[i];                               \
+}
+
+#define REDUCE_ADD(TYPE, VTYPE, NAME)           \
+static FORCEINLINE TYPE NAME(VTYPE v) {         \
+     TYPE ret = v.v[0];                         \
+     for (int i = 1; i < 32; ++i)               \
+         ret = ret + v.v[i];                    \
+     return ret;                                \
+}
+
+#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP)                    \
+static FORCEINLINE TYPE NAME(VTYPE v) {                         \
+    TYPE ret = v.v[0];                                          \
+    for (int i = 1; i < 32; ++i)                                \
+        ret = (ret OP (TYPE)v.v[i]) ? ret : (TYPE)v.v[i];       \
+    return ret;                                                 \
+}
+
+#define SELECT(TYPE)                                                \
+static FORCEINLINE TYPE __select(__vec32_i1 mask, TYPE a, TYPE b) { \
+    TYPE ret;  /* per lane: a[i] where mask bit i is set, else b[i] */ \
+    for (int i = 0; i < 32; ++i)  /* 1u: lane 31 needs unsigned */  \
+        ret.v[i] = (mask.v & (1u<<i)) ? a.v[i] : b.v[i];            \
+    return ret;                                                     \
+}                                                                   \
+static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) {       \
+    return cond ? a : b;  /* uniform condition: whole-vector pick */ \
+}
+
+#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP)                         \
+static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {                   \
+   TYPE ret;                                                        \
+   for (int i = 0; i < 32; ++i)                                     \
+       ret.v[i] = (CAST)(a.v[i]) OP b;                              \
+   return ret;                                                      \
+}
+
+#define SMEAR(VTYPE, NAME, STYPE)                                  \
+static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) {  \
+    VTYPE ret;                                                     \
+    for (int i = 0; i < 32; ++i)                                   \
+        ret.v[i] = v;                                              \
+    return ret;                                                    \
+}
+
+#define BROADCAST(VTYPE, NAME, STYPE)                 \
+static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) {   \
+    VTYPE ret;                                        \
+    for (int i = 0; i < 32; ++i)                      \
+        ret.v[i] = v.v[index & 31];                   \
+    return ret;                                       \
+}                                                     \
+
+#define ROTATE(VTYPE, NAME, STYPE)                    \
+static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) {   \
+    VTYPE ret;                                        \
+    for (int i = 0; i < 32; ++i)                      \
+        ret.v[i] = v.v[(i+index) & 31];               \
+    return ret;                                       \
+}                                                     \
+
+#define SHUFFLES(VTYPE, NAME, STYPE)                 \
+static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec32_i32 index) {   \
+    VTYPE ret;                                        \
+    for (int i = 0; i < 32; ++i)                      \
+        ret.v[i] = v.v[__extract_element(index, i) & 31];       \
+    return ret;                                       \
+}                                                     \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec32_i32 index) {     \
+    VTYPE ret;                                        \
+    for (int i = 0; i < 32; ++i) {                    \
+        int ii = __extract_element(index, i) & 63;   \
+        ret.v[i] = (ii < 32) ? v0.v[ii] : v1.v[ii-32];  \
+    }                                                 \
+    return ret;                                       \
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+INSERT_EXTRACT(__vec1_i8, int8_t)
+INSERT_EXTRACT(__vec1_i16, int16_t)
+INSERT_EXTRACT(__vec1_i32, int32_t)
+INSERT_EXTRACT(__vec1_i64, int64_t)
+INSERT_EXTRACT(__vec1_f, float)
+INSERT_EXTRACT(__vec1_d, double)
+
+///////////////////////////////////////////////////////////////////////////
+// mask ops
+
+static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) {
+    return (uint64_t)mask.v;
+}
+
+static FORCEINLINE __vec32_i1 __equal(__vec32_i1 a, __vec32_i1 b) {
+    __vec32_i1 r;
+    r.v = (a.v & b.v) | (~a.v & ~b.v);
+    return r;
+}
+
+static FORCEINLINE __vec32_i1 __and(__vec32_i1 a, __vec32_i1 b) {
+    __vec32_i1 r;
+    r.v = a.v & b.v;
+    return r;
+}
+
+static FORCEINLINE __vec32_i1 __xor(__vec32_i1 a, __vec32_i1 b) {
+    __vec32_i1 r;
+    r.v = a.v ^ b.v;
+    return r;
+}
+
+static FORCEINLINE __vec32_i1 __or(__vec32_i1 a, __vec32_i1 b) {
+    __vec32_i1 r;
+    r.v = a.v | b.v;
+    return r;
+}
+
+static FORCEINLINE __vec32_i1 __not(__vec32_i1 v) {
+    __vec32_i1 r;
+    r.v = ~v.v;
+    return r;
+}
+
+static FORCEINLINE __vec32_i1 __and_not1(__vec32_i1 a, __vec32_i1 b) {
+    __vec32_i1 r;
+    r.v = ~a.v & b.v;
+    return r;
+}
+
+static FORCEINLINE __vec32_i1 __and_not2(__vec32_i1 a, __vec32_i1 b) {
+    __vec32_i1 r;
+    r.v = a.v & ~b.v;
+    return r;
+}
+
+static FORCEINLINE __vec32_i1 __select(__vec32_i1 mask, __vec32_i1 a, 
+                                       __vec32_i1 b) {
+    __vec32_i1 r;
+    r.v = (a.v & mask.v) | (b.v & ~mask.v);
+    return r;
+}
+
+static FORCEINLINE __vec32_i1 __select(bool cond, __vec32_i1 a, __vec32_i1 b) {
+    return cond ? a : b;
+}
+
+static FORCEINLINE bool __extract_element(__vec32_i1 vec, int index) {
+    return (vec.v & (1u << index)) ? true : false;  // 1u: testing lane 31 must not shift into a signed int's sign bit
+}
+
+static FORCEINLINE void __insert_element(__vec32_i1 *vec, int index, 
+                                         bool val) {
+    if (val == false)
+        vec->v &= ~(1u << index);  // clear lane bit; unsigned shift keeps lane 31 well-defined
+    else
+        vec->v |= (1u << index);   // set lane bit
+}
+
+static FORCEINLINE __vec32_i1 __load(__vec32_i1 *p, int align) {
+    uint32_t *ptr = (uint32_t *)p;  // the 32-lane mask is one 32-bit word; a 16-bit read would drop lanes 16..31
+    __vec32_i1 r;
+    r.v = *ptr;
+    return r;
+}
+
+static FORCEINLINE void __store(__vec32_i1 *p, __vec32_i1 v, int align) {
+    uint32_t *ptr = (uint32_t *)p;  // write all 32 lane bits, matching the width used by __load
+    *ptr = v.v;
+}
+
+// Builds a boolean vector by passing the same value `v` as each of the 32
+// arguments of the __vec32_i1 constructor (declared outside this section).
+// The first, unnamed parameter is never read.
+// NOTE(review): it presumably exists only so overload resolution can pick
+// this function by result type -- confirm against the callers.
+static FORCEINLINE __vec32_i1 __smear_i1(__vec32_i1, int v) {
+    return __vec32_i1(v, v, v, v, v, v, v, v, 
+                      v, v, v, v, v, v, v, v,
+                      v, v, v, v, v, v, v, v,
+                      v, v, v, v, v, v, v, v);
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// int8
+
+// Operations on __vec32_i8, generated from macros that are defined outside
+// this section. NOTE(review): each macro presumably expands to a function
+// that applies the given operator to all 32 lanes -- confirm against the
+// macro definitions.
+
+// Arithmetic where signedness is not passed to the macro.
+BINARY_OP(__vec32_i8, __add, +)
+BINARY_OP(__vec32_i8, __sub, -)
+BINARY_OP(__vec32_i8, __mul, *)
+
+// Bitwise operations and left shift.
+BINARY_OP(__vec32_i8, __or, |)
+BINARY_OP(__vec32_i8, __and, &)
+BINARY_OP(__vec32_i8, __xor, ^)
+BINARY_OP(__vec32_i8, __shl, <<)
+
+// Division: the scalar type argument (uint8_t vs. int8_t) distinguishes the
+// unsigned and signed variants.
+BINARY_OP_CAST(__vec32_i8, uint8_t, __udiv, /)
+BINARY_OP_CAST(__vec32_i8, int8_t,  __sdiv, /)
+
+// Remainder and right shifts; __lshr is instantiated with the unsigned type
+// and __ashr with the signed type.
+BINARY_OP_CAST(__vec32_i8, uint8_t, __urem, %)
+BINARY_OP_CAST(__vec32_i8, int8_t,  __srem, %)
+BINARY_OP_CAST(__vec32_i8, uint8_t, __lshr, >>)
+BINARY_OP_CAST(__vec32_i8, int8_t,  __ashr, >>)
+
+// Shift variants generated by SHIFT_UNIFORM (presumably a single scalar
+// shift count shared by all lanes -- confirm against the macro).
+SHIFT_UNIFORM(__vec32_i8, uint8_t, __lshr, >>)
+SHIFT_UNIFORM(__vec32_i8, int8_t, __ashr, >>)
+SHIFT_UNIFORM(__vec32_i8, int8_t, __shl, <<)
+
+// Comparisons; the scalar type argument selects unsigned vs. signed ordering.
+CMP_OP(__vec32_i8, int8_t,  __equal, ==)
+CMP_OP(__vec32_i8, int8_t,  __not_equal, !=)
+CMP_OP(__vec32_i8, uint8_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i8, int8_t,  __signed_less_equal, <=)
+CMP_OP(__vec32_i8, uint8_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i8, int8_t,  __signed_greater_equal, >=)
+CMP_OP(__vec32_i8, uint8_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i8, int8_t,  __signed_less_than, <)
+CMP_OP(__vec32_i8, uint8_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i8, int8_t,  __signed_greater_than, >)
+
+// Select, element access, broadcast/permutation and memory helpers.
+SELECT(__vec32_i8)
+INSERT_EXTRACT(__vec32_i8, int8_t)
+SMEAR(__vec32_i8, i8, int8_t)
+BROADCAST(__vec32_i8, i8, int8_t)
+ROTATE(__vec32_i8, i8, int8_t)
+SHUFFLES(__vec32_i8, i8, int8_t)
+LOAD_STORE(__vec32_i8, int8_t)
+
+///////////////////////////////////////////////////////////////////////////
+// int16
+
+// Operations on __vec32_i16, generated from macros that are defined outside
+// this section. NOTE(review): each macro presumably expands to a function
+// that applies the given operator to all 32 lanes -- confirm against the
+// macro definitions.
+
+// Arithmetic where signedness is not passed to the macro.
+BINARY_OP(__vec32_i16, __add, +)
+BINARY_OP(__vec32_i16, __sub, -)
+BINARY_OP(__vec32_i16, __mul, *)
+
+// Bitwise operations and left shift.
+BINARY_OP(__vec32_i16, __or, |)
+BINARY_OP(__vec32_i16, __and, &)
+BINARY_OP(__vec32_i16, __xor, ^)
+BINARY_OP(__vec32_i16, __shl, <<)
+
+// Division: the scalar type argument (uint16_t vs. int16_t) distinguishes
+// the unsigned and signed variants.
+BINARY_OP_CAST(__vec32_i16, uint16_t, __udiv, /)
+BINARY_OP_CAST(__vec32_i16, int16_t,  __sdiv, /)
+
+// Remainder and right shifts; __lshr is instantiated with the unsigned type
+// and __ashr with the signed type.
+BINARY_OP_CAST(__vec32_i16, uint16_t, __urem, %)
+BINARY_OP_CAST(__vec32_i16, int16_t,  __srem, %)
+BINARY_OP_CAST(__vec32_i16, uint16_t, __lshr, >>)
+BINARY_OP_CAST(__vec32_i16, int16_t,  __ashr, >>)
+
+// Shift variants generated by SHIFT_UNIFORM (presumably a single scalar
+// shift count shared by all lanes -- confirm against the macro).
+SHIFT_UNIFORM(__vec32_i16, uint16_t, __lshr, >>)
+SHIFT_UNIFORM(__vec32_i16, int16_t, __ashr, >>)
+SHIFT_UNIFORM(__vec32_i16, int16_t, __shl, <<)
+
+// Comparisons; the scalar type argument selects unsigned vs. signed ordering.
+CMP_OP(__vec32_i16, int16_t,  __equal, ==)
+CMP_OP(__vec32_i16, int16_t,  __not_equal, !=)
+CMP_OP(__vec32_i16, uint16_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i16, int16_t,  __signed_less_equal, <=)
+CMP_OP(__vec32_i16, uint16_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i16, int16_t,  __signed_greater_equal, >=)
+CMP_OP(__vec32_i16, uint16_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i16, int16_t,  __signed_less_than, <)
+CMP_OP(__vec32_i16, uint16_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i16, int16_t,  __signed_greater_than, >)
+
+// Select, element access, broadcast/permutation and memory helpers.
+SELECT(__vec32_i16)
+INSERT_EXTRACT(__vec32_i16, int16_t)
+SMEAR(__vec32_i16, i16, int16_t)
+BROADCAST(__vec32_i16, i16, int16_t)
+ROTATE(__vec32_i16, i16, int16_t)
+SHUFFLES(__vec32_i16, i16, int16_t)
+LOAD_STORE(__vec32_i16, int16_t)
+
+///////////////////////////////////////////////////////////////////////////
+// int32
+
+// Operations on __vec32_i32, generated from macros that are defined outside
+// this section. NOTE(review): each macro presumably expands to a function
+// that applies the given operator to all 32 lanes -- confirm against the
+// macro definitions.
+
+// Arithmetic where signedness is not passed to the macro.
+BINARY_OP(__vec32_i32, __add, +)
+BINARY_OP(__vec32_i32, __sub, -)
+BINARY_OP(__vec32_i32, __mul, *)
+
+// Bitwise operations and left shift.
+BINARY_OP(__vec32_i32, __or, |)
+BINARY_OP(__vec32_i32, __and, &)
+BINARY_OP(__vec32_i32, __xor, ^)
+BINARY_OP(__vec32_i32, __shl, <<)
+
+// Division: the scalar type argument (uint32_t vs. int32_t) distinguishes
+// the unsigned and signed variants.
+BINARY_OP_CAST(__vec32_i32, uint32_t, __udiv, /)
+BINARY_OP_CAST(__vec32_i32, int32_t,  __sdiv, /)
+
+// Remainder and right shifts; __lshr is instantiated with the unsigned type
+// and __ashr with the signed type.
+BINARY_OP_CAST(__vec32_i32, uint32_t, __urem, %)
+BINARY_OP_CAST(__vec32_i32, int32_t,  __srem, %)
+BINARY_OP_CAST(__vec32_i32, uint32_t, __lshr, >>)
+BINARY_OP_CAST(__vec32_i32, int32_t,  __ashr, >>)
+
+// Shift variants generated by SHIFT_UNIFORM (presumably a single scalar
+// shift count shared by all lanes -- confirm against the macro).
+SHIFT_UNIFORM(__vec32_i32, uint32_t, __lshr, >>)
+SHIFT_UNIFORM(__vec32_i32, int32_t, __ashr, >>)
+SHIFT_UNIFORM(__vec32_i32, int32_t, __shl, <<)
+
+// Comparisons; the scalar type argument selects unsigned vs. signed ordering.
+CMP_OP(__vec32_i32, int32_t,  __equal, ==)
+CMP_OP(__vec32_i32, int32_t,  __not_equal, !=)
+CMP_OP(__vec32_i32, uint32_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i32, int32_t,  __signed_less_equal, <=)
+CMP_OP(__vec32_i32, uint32_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i32, int32_t,  __signed_greater_equal, >=)
+CMP_OP(__vec32_i32, uint32_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i32, int32_t,  __signed_less_than, <)
+CMP_OP(__vec32_i32, uint32_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i32, int32_t,  __signed_greater_than, >)
+
+// Select, element access, broadcast/permutation and memory helpers.
+SELECT(__vec32_i32)
+INSERT_EXTRACT(__vec32_i32, int32_t)
+SMEAR(__vec32_i32, i32, int32_t)
+BROADCAST(__vec32_i32, i32, int32_t)
+ROTATE(__vec32_i32, i32, int32_t)
+SHUFFLES(__vec32_i32, i32, int32_t)
+LOAD_STORE(__vec32_i32, int32_t)
+
+///////////////////////////////////////////////////////////////////////////
+// int64
+
+// Operations on __vec32_i64, generated from macros that are defined outside
+// this section. NOTE(review): each macro presumably expands to a function
+// that applies the given operator to all 32 lanes -- confirm against the
+// macro definitions.
+
+// Arithmetic where signedness is not passed to the macro.
+BINARY_OP(__vec32_i64, __add, +)
+BINARY_OP(__vec32_i64, __sub, -)
+BINARY_OP(__vec32_i64, __mul, *)
+
+// Bitwise operations and left shift.
+BINARY_OP(__vec32_i64, __or, |)
+BINARY_OP(__vec32_i64, __and, &)
+BINARY_OP(__vec32_i64, __xor, ^)
+BINARY_OP(__vec32_i64, __shl, <<)
+
+// Division: the scalar type argument (uint64_t vs. int64_t) distinguishes
+// the unsigned and signed variants.
+BINARY_OP_CAST(__vec32_i64, uint64_t, __udiv, /)
+BINARY_OP_CAST(__vec32_i64, int64_t,  __sdiv, /)
+
+// Remainder and right shifts; __lshr is instantiated with the unsigned type
+// and __ashr with the signed type.
+BINARY_OP_CAST(__vec32_i64, uint64_t, __urem, %)
+BINARY_OP_CAST(__vec32_i64, int64_t,  __srem, %)
+BINARY_OP_CAST(__vec32_i64, uint64_t, __lshr, >>)
+BINARY_OP_CAST(__vec32_i64, int64_t,  __ashr, >>)
+
+// Shift variants generated by SHIFT_UNIFORM (presumably a single scalar
+// shift count shared by all lanes -- confirm against the macro).
+SHIFT_UNIFORM(__vec32_i64, uint64_t, __lshr, >>)
+SHIFT_UNIFORM(__vec32_i64, int64_t, __ashr, >>)
+SHIFT_UNIFORM(__vec32_i64, int64_t, __shl, <<)
+
+// Comparisons; the scalar type argument selects unsigned vs. signed ordering.
+CMP_OP(__vec32_i64, int64_t,  __equal, ==)
+CMP_OP(__vec32_i64, int64_t,  __not_equal, !=)
+CMP_OP(__vec32_i64, uint64_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i64, int64_t,  __signed_less_equal, <=)
+CMP_OP(__vec32_i64, uint64_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i64, int64_t,  __signed_greater_equal, >=)
+CMP_OP(__vec32_i64, uint64_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i64, int64_t,  __signed_less_than, <)
+CMP_OP(__vec32_i64, uint64_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i64, int64_t,  __signed_greater_than, >)
+
+// Select, element access, broadcast/permutation and memory helpers.
+SELECT(__vec32_i64)
+INSERT_EXTRACT(__vec32_i64, int64_t)
+SMEAR(__vec32_i64, i64, int64_t)
+BROADCAST(__vec32_i64, i64, int64_t)
+ROTATE(__vec32_i64, i64, int64_t)
+SHUFFLES(__vec32_i64, i64, int64_t)
+LOAD_STORE(__vec32_i64, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// float
+
+// Arithmetic on __vec32_f, generated from BINARY_OP (defined outside this
+// section; presumably applies the operator to each of the 32 lanes).
+BINARY_OP(__vec32_f, __add, +)
+BINARY_OP(__vec32_f, __sub, -)
+BINARY_OP(__vec32_f, __mul, *)
+BINARY_OP(__vec32_f, __div, /)
+
+// Comparisons on float lanes using the built-in C++ operators.
+CMP_OP(__vec32_f, float, __equal, ==)
+CMP_OP(__vec32_f, float, __not_equal, !=)
+CMP_OP(__vec32_f, float, __less_than, <)
+CMP_OP(__vec32_f, float, __less_equal, <=)
+CMP_OP(__vec32_f, float, __greater_than, >)
+CMP_OP(__vec32_f, float, __greater_equal, >=)
+
+// Returns a mask with bit i set when both a.v[i] and b.v[i] compare equal to
+// themselves, i.e. when neither lane is a NaN.
+static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) {
+    __vec32_i1 result;
+    result.v = 0;
+    for (int lane = 0; lane < 32; ++lane) {
+        // A NaN is the only value for which x == x is false.
+        const bool aIsNumber = (a.v[lane] == a.v[lane]);
+        const bool bIsNumber = (b.v[lane] == b.v[lane]);
+        if (aIsNumber && bIsNumber)
+            result.v |= (1 << lane);
+    }
+    return result;
+}
+
+#if 0
+      case Instruction::FRem: intrinsic = "__frem"; break;
+#endif
+
+// Select, element access, broadcast/permutation and memory helpers for
+// __vec32_f, generated from macros defined outside this section.
+SELECT(__vec32_f)
+INSERT_EXTRACT(__vec32_f, float)
+SMEAR(__vec32_f, float, float)
+BROADCAST(__vec32_f, float, float)
+ROTATE(__vec32_f, float, float)
+SHUFFLES(__vec32_f, float, float)
+LOAD_STORE(__vec32_f, float)
+
+// Scalar e^v, forwarded to the C library's expf().
+static FORCEINLINE float __exp_uniform_float(float v) {
+    const float result = expf(v);
+    return result;
+}
+
+// Applies expf() independently to each of the 32 lanes of v.
+static FORCEINLINE __vec32_f __exp_varying_float(__vec32_f v) {
+    __vec32_f result;
+    for (int lane = 0; lane < 32; ++lane) {
+        result.v[lane] = expf(v.v[lane]);
+    }
+    return result;
+}
+
+// Scalar natural logarithm, forwarded to the C library's logf().
+static FORCEINLINE float __log_uniform_float(float v) {
+    const float result = logf(v);
+    return result;
+}
+
+// Applies logf() independently to each of the 32 lanes of v.
+static FORCEINLINE __vec32_f __log_varying_float(__vec32_f v) {
+    __vec32_f result;
+    for (int lane = 0; lane < 32; ++lane) {
+        result.v[lane] = logf(v.v[lane]);
+    }
+    return result;
+}
+
+// Scalar a raised to the power b, forwarded to the C library's powf().
+static FORCEINLINE float __pow_uniform_float(float a, float b) {
+    const float result = powf(a, b);
+    return result;
+}
+
+// Lane-wise power: result lane i is powf(a.v[i], b.v[i]).
+static FORCEINLINE __vec32_f __pow_varying_float(__vec32_f a, __vec32_f b) {
+    __vec32_f result;
+    for (int lane = 0; lane < 32; ++lane) {
+        result.v[lane] = powf(a.v[lane], b.v[lane]);
+    }
+    return result;
+}
+
+// Returns the bit pattern of the float `v` reinterpreted as an int (no
+// numeric conversion takes place). Implemented by writing one union member
+// and reading the other.
+static FORCEINLINE int __intbits(float v) {
+    union {
+        float f;
+        int i;
+    } pun;
+    pun.f = v;
+    const int bits = pun.i;
+    return bits;
+}
+
+// Returns the float whose bit pattern is the int `v` (no numeric conversion
+// takes place). Implemented by writing one union member and reading the
+// other.
+static FORCEINLINE float __floatbits(int v) {
+    union {
+        float f;
+        int i;
+    } pun;
+    pun.i = v;
+    const float value = pun.f;
+    return value;
+}
+
+// Converts a 16-bit half-precision bit pattern `h` to a float using integer
+// bit manipulation: the magnitude bits are shifted into place, the exponent
+// is adjusted, Inf/NaN and zero/denormal inputs get extra handling, and the
+// sign bit is applied last.
+static FORCEINLINE float __half_to_float_uniform(int16_t h) {
+    static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
+
+    // Drop the sign bit (0x8000) and move the remaining 15 bits up by 13.
+    int32_t o = ((int32_t)(h & 0x7fff)) << 13;     // exponent/mantissa bits
+    uint32_t exp = shifted_exp & o;   // just the exponent
+    o += (127 - 15) << 23;        // exponent adjust
+
+    // handle exponent special cases
+    if (exp == shifted_exp) // Inf/NaN?
+        o += (128 - 16) << 23;    // extra exp adjust
+    else if (exp == 0) { // Zero/Denormal?
+        o += 1 << 23;             // extra exp adjust
+        // Subtracting the float with bit pattern (113 << 23) lets the FPU
+        // produce the correctly normalized result.
+        o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize
+    }
+
+    // Move the half's sign bit (bit 15) to the float's sign bit (bit 31).
+    o |= ((int32_t)(h & 0x8000)) << 16;    // sign bit
+    return __floatbits(o);
+}
+
+
+// Converts each of the 32 half-precision lanes of v to float by calling
+// __half_to_float_uniform() per lane.
+static FORCEINLINE __vec32_f __half_to_float_varying(__vec32_i16 v) {
+    __vec32_f result;
+    for (int lane = 0; lane < 32; ++lane) {
+        result.v[lane] = __half_to_float_uniform(v.v[lane]);
+    }
+    return result;
+}
+
+
+// Converts the float `f` to a 16-bit half-precision bit pattern using
+// integer bit manipulation. The sign is stripped first, the magnitude is
+// converted, and the sign is merged back into bit 15 of the result.
+static FORCEINLINE int16_t __float_to_half_uniform(float f) {
+    uint32_t sign_mask = 0x80000000u;
+    int32_t o;
+
+    int32_t fint = __intbits(f);
+    int32_t sign = fint & sign_mask;
+    fint ^= sign;   // fint now holds the magnitude bits only
+
+    // Default result for the Inf/NaN case: 0x7e00 when the magnitude bits are
+    // above those of float infinity (a NaN), otherwise 0x7c00.
+    int32_t f32infty = 255 << 23;
+    o = (fint > f32infty) ? 0x7e00 : 0x7c00; 
+
+    // (De)normalized number or zero
+    // update fint unconditionally to save the blending; we don't need it
+    // anymore for the Inf/NaN case anyway.
+    const uint32_t round_mask = ~0xfffu; 
+    const int32_t magic = 15 << 23;
+    const int32_t f16infty = 31 << 23;
+
+    // Subtracting the unsigned round_mask (0xfffff000) is evaluated modulo
+    // 2^32 and therefore adds 0x1000 to the bit pattern.
+    int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask;
+    fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
+
+    if (fint < f32infty)
+        o = fint2 >> 13; // Take the bits!
+
+    // sign >> 16 brings the float sign bit down to bit 15; the conversion to
+    // int16_t on return keeps only the low 16 bits.
+    return (o | (sign >> 16));
+}
+
+
+// Converts each of the 32 float lanes of v to a half-precision bit pattern
+// by calling __float_to_half_uniform() per lane.
+static FORCEINLINE __vec32_i16 __float_to_half_varying(__vec32_f v) {
+    __vec32_i16 result;
+    for (int lane = 0; lane < 32; ++lane) {
+        result.v[lane] = __float_to_half_uniform(v.v[lane]);
+    }
+    return result;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// double
+
+// Arithmetic on __vec32_d, generated from BINARY_OP (defined outside this
+// section; presumably applies the operator to each of the 32 lanes).
+BINARY_OP(__vec32_d, __add, +)
+BINARY_OP(__vec32_d, __sub, -)
+BINARY_OP(__vec32_d, __mul, *)
+BINARY_OP(__vec32_d, __div, /)
+
+// Comparisons on double lanes using the built-in C++ operators.
+CMP_OP(__vec32_d, double, __equal, ==)
+CMP_OP(__vec32_d, double, __not_equal, !=)
+CMP_OP(__vec32_d, double, __less_than, <)
+CMP_OP(__vec32_d, double, __less_equal, <=)
+CMP_OP(__vec32_d, double, __greater_than, >)
+CMP_OP(__vec32_d, double, __greater_equal, >=)
+
+// Returns a mask with bit i set when both a.v[i] and b.v[i] compare equal to
+// themselves, i.e. when neither lane is a NaN.
+static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) {
+    __vec32_i1 result;
+    result.v = 0;
+    for (int lane = 0; lane < 32; ++lane) {
+        // A NaN is the only value for which x == x is false.
+        const bool aIsNumber = (a.v[lane] == a.v[lane]);
+        const bool bIsNumber = (b.v[lane] == b.v[lane]);
+        if (aIsNumber && bIsNumber)
+            result.v |= (1 << lane);
+    }
+    return result;
+}
+
+#if 0
+      case Instruction::FRem: intrinsic = "__frem"; break;
+#endif
+
+// Select, element access, broadcast/permutation and memory helpers for
+// __vec32_d, generated from macros defined outside this section.
+SELECT(__vec32_d)
+INSERT_EXTRACT(__vec32_d, double)
+SMEAR(__vec32_d, double, double)
+BROADCAST(__vec32_d, double, double)
+ROTATE(__vec32_d, double, double)
+SHUFFLES(__vec32_d, double, double)
+LOAD_STORE(__vec32_d, double)
+
+///////////////////////////////////////////////////////////////////////////
+// casts
+
+
+// CAST(TO, STO, FROM, SFROM, FUNC) defines FUNC(TO, FROM val): each of the 32
+// lanes of val is first cast to the scalar type SFROM and then to STO before
+// being stored in the result of vector type TO. The leading unnamed TO
+// parameter is never read.
+// NOTE(review): it presumably exists only so overload resolution can select
+// the destination type -- confirm against the callers.
+#define CAST(TO, STO, FROM, SFROM, FUNC)        \
+static FORCEINLINE TO FUNC(TO, FROM val) {      \
+    TO ret;                                     \
+    for (int i = 0; i < 32; ++i)                \
+        ret.v[i] = (STO)((SFROM)(val.v[i]));    \
+    return ret;                                 \
+}
+
+// sign extension conversions: lanes are read as the signed source type and
+// widened to the signed destination type.
+CAST(__vec32_i64, int64_t, __vec32_i32, int32_t, __cast_sext)
+CAST(__vec32_i64, int64_t, __vec32_i16, int16_t, __cast_sext)
+CAST(__vec32_i64, int64_t, __vec32_i8,  int8_t,  __cast_sext)
+CAST(__vec32_i32, int32_t, __vec32_i16, int16_t, __cast_sext)
+CAST(__vec32_i32, int32_t, __vec32_i8,  int8_t,  __cast_sext)
+CAST(__vec32_i16, int16_t, __vec32_i8,  int8_t,  __cast_sext)
+
+// CAST_SEXT_I1(TYPE) defines __cast_sext(TYPE, __vec32_i1 v): lane i of the
+// result is the bitwise complement of zero (all bits set) when bit i of the
+// mask is set, and zero otherwise. The leading unnamed TYPE parameter is
+// never read.
+#define CAST_SEXT_I1(TYPE)                            \
+static FORCEINLINE TYPE __cast_sext(TYPE, __vec32_i1 v) {  \
+    TYPE ret;                                         \
+    for (int i = 0; i < 32; ++i) {                    \
+        ret.v[i] = 0;                                 \
+        if (v.v & (1 << i))                           \
+            ret.v[i] = ~ret.v[i];                     \
+    }                                                 \
+    return ret;                                       \
+}
+
+CAST_SEXT_I1(__vec32_i8)
+CAST_SEXT_I1(__vec32_i16)
+CAST_SEXT_I1(__vec32_i32)
+CAST_SEXT_I1(__vec32_i64)
+
+// zero extension: lanes are read as the unsigned source type and widened to
+// the unsigned destination type, so the upper bits of the result are zero.
+CAST(__vec32_i64, uint64_t, __vec32_i32, uint32_t, __cast_zext)
+CAST(__vec32_i64, uint64_t, __vec32_i16, uint16_t, __cast_zext)
+CAST(__vec32_i64, uint64_t, __vec32_i8,  uint8_t,  __cast_zext)
+CAST(__vec32_i32, uint32_t, __vec32_i16, uint16_t, __cast_zext)
+CAST(__vec32_i32, uint32_t, __vec32_i8,  uint8_t,  __cast_zext)
+CAST(__vec32_i16, uint16_t, __vec32_i8,  uint8_t,  __cast_zext)
+
+// CAST_ZEXT_I1(TYPE) defines __cast_zext(TYPE, __vec32_i1 v): lane i of the
+// result is 1 when bit i of the mask is set and 0 otherwise. The leading
+// unnamed TYPE parameter is never read.
+#define CAST_ZEXT_I1(TYPE)                            \
+static FORCEINLINE TYPE __cast_zext(TYPE, __vec32_i1 v) {  \
+    TYPE ret;                                         \
+    for (int i = 0; i < 32; ++i)                      \
+        ret.v[i] = (v.v & (1 << i)) ? 1 : 0;          \
+    return ret;                                       \
+}
+
+CAST_ZEXT_I1(__vec32_i8)
+CAST_ZEXT_I1(__vec32_i16)
+CAST_ZEXT_I1(__vec32_i32)
+CAST_ZEXT_I1(__vec32_i64)
+
+// truncations: each lane is converted from the wider signed type to the
+// narrower signed type with a C-style cast.
+CAST(__vec32_i32, int32_t, __vec32_i64, int64_t, __cast_trunc)
+CAST(__vec32_i16, int16_t, __vec32_i64, int64_t, __cast_trunc)
+CAST(__vec32_i8,  int8_t,  __vec32_i64, int64_t, __cast_trunc)
+CAST(__vec32_i16, int16_t, __vec32_i32, int32_t, __cast_trunc)
+CAST(__vec32_i8,  int8_t,  __vec32_i32, int32_t, __cast_trunc)
+CAST(__vec32_i8,  int8_t,  __vec32_i16, int16_t, __cast_trunc)
+
+// signed int to float/double: lanes are read as the signed integer type
+// before the conversion to floating point.
+CAST(__vec32_f, float, __vec32_i8,   int8_t,  __cast_sitofp)
+CAST(__vec32_f, float, __vec32_i16,  int16_t, __cast_sitofp)
+CAST(__vec32_f, float, __vec32_i32,  int32_t, __cast_sitofp)
+CAST(__vec32_f, float, __vec32_i64,  int64_t, __cast_sitofp)
+CAST(__vec32_d, double, __vec32_i8,  int8_t,  __cast_sitofp)
+CAST(__vec32_d, double, __vec32_i16, int16_t, __cast_sitofp)
+CAST(__vec32_d, double, __vec32_i32, int32_t, __cast_sitofp)
+CAST(__vec32_d, double, __vec32_i64, int64_t, __cast_sitofp)
+
+// unsigned int to float/double: lanes are cast to the unsigned integer type
+// first, so the conversion to floating point treats them as non-negative.
+CAST(__vec32_f, float, __vec32_i8,   uint8_t,  __cast_uitofp)
+CAST(__vec32_f, float, __vec32_i16,  uint16_t, __cast_uitofp)
+CAST(__vec32_f, float, __vec32_i32,  uint32_t, __cast_uitofp)
+CAST(__vec32_f, float, __vec32_i64,  uint64_t, __cast_uitofp)
+CAST(__vec32_d, double, __vec32_i8,  uint8_t,  __cast_uitofp)
+CAST(__vec32_d, double, __vec32_i16, uint16_t, __cast_uitofp)
+CAST(__vec32_d, double, __vec32_i32, uint32_t, __cast_uitofp)
+CAST(__vec32_d, double, __vec32_i64, uint64_t, __cast_uitofp)
+
+// Converts a boolean mask to floats: lane i becomes 1 when bit i of the mask
+// is set and 0 otherwise. The leading unnamed __vec32_f parameter is never
+// read.
+static FORCEINLINE __vec32_f __cast_uitofp(__vec32_f, __vec32_i1 v) {
+    __vec32_f result;
+    for (int lane = 0; lane < 32; ++lane) {
+        if (v.v & (1 << lane))
+            result.v[lane] = 1.;
+        else
+            result.v[lane] = 0.;
+    }
+    return result;
+}
+
+// float/double to signed int: each lane is converted with a C-style cast to
+// the signed destination type.
+CAST(__vec32_i8,  int8_t,  __vec32_f, float, __cast_fptosi)
+CAST(__vec32_i16, int16_t, __vec32_f, float, __cast_fptosi)
+CAST(__vec32_i32, int32_t, __vec32_f, float, __cast_fptosi)
+CAST(__vec32_i64, int64_t, __vec32_f, float, __cast_fptosi)
+CAST(__vec32_i8,  int8_t,  __vec32_d, double, __cast_fptosi)
+CAST(__vec32_i16, int16_t, __vec32_d, double, __cast_fptosi)
+CAST(__vec32_i32, int32_t, __vec32_d, double, __cast_fptosi)
+CAST(__vec32_i64, int64_t, __vec32_d, double, __cast_fptosi)
+
+// float/double to unsigned int: each lane is converted with a C-style cast
+// to the unsigned destination type before being stored.
+CAST(__vec32_i8,  uint8_t,  __vec32_f, float, __cast_fptoui)
+CAST(__vec32_i16, uint16_t, __vec32_f, float, __cast_fptoui)
+CAST(__vec32_i32, uint32_t, __vec32_f, float, __cast_fptoui)
+CAST(__vec32_i64, uint64_t, __vec32_f, float, __cast_fptoui)
+CAST(__vec32_i8,  uint8_t,  __vec32_d, double, __cast_fptoui)
+CAST(__vec32_i16, uint16_t, __vec32_d, double, __cast_fptoui)
+CAST(__vec32_i32, uint32_t, __vec32_d, double, __cast_fptoui)
+CAST(__vec32_i64, uint64_t, __vec32_d, double, __cast_fptoui)
+
+// float/double conversions (narrowing and widening between the two
+// floating-point vector types).
+CAST(__vec32_f, float,  __vec32_d, double, __cast_fptrunc)
+CAST(__vec32_d, double, __vec32_f, float,  __cast_fpext)
+
+// Scratch union used by CAST_BITS to reinterpret one lane's bits: a value is
+// written through one member and read back through another.
+typedef union {
+    int32_t i32;
+    float f;
+    int64_t i64;
+    double d;
+} BitcastUnion;
+
+// CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) defines __cast_bits(TO, FROM val).
+// For each of the 32 lanes the source value is stored into the BitcastUnion
+// member FROM_ELT and the result lane is read from member TO_ELT, so bits are
+// reinterpreted rather than numerically converted. The leading unnamed TO
+// parameter is never read.
+#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT)       \
+static FORCEINLINE TO __cast_bits(TO, FROM val) {   \
+    TO r;                                           \
+    for (int i = 0; i < 32; ++i) {                  \
+        BitcastUnion u;                             \
+        u.FROM_ELT = val.v[i];                      \
+        r.v[i] = u.TO_ELT;                          \
+    }                                               \
+    return r;                                       \
+}
+
+// Only same-width pairs are instantiated: 32-bit int <-> float and
+// 64-bit int <-> double.
+CAST_BITS(__vec32_f,   f,   __vec32_i32, i32)
+CAST_BITS(__vec32_i32, i32, __vec32_f,   f)
+CAST_BITS(__vec32_d,   d,   __vec32_i64, i64)
+CAST_BITS(__vec32_i64, i64, __vec32_d,   d)
+
+// CAST_BITS_SCALAR(TO, FROM) defines the scalar overload
+// __cast_bits(TO, FROM v): the argument is written to the `from` member of a
+// local union and the result is read from its `to` member, reinterpreting
+// the bits. The leading unnamed TO parameter is never read.
+#define CAST_BITS_SCALAR(TO, FROM)                  \
+static FORCEINLINE TO __cast_bits(TO, FROM v) {     \
+    union {                                         \
+    TO to;                                          \
+    FROM from;                                      \
+    } u;                                            \
+    u.from = v;                                     \
+    return u.to;                                    \
+}
+
+// Instantiated for the 32-bit (float) and 64-bit (double) pairs in both
+// directions, with signed and unsigned integer types.
+CAST_BITS_SCALAR(uint32_t, float)
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+
+// Intentionally empty in this implementation: calling it has no effect.
+// NOTE(review): presumably a hook where other targets change floating-point
+// mode settings -- confirm against the other target headers.
+static FORCEINLINE void __fastmath() {
+}
+
+// Scalar rounding helpers. Each one forwards directly to the C math library
+// function of matching precision.
+
+// roundf() of a single float.
+static FORCEINLINE float __round_uniform_float(float v) { return roundf(v); }
+
+// floorf() of a single float.
+static FORCEINLINE float __floor_uniform_float(float v) { return floorf(v); }
+
+// ceilf() of a single float.
+static FORCEINLINE float __ceil_uniform_float(float v) { return ceilf(v); }
+
+// round() of a single double.
+static FORCEINLINE double __round_uniform_double(double v) { return round(v); }
+
+// floor() of a single double.
+static FORCEINLINE double __floor_uniform_double(double v) { return floor(v); }
+
+// ceil() of a single double.
+static FORCEINLINE double __ceil_uniform_double(double v) { return ceil(v); }
+
+// Vector rounding functions built from the C math library routines.
+// UNARY_OP is defined outside this section; presumably it applies the named
+// function to each of the 32 lanes -- confirm against the macro.
+UNARY_OP(__vec32_f, __round_varying_float, roundf)
+UNARY_OP(__vec32_f, __floor_varying_float, floorf)
+UNARY_OP(__vec32_f, __ceil_varying_float, ceilf)
+UNARY_OP(__vec32_d, __round_varying_double, round)
+UNARY_OP(__vec32_d, __floor_varying_double, floor)
+UNARY_OP(__vec32_d, __ceil_varying_double, ceil)
+
+// min/max
+
+// Scalar min/max helpers. Every min returns `a` only when a < b holds and
+// every max returns `a` only when a > b holds; in all other cases (equal
+// values, or a comparison that is false because of a NaN) `b` is returned.
+static FORCEINLINE float __min_uniform_float(float a, float b) {
+    if (a < b)
+        return a;
+    return b;
+}
+static FORCEINLINE float __max_uniform_float(float a, float b) {
+    if (a > b)
+        return a;
+    return b;
+}
+static FORCEINLINE double __min_uniform_double(double a, double b) {
+    if (a < b)
+        return a;
+    return b;
+}
+static FORCEINLINE double __max_uniform_double(double a, double b) {
+    if (a > b)
+        return a;
+    return b;
+}
+
+// 32-bit integers. The uint32 variants compare as unsigned values and hand
+// the winner back through the int32_t return type.
+static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) {
+    if (a < b)
+        return a;
+    return b;
+}
+static FORCEINLINE int32_t __max_uniform_int32(int32_t a, int32_t b) {
+    if (a > b)
+        return a;
+    return b;
+}
+static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) {
+    if (a < b)
+        return a;
+    return b;
+}
+static FORCEINLINE int32_t __max_uniform_uint32(uint32_t a, uint32_t b) {
+    if (a > b)
+        return a;
+    return b;
+}
+
+// 64-bit integers. The uint64 variants compare as unsigned values and hand
+// the winner back through the int64_t return type.
+static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) {
+    if (a < b)
+        return a;
+    return b;
+}
+static FORCEINLINE int64_t __max_uniform_int64(int64_t a, int64_t b) {
+    if (a > b)
+        return a;
+    return b;
+}
+static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) {
+    if (a < b)
+        return a;
+    return b;
+}
+static FORCEINLINE int64_t __max_uniform_uint64(uint64_t a, uint64_t b) {
+    if (a > b)
+        return a;
+    return b;
+}
+
+
+// Vector min/max built from the scalar helpers above. BINARY_OP_FUNC is
+// defined outside this section; presumably it calls the named scalar
+// function on each pair of lanes -- confirm against the macro.
+BINARY_OP_FUNC(__vec32_f, __max_varying_float, __max_uniform_float)
+BINARY_OP_FUNC(__vec32_f, __min_varying_float, __min_uniform_float)
+BINARY_OP_FUNC(__vec32_d, __max_varying_double, __max_uniform_double)
+BINARY_OP_FUNC(__vec32_d, __min_varying_double, __min_uniform_double)
+
+// Signed and unsigned variants share the __vec32_i32 vector type; the
+// scalar helper determines which comparison is used.
+BINARY_OP_FUNC(__vec32_i32, __max_varying_int32, __max_uniform_int32)
+BINARY_OP_FUNC(__vec32_i32, __min_varying_int32, __min_uniform_int32)
+BINARY_OP_FUNC(__vec32_i32, __max_varying_uint32, __max_uniform_uint32)
+BINARY_OP_FUNC(__vec32_i32, __min_varying_uint32, __min_uniform_uint32)
+
+// Same pattern for the 64-bit integer vector type.
+BINARY_OP_FUNC(__vec32_i64, __max_varying_int64, __max_uniform_int64)
+BINARY_OP_FUNC(__vec32_i64, __min_varying_int64, __min_uniform_int64)
+BINARY_OP_FUNC(__vec32_i64, __max_varying_uint64, __max_uniform_uint64)
+BINARY_OP_FUNC(__vec32_i64, __min_varying_uint64, __min_uniform_uint64)
+
+// sqrt/rsqrt/rcp
+
+// Reciprocal square root of a float, computed as 1 / sqrtf(v) (a full
+// division and square root, not an approximation).
+static FORCEINLINE float __rsqrt_uniform_float(float v) {
+    const float root = sqrtf(v);
+    return 1.f / root;
+}
+
+// Reciprocal of a float, computed with a full division.
+static FORCEINLINE float __rcp_uniform_float(float v) {
+    const float one = 1.f;
+    return one / v;
+}
+
+// Square root of a float via sqrtf().
+static FORCEINLINE float __sqrt_uniform_float(float v) {
+    const float root = sqrtf(v);
+    return root;
+}
+
+// Square root of a double via sqrt().
+static FORCEINLINE double __sqrt_uniform_double(double v) {
+    const double root = sqrt(v);
+    return root;
+}
+
+// Vector versions of the scalar helpers above. UNARY_OP is defined outside
+// this section; presumably it applies the named function to each lane.
+UNARY_OP(__vec32_f, __rcp_varying_float, __rcp_uniform_float)
+UNARY_OP(__vec32_f, __rsqrt_varying_float, __rsqrt_uniform_float)
+UNARY_OP(__vec32_f, __sqrt_varying_float, __sqrt_uniform_float)
+UNARY_OP(__vec32_d, __sqrt_varying_double, __sqrt_uniform_double)
+
+///////////////////////////////////////////////////////////////////////////
+// bit ops
+
+// Counts the set bits of a 32-bit value by testing the lowest bit and
+// shifting right until no set bits remain.
+static FORCEINLINE int32_t __popcnt_int32(uint32_t v) {
+    int bits = 0;
+    while (v != 0) {
+        bits += (v & 1);
+        v >>= 1;
+    }
+    return bits;
+}
+
+// Counts the set bits of a 64-bit value by testing the lowest bit and
+// shifting right until no set bits remain.
+static FORCEINLINE int32_t __popcnt_int64(uint64_t v) {
+    int bits = 0;
+    while (v != 0) {
+        bits += (v & 1);
+        v >>= 1;
+    }
+    return bits;
+}
+
+// Number of consecutive zero bits starting at bit 0 of v; returns 32 when v
+// is zero.
+static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) {
+    if (v == 0)
+        return 32;
+
+    // v is non-zero here, so a set bit eventually reaches bit 0.
+    int zeros = 0;
+    for (; (v & 1) == 0; v >>= 1)
+        ++zeros;
+    return zeros;
+}
+
+// Number of consecutive zero bits starting at bit 0 of v; returns 64 when v
+// is zero.
+static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) {
+    if (v == 0)
+        return 64;
+
+    // v is non-zero here, so a set bit eventually reaches bit 0.
+    int zeros = 0;
+    for (; (v & 1) == 0; v >>= 1)
+        ++zeros;
+    return zeros;
+}
+
+// Number of consecutive zero bits starting at bit 31 of v; returns 32 when v
+// is zero.
+static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) {
+    if (v == 0)
+        return 32;
+
+    // Use an unsigned constant for the top bit: the expression (1<<31)
+    // shifts a signed int into its sign bit, which is not portable.
+    const uint32_t topBit = 0x80000000u;
+
+    // v is non-zero here, so a set bit eventually reaches bit 31.
+    int count = 0;
+    while ((v & topBit) == 0) {
+        ++count;
+        v <<= 1;
+    }
+    return count;
+}
+
+// Number of consecutive zero bits starting at bit 63 of v; returns 64 when v
+// is zero.
+static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) {
+    if (v == 0)
+        return 64;
+
+    // v is non-zero here, so a set bit eventually reaches bit 63.
+    int zeros = 0;
+    for (; (v & (1ull<<63)) == 0; v <<= 1)
+        ++zeros;
+    return zeros;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// reductions
+
+// Horizontal reductions across the 32 lanes. REDUCE_ADD and REDUCE_MINMAX
+// are defined outside this section; the first argument is the scalar type
+// passed to the macro, the second the vector type, the third the generated
+// function name, and (for REDUCE_MINMAX) the fourth the comparison operator.
+REDUCE_ADD(float, __vec32_f, __reduce_add_float)
+REDUCE_MINMAX(float, __vec32_f, __reduce_min_float, <)
+REDUCE_MINMAX(float, __vec32_f, __reduce_max_float, >)
+
+REDUCE_ADD(double, __vec32_d, __reduce_add_double)
+REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <)
+REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >)
+
+// Note that the int32 add reduction is instantiated with uint32_t, while the
+// min/max reductions use int32_t.
+// NOTE(review): presumably so the sum uses unsigned arithmetic -- confirm
+// against REDUCE_ADD.
+REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_int32)
+REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <)
+REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >)
+
+REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32)
+REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <)
+REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >)
+
+// Same pattern for 64-bit lanes: the add reduction is instantiated with
+// uint64_t for both the int64 and uint64 variants.
+REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_int64)
+REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <)
+REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >)
+
+REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64)
+REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <)
+REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >)
+
+///////////////////////////////////////////////////////////////////////////
+// masked load/store
+
+// Loads 8-bit elements from consecutive addresses starting at p into the
+// lanes whose bit is set in mask. Memory for lanes with a clear mask bit is
+// never accessed, and those lanes of the result are left unassigned.
+static FORCEINLINE __vec32_i8 __masked_load_8(void *p,
+                                              __vec32_i1 mask) {
+    int8_t *src = (int8_t *)p;
+    __vec32_i8 result;
+    for (int lane = 0; lane < 32; ++lane) {
+        if ((mask.v & (1 << lane)) == 0)
+            continue;
+        result.v[lane] = src[lane];
+    }
+    return result;
+}
+
+// Loads 16-bit elements from consecutive addresses starting at p into the
+// lanes whose bit is set in mask. Memory for lanes with a clear mask bit is
+// never accessed, and those lanes of the result are left unassigned.
+static FORCEINLINE __vec32_i16 __masked_load_16(void *p,
+                                                __vec32_i1 mask) {
+    int16_t *src = (int16_t *)p;
+    __vec32_i16 result;
+    for (int lane = 0; lane < 32; ++lane) {
+        if ((mask.v & (1 << lane)) == 0)
+            continue;
+        result.v[lane] = src[lane];
+    }
+    return result;
+}
+
+// Loads 32-bit elements from consecutive addresses starting at p into the
+// lanes whose bit is set in mask. Memory for lanes with a clear mask bit is
+// never accessed, and those lanes of the result are left unassigned.
+static FORCEINLINE __vec32_i32 __masked_load_32(void *p,
+                                                __vec32_i1 mask) {
+    int32_t *src = (int32_t *)p;
+    __vec32_i32 result;
+    for (int lane = 0; lane < 32; ++lane) {
+        if ((mask.v & (1 << lane)) == 0)
+            continue;
+        result.v[lane] = src[lane];
+    }
+    return result;
+}
+
+// Loads 64-bit elements from consecutive addresses starting at p into the
+// lanes whose bit is set in mask. Memory for lanes with a clear mask bit is
+// never accessed, and those lanes of the result are left unassigned.
+static FORCEINLINE __vec32_i64 __masked_load_64(void *p,
+                                                __vec32_i1 mask) {
+    int64_t *src = (int64_t *)p;
+    __vec32_i64 result;
+    for (int lane = 0; lane < 32; ++lane) {
+        if ((mask.v & (1 << lane)) == 0)
+            continue;
+        result.v[lane] = src[lane];
+    }
+    return result;
+}
+
+// Stores the lanes of val whose mask bit is set to consecutive 8-bit slots
+// starting at p; slots belonging to lanes with a clear bit are not written.
+static FORCEINLINE void __masked_store_8(void *p, __vec32_i8 val,
+                                         __vec32_i1 mask) {
+    int8_t *dst = (int8_t *)p;
+    for (int lane = 0; lane < 32; ++lane) {
+        if ((mask.v & (1 << lane)) == 0)
+            continue;
+        dst[lane] = val.v[lane];
+    }
+}
+
+// Stores the lanes of val whose mask bit is set to consecutive 16-bit slots
+// starting at p; slots belonging to lanes with a clear bit are not written.
+static FORCEINLINE void __masked_store_16(void *p, __vec32_i16 val,
+                                          __vec32_i1 mask) {
+    int16_t *dst = (int16_t *)p;
+    for (int lane = 0; lane < 32; ++lane) {
+        if ((mask.v & (1 << lane)) == 0)
+            continue;
+        dst[lane] = val.v[lane];
+    }
+}
+
+// Stores the lanes of val whose mask bit is set to consecutive 32-bit slots
+// starting at p; slots belonging to lanes with a clear bit are not written.
+static FORCEINLINE void __masked_store_32(void *p, __vec32_i32 val,
+                                          __vec32_i1 mask) {
+    int32_t *dst = (int32_t *)p;
+    for (int lane = 0; lane < 32; ++lane) {
+        if ((mask.v & (1 << lane)) == 0)
+            continue;
+        dst[lane] = val.v[lane];
+    }
+}
+
+// Stores the lanes of val whose mask bit is set to consecutive 64-bit slots
+// starting at p; slots belonging to lanes with a clear bit are not written.
+static FORCEINLINE void __masked_store_64(void *p, __vec32_i64 val,
+                                          __vec32_i1 mask) {
+    int64_t *dst = (int64_t *)p;
+    for (int lane = 0; lane < 32; ++lane) {
+        if ((mask.v & (1 << lane)) == 0)
+            continue;
+        dst[lane] = val.v[lane];
+    }
+}
+
+// "Blend" flavour of the 8-bit masked store. In this implementation it does
+// exactly what __masked_store_8 does: only lanes with a set mask bit are
+// written to memory.
+static FORCEINLINE void __masked_store_blend_8(void *p, __vec32_i8 val,
+                                               __vec32_i1 mask) {
+    int8_t *dst = (int8_t *)p;
+    for (int lane = 0; lane < 32; ++lane)
+        if ((mask.v & (1 << lane)) != 0)
+            dst[lane] = val.v[lane];
+}
+
+// "Blend" flavour of the 16-bit masked store. In this implementation it does
+// exactly what __masked_store_16 does: only lanes with a set mask bit are
+// written to memory.
+static FORCEINLINE void __masked_store_blend_16(void *p, __vec32_i16 val,
+                                                __vec32_i1 mask) {
+    int16_t *dst = (int16_t *)p;
+    for (int lane = 0; lane < 32; ++lane)
+        if ((mask.v & (1 << lane)) != 0)
+            dst[lane] = val.v[lane];
+}
+
+// "Blend" flavour of the 32-bit masked store. In this implementation it does
+// exactly what __masked_store_32 does: only lanes with a set mask bit are
+// written to memory.
+static FORCEINLINE void __masked_store_blend_32(void *p, __vec32_i32 val,
+                                                __vec32_i1 mask) {
+    int32_t *dst = (int32_t *)p;
+    for (int lane = 0; lane < 32; ++lane)
+        if ((mask.v & (1 << lane)) != 0)
+            dst[lane] = val.v[lane];
+}
+
+// "Blend" flavour of the 64-bit masked store. In this implementation it does
+// exactly what __masked_store_64 does: only lanes with a set mask bit are
+// written to memory.
+static FORCEINLINE void __masked_store_blend_64(void *p, __vec32_i64 val,
+                                                __vec32_i1 mask) {
+    int64_t *dst = (int64_t *)p;
+    for (int lane = 0; lane < 32; ++lane)
+        if ((mask.v & (1 << lane)) != 0)
+            dst[lane] = val.v[lane];
+}
+
+///////////////////////////////////////////////////////////////////////////
+// gather/scatter
+
+// offsets * offsetScale is in bytes (for all of these)
+
+// GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) defines
+//   VTYPE FUNC(b, varyingOffset, scale, constOffset, mask)
+// For every lane i whose bit is set in mask, an STYPE value is read from
+//   b + scale * varyingOffset.v[i] + constOffset.v[i]     (byte address)
+// and stored in lane i of the result. Lanes with a clear mask bit are not
+// read from memory and are left unassigned in the result.
+//
+// The byte offset is computed in int64_t. Multiplying the uint32_t scale by
+// a 32-bit signed lane directly would be evaluated as a 32-bit unsigned
+// product, so a negative offset would turn into a large positive
+// displacement when added to a 64-bit pointer.
+#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                  \
+static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset,    \
+                              uint32_t scale, OTYPE constOffset,        \
+                              __vec32_i1 mask) {                        \
+    VTYPE ret;                                                          \
+    int8_t *base = (int8_t *)b;                                         \
+    for (int i = 0; i < 32; ++i)                                        \
+        if ((mask.v & (1 << i)) != 0) {                                 \
+            int64_t byteOffset =                                        \
+                (int64_t)scale * (int64_t)varyingOffset.v[i] +          \
+                (int64_t)constOffset.v[i];                              \
+            STYPE *ptr = (STYPE *)(base + byteOffset);                  \
+            ret.v[i] = *ptr;                                            \
+        }                                                               \
+    return ret;                                                         \
+}
+    
+
+// Base+offset gathers for every element width, each instantiated once with
+// 32-bit offset vectors (__vec32_i32) and once with 64-bit offset vectors
+// (__vec32_i64).
+GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_base_offsets32_i8)
+GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_base_offsets64_i8)
+GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_base_offsets32_i16)
+GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_base_offsets64_i16)
+GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_base_offsets32_i32)
+GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_base_offsets64_i32)
+GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_base_offsets32_i64)
+GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_base_offsets64_i64)
+
+// GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) defines
+//   VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask)
+// For every lane i whose bit is set in mask, the integer in ptrs.v[i] is
+// cast to an STYPE pointer and dereferenced into lane i of the result.
+// Lanes with a clear mask bit are not read and are left unassigned.
+#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)         \
+static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask) {   \
+    VTYPE ret;                                              \
+    for (int i = 0; i < 32; ++i)                            \
+        if ((mask.v & (1 << i)) != 0) {                     \
+            STYPE *ptr = (STYPE *)ptrs.v[i];                \
+            ret.v[i] = *ptr;                                \
+        }                                                   \
+    return ret;                                             \
+}
+
+// Instantiated for every element width with pointers held in 32-bit lanes
+// (__vec32_i32) and in 64-bit lanes (__vec32_i64).
+GATHER_GENERAL(__vec32_i8, int8_t, __vec32_i32, __gather32_i8)
+GATHER_GENERAL(__vec32_i8, int8_t, __vec32_i64, __gather64_i8)
+GATHER_GENERAL(__vec32_i16, int16_t, __vec32_i32, __gather32_i16)
+GATHER_GENERAL(__vec32_i16, int16_t, __vec32_i64, __gather64_i16)
+GATHER_GENERAL(__vec32_i32, int32_t, __vec32_i32, __gather32_i32)
+GATHER_GENERAL(__vec32_i32, int32_t, __vec32_i64, __gather64_i32)
+GATHER_GENERAL(__vec32_i64, int64_t, __vec32_i32, __gather32_i64)
+GATHER_GENERAL(__vec32_i64, int64_t, __vec32_i64, __gather64_i64)
+
+// scatter
+
+// SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) defines
+//   void FUNC(b, varyingOffset, scale, constOffset, val, mask)
+// For every lane i whose bit is set in mask, val.v[i] is written as an STYPE
+// to
+//   b + scale * varyingOffset.v[i] + constOffset.v[i]     (byte address)
+// Nothing is written for lanes with a clear mask bit.
+//
+// The byte offset is computed in int64_t. Multiplying the uint32_t scale by
+// a 32-bit signed lane directly would be evaluated as a 32-bit unsigned
+// product, so a negative offset would turn into a large positive
+// displacement when added to a 64-bit pointer.
+#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                 \
+static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset,     \
+                             uint32_t scale, OTYPE constOffset,         \
+                             VTYPE val, __vec32_i1 mask) {              \
+    int8_t *base = (int8_t *)b;                                         \
+    for (int i = 0; i < 32; ++i)                                        \
+        if ((mask.v & (1 << i)) != 0) {                                 \
+            int64_t byteOffset =                                        \
+                (int64_t)scale * (int64_t)varyingOffset.v[i] +          \
+                (int64_t)constOffset.v[i];                              \
+            STYPE *ptr = (STYPE *)(base + byteOffset);                  \
+            *ptr = val.v[i];                                            \
+        }                                                               \
+}
+    
+
+// Base+offset scatters for every element width, each instantiated once with
+// 32-bit offset vectors (__vec32_i32) and once with 64-bit offset vectors
+// (__vec32_i64).
+SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_base_offsets32_i8)
+SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_base_offsets64_i8)
+SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_base_offsets32_i16)
+SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_base_offsets64_i16)
+SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_base_offsets32_i32)
+SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_base_offsets64_i32)
+SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_base_offsets32_i64)
+SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_base_offsets64_i64)
+
+// SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) defines
+//   void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask)
+// For every lane i whose bit is set in mask, the integer in ptrs.v[i] is
+// cast to an STYPE pointer and val.v[i] is written through it. Nothing is
+// written for lanes with a clear mask bit.
+#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)                 \
+static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask) {  \
+    for (int i = 0; i < 32; ++i)                                     \
+        if ((mask.v & (1 << i)) != 0) {                              \
+            STYPE *ptr = (STYPE *)ptrs.v[i];                         \
+            *ptr = val.v[i];                                         \
+        }                                                            \
+}
+
+SCATTER_GENERAL(__vec32_i8, int8_t, __vec32_i32, __scatter32_i8)  // args: value vector, stored element type, pointer vector, generated function name
+SCATTER_GENERAL(__vec32_i8, int8_t, __vec32_i64, __scatter64_i8)
+SCATTER_GENERAL(__vec32_i16, int16_t, __vec32_i32, __scatter32_i16)
+SCATTER_GENERAL(__vec32_i16, int16_t, __vec32_i64, __scatter64_i16)
+SCATTER_GENERAL(__vec32_i32, int32_t, __vec32_i32, __scatter32_i32)
+SCATTER_GENERAL(__vec32_i32, int32_t, __vec32_i64, __scatter64_i32)
+SCATTER_GENERAL(__vec32_i64, int64_t, __vec32_i32, __scatter32_i64)
+SCATTER_GENERAL(__vec32_i64, int64_t, __vec32_i64, __scatter64_i64)
+
+///////////////////////////////////////////////////////////////////////////
+// packed load/store
+
+static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec32_i32 *val,
+                                                __vec32_i1 mask) {
+    // Fills the lanes of *val whose mask bit is set from consecutive ptr values; returns the count read.
+    int count = 0;
+    for (int i = 0; i < 32; ++i)
+        if ((mask.v & (1u << i)) != 0) {  // 1u: lane 31 must not shift into int's sign bit
+            val->v[i] = *ptr++;
+            ++count;
+        }
+    return count;
+}
+
+
+static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec32_i32 val,
+                                                 __vec32_i1 mask) {
+    // Writes val.v[i] for every lane whose mask bit is set to consecutive ptr slots; returns the count written.
+    int count = 0;
+    for (int i = 0; i < 32; ++i)
+        if ((mask.v & (1u << i)) != 0) {  // 1u: lane 31 must not shift into int's sign bit
+            *ptr++ = val.v[i];
+            ++count;
+        }
+    return count;
+}
+
+static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr,
+                                                __vec32_i32 *val,
+                                                __vec32_i1 mask) {
+    // Unsigned-pointer overload: fills the lanes of *val whose mask bit is set from consecutive ptr values.
+    int count = 0;
+    for (int i = 0; i < 32; ++i)
+        if ((mask.v & (1u << i)) != 0) {  // 1u: lane 31 must not shift into int's sign bit
+            val->v[i] = *ptr++;
+            ++count;
+        }
+    return count;  // number of values read
+}
+
+
+static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr,
+                                                 __vec32_i32 val,
+                                                 __vec32_i1 mask) {
+    // Unsigned-pointer overload: writes val.v[i] for every lane whose mask bit is set to consecutive ptr slots.
+    int count = 0;
+    for (int i = 0; i < 32; ++i)
+        if ((mask.v & (1u << i)) != 0) {  // 1u: lane 31 must not shift into int's sign bit
+            *ptr++ = val.v[i];
+            ++count;
+        }
+    return count;  // number of values written
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// aos/soa
+
+static FORCEINLINE void __soa_to_aos3_float(__vec32_f v0, __vec32_f v1, __vec32_f v2,
+                                            float *ptr) {
+    for (int lane = 0; lane < 32; ++lane, ptr += 3) {  // one interleaved 3-float record per lane
+        ptr[0] = __extract_element(v0, lane);
+        ptr[1] = __extract_element(v1, lane);
+        ptr[2] = __extract_element(v2, lane);
+    }
+}
+
+static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec32_f *out0, __vec32_f *out1,
+                                            __vec32_f *out2) {
+    for (int lane = 0; lane < 32; ++lane, ptr += 3) {  // split one 3-float record across the three outputs
+        __insert_element(out0, lane, ptr[0]);
+        __insert_element(out1, lane, ptr[1]);
+        __insert_element(out2, lane, ptr[2]);
+    }
+}
+
+static FORCEINLINE void __soa_to_aos4_float(__vec32_f v0, __vec32_f v1, __vec32_f v2,
+                                            __vec32_f v3, float *ptr) {
+    for (int lane = 0; lane < 32; ++lane, ptr += 4) {  // one interleaved 4-float record per lane
+        ptr[0] = __extract_element(v0, lane);
+        ptr[1] = __extract_element(v1, lane);
+        ptr[2] = __extract_element(v2, lane);
+        ptr[3] = __extract_element(v3, lane);
+    }
+}
+
+static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec32_f *out0, __vec32_f *out1,
+                                            __vec32_f *out2, __vec32_f *out3) {
+    for (int lane = 0; lane < 32; ++lane, ptr += 4) {  // split one 4-float record across the four outputs
+        __insert_element(out0, lane, ptr[0]);
+        __insert_element(out1, lane, ptr[1]);
+        __insert_element(out2, lane, ptr[2]);
+        __insert_element(out3, lane, ptr[3]);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////
+// prefetch
+
+static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *) {
+}  // empty body: the unnamed pointer argument is ignored, so this is a no-op
+
+static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *) {
+}  // empty body: the unnamed pointer argument is ignored, so this is a no-op
+
+static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {
+}  // empty body: the unnamed pointer argument is ignored, so this is a no-op
+
+static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {
+}  // empty body: the unnamed pointer argument is ignored, so this is a no-op
+
+///////////////////////////////////////////////////////////////////////////
+// atomics
+
+static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) {  // atomically adds v to *p; returns the value *p held before
+#ifdef _MSC_VER
+    return InterlockedAdd((LONG volatile *)p, v) - v;  // InterlockedAdd yields the sum, so subtract v to recover the old value
+#else
+    return __sync_fetch_and_add(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) {  // atomically subtracts v from *p; returns the value *p held before
+#ifdef _MSC_VER
+    return InterlockedAdd((LONG volatile *)p, -v) + v;  // adds the negated v, then adds v back to the result to recover the old value
+#else
+    return __sync_fetch_and_sub(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) {  // atomically ANDs v into *p; returns the value *p held before
+#ifdef _MSC_VER
+    return InterlockedAnd((LONG volatile *)p, v);
+#else
+    return __sync_fetch_and_and(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) {  // atomically ORs v into *p; returns the value *p held before
+#ifdef _MSC_VER
+    return InterlockedOr((LONG volatile *)p, v);
+#else
+    return __sync_fetch_and_or(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) {  // atomically XORs v into *p; returns the value *p held before
+#ifdef _MSC_VER
+    return InterlockedXor((LONG volatile *)p, v);
+#else
+    return __sync_fetch_and_xor(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) {
+    int32_t seen, wanted;  // signed minimum: *p and v are both compared as int32_t
+    do {  // compare-and-swap loop: retry until *p still holds the value that was read
+        seen = *((volatile int32_t *)p);
+        wanted = ((int32_t)v <= seen) ? (int32_t)v : seen;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, wanted, seen) != seen);
+#else
+    } while (!__sync_bool_compare_and_swap(p, seen, wanted));
+#endif
+    return seen;  // value observed before the update
+}
+
+static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) {
+    int32_t seen, wanted;  // signed maximum: *p and v are both compared as int32_t
+    do {  // compare-and-swap loop: retry until *p still holds the value that was read
+        seen = *((volatile int32_t *)p);
+        wanted = ((int32_t)v >= seen) ? (int32_t)v : seen;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, wanted, seen) != seen);
+#else
+    } while (!__sync_bool_compare_and_swap(p, seen, wanted));
+#endif
+    return seen;  // value observed before the update
+}
+
+static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) {
+    uint32_t seen, wanted;  // unsigned minimum
+    do {  // compare-and-swap loop: retry until *p still holds the value that was read
+        seen = *((volatile uint32_t *)p);
+        wanted = (v <= seen) ? v : seen;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, wanted, seen) != seen);
+#else
+    } while (!__sync_bool_compare_and_swap(p, seen, wanted));
+#endif
+    return seen;  // value observed before the update
+}
+
+static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) {
+    uint32_t seen, wanted;  // unsigned maximum
+    do {  // compare-and-swap loop: retry until *p still holds the value that was read
+        seen = *((volatile uint32_t *)p);
+        wanted = (v >= seen) ? v : seen;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, wanted, seen) != seen);
+#else
+    } while (!__sync_bool_compare_and_swap(p, seen, wanted));
+#endif
+    return seen;  // value observed before the update
+}
+
+static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) {  // atomically stores v in *p; returns the previous contents
+#ifdef _MSC_VER
+    return InterlockedExchange((LONG volatile *)p, v);
+#else
+    return __sync_lock_test_and_set(p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval,
+                                             uint32_t newval) {  // stores newval only if *p == cmpval; returns the prior *p either way
+#ifdef _MSC_VER
+    return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval);  // note the Win32 argument order: exchange value, then comparand
+#else
+    return __sync_val_compare_and_swap(p, cmpval, newval);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) {  // 64-bit: atomically adds v to *p; returns the value *p held before
+#ifdef _MSC_VER
+    return InterlockedAdd64((LONGLONG volatile *)p, v) - v;  // InterlockedAdd64 yields the sum, so subtract v to recover the old value
+#else
+    return __sync_fetch_and_add(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) {  // 64-bit: atomically subtracts v from *p; returns the value *p held before
+#ifdef _MSC_VER
+    return InterlockedAdd64((LONGLONG volatile *)p, -v) + v;  // adds the negated v, then adds v back to the result to recover the old value
+#else
+    return __sync_fetch_and_sub(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) {  // 64-bit: atomically ANDs v into *p; returns the value *p held before
+#ifdef _MSC_VER
+    return InterlockedAnd64((LONGLONG volatile *)p, v);  // already the original value: no "- v" adjustment, unlike the add variant
+#else
+    return __sync_fetch_and_and(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) {  // 64-bit: atomically ORs v into *p; returns the value *p held before
+#ifdef _MSC_VER
+    return InterlockedOr64((LONGLONG volatile *)p, v);  // already the original value: no "- v" adjustment, unlike the add variant
+#else
+    return __sync_fetch_and_or(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) {  // 64-bit: atomically XORs v into *p; returns the value *p held before
+#ifdef _MSC_VER
+    return InterlockedXor64((LONGLONG volatile *)p, v);  // already the original value: no "- v" adjustment, unlike the add variant
+#else
+    return __sync_fetch_and_xor(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) {
+    int64_t seen, wanted;  // signed minimum: *p and v are both compared as int64_t
+    do {  // compare-and-swap loop: retry until *p still holds the value that was read
+        seen = *((volatile int64_t *)p);
+        wanted = ((int64_t)v <= seen) ? (int64_t)v : seen;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, wanted, seen) != seen);
+#else
+    } while (!__sync_bool_compare_and_swap(p, seen, wanted));
+#endif
+    return seen;  // value observed before the update
+}
+
+static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) {
+    int64_t seen, wanted;  // signed maximum: *p and v are both compared as int64_t
+    do {  // compare-and-swap loop: retry until *p still holds the value that was read
+        seen = *((volatile int64_t *)p);
+        wanted = ((int64_t)v >= seen) ? (int64_t)v : seen;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, wanted, seen) != seen);
+#else
+    } while (!__sync_bool_compare_and_swap(p, seen, wanted));
+#endif
+    return seen;  // value observed before the update
+}
+
+static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) {
+    uint64_t seen, wanted;  // unsigned minimum
+    do {  // compare-and-swap loop: retry until *p still holds the value that was read
+        seen = *((volatile uint64_t *)p);
+        wanted = (v <= seen) ? v : seen;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, wanted, seen) != seen);
+#else
+    } while (!__sync_bool_compare_and_swap(p, seen, wanted));
+#endif
+    return seen;  // value observed before the update
+}
+
+static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) {
+    uint64_t seen, wanted;  // unsigned maximum
+    do {  // compare-and-swap loop: retry until *p still holds the value that was read
+        seen = *((volatile uint64_t *)p);
+        wanted = (v >= seen) ? v : seen;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, wanted, seen) != seen);
+#else
+    } while (!__sync_bool_compare_and_swap(p, seen, wanted));
+#endif
+    return seen;  // value observed before the update
+}
+
+static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) {  // 64-bit: atomically stores v in *p; returns the previous contents
+#ifdef _MSC_VER
+    return InterlockedExchange64((LONGLONG volatile *)p, v);
+#else
+    return __sync_lock_test_and_set(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval,
+                                             uint64_t newval) {  // stores newval only if *p == cmpval; returns the prior *p either way
+#ifdef _MSC_VER
+    return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval);  // note the Win32 argument order: exchange value, then comparand
+#else
+    return __sync_val_compare_and_swap(p, cmpval, newval);
+#endif
+}
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
new file mode 100644
index 00000000..d088edfe
--- /dev/null
+++ b/examples/intrinsics/generic-64.h
@@ -0,0 +1,1817 @@
+/*
+  Copyright (c) 2010-2012, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <stdint.h>
+#include <math.h>
+
+#ifdef _MSC_VER  /* compiler-specific inlining and alignment helpers */
+#define FORCEINLINE __forceinline
+#define PRE_ALIGN(x)  /*__declspec(align(x))*/
+#define POST_ALIGN(x)  
+#define roundf(x) (floorf((x) + .5f))  /* argument parenthesized so compound expressions expand correctly */
+#define round(x) (floor((x) + .5))
+#else
+#define FORCEINLINE __attribute__((always_inline))
+#define PRE_ALIGN(x)
+#define POST_ALIGN(x)  __attribute__ ((aligned(x)))
+#endif
+
+typedef float __vec1_f;  // single-element "vector" aliases: each __vec1_* name is just the plain scalar type
+typedef double __vec1_d;
+typedef int8_t __vec1_i8;
+typedef int16_t __vec1_i16;
+typedef int32_t __vec1_i32;
+typedef int64_t __vec1_i64;
+
+struct __vec64_i1 {  // 64 one-bit lanes packed into a single uint64_t
+    __vec64_i1() { }  // leaves v uninitialized
+    __vec64_i1(uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3,  // bit i of v is set from the low bit of argument vi
+               uint64_t v4, uint64_t v5, uint64_t v6, uint64_t v7,
+               uint64_t v8, uint64_t v9, uint64_t v10, uint64_t v11,
+               uint64_t v12, uint64_t v13, uint64_t v14, uint64_t v15,
+               uint64_t v16, uint64_t v17, uint64_t v18, uint64_t v19,
+               uint64_t v20, uint64_t v21, uint64_t v22, uint64_t v23,
+               uint64_t v24, uint64_t v25, uint64_t v26, uint64_t v27,
+               uint64_t v28, uint64_t v29, uint64_t v30, uint64_t v31,
+               uint64_t v32, uint64_t v33, uint64_t v34, uint64_t v35,
+               uint64_t v36, uint64_t v37, uint64_t v38, uint64_t v39,
+               uint64_t v40, uint64_t v41, uint64_t v42, uint64_t v43,
+               uint64_t v44, uint64_t v45, uint64_t v46, uint64_t v47,
+               uint64_t v48, uint64_t v49, uint64_t v50, uint64_t v51,
+               uint64_t v52, uint64_t v53, uint64_t v54, uint64_t v55,
+               uint64_t v56, uint64_t v57, uint64_t v58, uint64_t v59,
+               uint64_t v60, uint64_t v61, uint64_t v62, uint64_t v63) {
+        v = ((v0 & 1) |  // only the least-significant bit of each argument is kept
+             ((v1 & 1) << 1) |
+             ((v2 & 1) << 2) |
+             ((v3 & 1) << 3) |
+             ((v4 & 1) << 4) |
+             ((v5 & 1) << 5) |
+             ((v6 & 1) << 6) |
+             ((v7 & 1) << 7) |
+             ((v8 & 1) << 8) |
+             ((v9 & 1) << 9) |
+             ((v10 & 1) << 10) |
+             ((v11 & 1) << 11) |
+             ((v12 & 1) << 12) |
+             ((v13 & 1) << 13) |
+             ((v14 & 1) << 14) |
+             ((v15 & 1) << 15) |
+             ((v16 & 1) << 16) |
+             ((v17 & 1) << 17) |
+             ((v18 & 1) << 18) |
+             ((v19 & 1) << 19) |
+             ((v20 & 1) << 20) |
+             ((v21 & 1) << 21) |
+             ((v22 & 1) << 22) |
+             ((v23 & 1) << 23) |
+             ((v24 & 1) << 24) |
+             ((v25 & 1) << 25) |
+             ((v26 & 1) << 26) |
+             ((v27 & 1) << 27) |
+             ((v28 & 1) << 28) |
+             ((v29 & 1) << 29) |
+             ((v30 & 1) << 30) |
+             ((v31 & 1) << 31) |
+             ((v32 & 1) << 32) |
+             ((v33 & 1) << 33) |
+             ((v34 & 1) << 34) |
+             ((v35 & 1) << 35) |
+             ((v36 & 1) << 36) |
+             ((v37 & 1) << 37) |
+             ((v38 & 1) << 38) |
+             ((v39 & 1) << 39) |
+             ((v40 & 1) << 40) |
+             ((v41 & 1) << 41) |
+             ((v42 & 1) << 42) |
+             ((v43 & 1) << 43) |
+             ((v44 & 1) << 44) |
+             ((v45 & 1) << 45) |
+             ((v46 & 1) << 46) |
+             ((v47 & 1) << 47) |
+             ((v48 & 1) << 48) |
+             ((v49 & 1) << 49) |
+             ((v50 & 1) << 50) |
+             ((v51 & 1) << 51) |
+             ((v52 & 1) << 52) |
+             ((v53 & 1) << 53) |
+             ((v54 & 1) << 54) |
+             ((v55 & 1) << 55) |
+             ((v56 & 1) << 56) |
+             ((v57 & 1) << 57) |
+             ((v58 & 1) << 58) |
+             ((v59 & 1) << 59) |
+             ((v60 & 1) << 60) |
+             ((v61 & 1) << 61) |
+             ((v62 & 1) << 62) |
+             ((v63 & 1) << 63));
+    }
+             
+    uint64_t v;  // packed lane bits; bit i corresponds to lane i
+};
+
+
+template <typename T>
+struct vec64 {  // fixed array of 64 elements of type T, shared base of the __vec64_* types
+    vec64() { }  // leaves v uninitialized
+    vec64(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,  // element-wise constructor: v[i] = vi
+          T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+          T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+          T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
+          T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
+          T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
+          T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
+          T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) {
+        v[0] = v0;        v[1] = v1;        v[2] = v2;        v[3] = v3;
+        v[4] = v4;        v[5] = v5;        v[6] = v6;        v[7] = v7;
+        v[8] = v8;        v[9] = v9;        v[10] = v10;      v[11] = v11;
+        v[12] = v12;      v[13] = v13;      v[14] = v14;      v[15] = v15;
+        v[16] = v16;      v[17] = v17;      v[18] = v18;      v[19] = v19;
+        v[20] = v20;      v[21] = v21;      v[22] = v22;      v[23] = v23;
+        v[24] = v24;      v[25] = v25;      v[26] = v26;      v[27] = v27;
+        v[28] = v28;      v[29] = v29;      v[30] = v30;      v[31] = v31;
+        v[32] = v32;      v[33] = v33;      v[34] = v34;      v[35] = v35;
+        v[36] = v36;      v[37] = v37;      v[38] = v38;      v[39] = v39;
+        v[40] = v40;      v[41] = v41;      v[42] = v42;      v[43] = v43;
+        v[44] = v44;      v[45] = v45;      v[46] = v46;      v[47] = v47;
+        v[48] = v48;      v[49] = v49;      v[50] = v50;      v[51] = v51;
+        v[52] = v52;      v[53] = v53;      v[54] = v54;      v[55] = v55;
+        v[56] = v56;      v[57] = v57;      v[58] = v58;      v[59] = v59;
+        v[60] = v60;      v[61] = v61;      v[62] = v62;      v[63] = v63;
+    }
+    T v[64];  // lane storage, indexed 0..63
+};
+
+PRE_ALIGN(64) struct __vec64_f : public vec64<float> {  // 64 float lanes; POST_ALIGN requests 64-byte alignment on non-MSVC builds
+    __vec64_f() { }
+    __vec64_f(float v0, float v1, float v2, float v3, 
+              float v4, float v5, float v6, float v7,
+              float v8, float v9, float v10, float v11, 
+              float v12, float v13, float v14, float v15,
+              float v16, float v17, float v18, float v19,
+              float v20, float v21, float v22, float v23,
+              float v24, float v25, float v26, float v27,
+              float v28, float v29, float v30, float v31,
+              float v32, float v33, float v34, float v35,
+              float v36, float v37, float v38, float v39,
+              float v40, float v41, float v42, float v43,
+              float v44, float v45, float v46, float v47,
+              float v48, float v49, float v50, float v51,
+              float v52, float v53, float v54, float v55,
+              float v56, float v57, float v58, float v59,
+              float v60, float v61, float v62, float v63)
+        : vec64<float>(v0, v1, v2, v3, v4, v5, v6, v7,  // forwards all 64 values to the base element-wise constructor
+                       v8, v9, v10, v11, v12, v13, v14, v15,
+                       v16, v17, v18, v19, v20, v21, v22, v23,
+                       v24, v25, v26, v27, v28, v29, v30, v31,
+                       v32, v33, v34, v35, v36, v37, v38, v39, 
+                       v40, v41, v42, v43, v44, v45, v46, v47, 
+                       v48, v49, v50, v51, v52, v53, v54, v55, 
+                       v56, v57, v58, v59, v60, v61, v62, v63) { }
+
+} POST_ALIGN(64);
+
+PRE_ALIGN(128) struct __vec64_d : public vec64<double> {  // 64 double lanes; POST_ALIGN requests 128-byte alignment on non-MSVC builds
+    __vec64_d() { }
+    __vec64_d(double v0, double v1, double v2, double v3, 
+              double v4, double v5, double v6, double v7,
+              double v8, double v9, double v10, double v11, 
+              double v12, double v13, double v14, double v15,
+              double v16, double v17, double v18, double v19,
+              double v20, double v21, double v22, double v23,
+              double v24, double v25, double v26, double v27,
+              double v28, double v29, double v30, double v31,
+              double v32, double v33, double v34, double v35,
+              double v36, double v37, double v38, double v39,
+              double v40, double v41, double v42, double v43,
+              double v44, double v45, double v46, double v47,
+              double v48, double v49, double v50, double v51,
+              double v52, double v53, double v54, double v55,
+              double v56, double v57, double v58, double v59,
+              double v60, double v61, double v62, double v63)
+        : vec64<double>(v0, v1, v2, v3, v4, v5, v6, v7,  // forwards all 64 values to the base element-wise constructor
+                       v8, v9, v10, v11, v12, v13, v14, v15,
+                       v16, v17, v18, v19, v20, v21, v22, v23,
+                       v24, v25, v26, v27, v28, v29, v30, v31,
+                       v32, v33, v34, v35, v36, v37, v38, v39, 
+                       v40, v41, v42, v43, v44, v45, v46, v47, 
+                       v48, v49, v50, v51, v52, v53, v54, v55, 
+                       v56, v57, v58, v59, v60, v61, v62, v63) { }
+
+} POST_ALIGN(128);
+
+PRE_ALIGN(16) struct __vec64_i8   : public vec64<int8_t> {  // 64 int8_t lanes; POST_ALIGN requests 16-byte alignment on non-MSVC builds
+    __vec64_i8() { }
+    __vec64_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, 
+               int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+               int8_t v8, int8_t v9, int8_t v10, int8_t v11, 
+               int8_t v12, int8_t v13, int8_t v14, int8_t v15,
+               int8_t v16, int8_t v17, int8_t v18, int8_t v19,
+               int8_t v20, int8_t v21, int8_t v22, int8_t v23,
+               int8_t v24, int8_t v25, int8_t v26, int8_t v27,
+               int8_t v28, int8_t v29, int8_t v30, int8_t v31,
+               int8_t v32, int8_t v33, int8_t v34, int8_t v35,
+               int8_t v36, int8_t v37, int8_t v38, int8_t v39,
+               int8_t v40, int8_t v41, int8_t v42, int8_t v43,
+               int8_t v44, int8_t v45, int8_t v46, int8_t v47,
+               int8_t v48, int8_t v49, int8_t v50, int8_t v51,
+               int8_t v52, int8_t v53, int8_t v54, int8_t v55,
+               int8_t v56, int8_t v57, int8_t v58, int8_t v59,
+               int8_t v60, int8_t v61, int8_t v62, int8_t v63)
+        : vec64<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7,  // forwards all 64 values to the base element-wise constructor
+                        v8, v9, v10, v11, v12, v13, v14, v15,
+                        v16, v17, v18, v19, v20, v21, v22, v23,
+                        v24, v25, v26, v27, v28, v29, v30, v31,
+                        v32, v33, v34, v35, v36, v37, v38, v39, 
+                        v40, v41, v42, v43, v44, v45, v46, v47, 
+                        v48, v49, v50, v51, v52, v53, v54, v55, 
+                        v56, v57, v58, v59, v60, v61, v62, v63) { }
+
+} POST_ALIGN(16);
+
+PRE_ALIGN(32) struct __vec64_i16  : public vec64<int16_t> {  // 64 int16_t lanes; POST_ALIGN requests 32-byte alignment on non-MSVC builds
+    __vec64_i16() { }
+    __vec64_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, 
+                int16_t v4, int16_t v5, int16_t v6, int16_t v7,
+                int16_t v8, int16_t v9, int16_t v10, int16_t v11, 
+                int16_t v12, int16_t v13, int16_t v14, int16_t v15,
+                int16_t v16, int16_t v17, int16_t v18, int16_t v19,
+                int16_t v20, int16_t v21, int16_t v22, int16_t v23,
+                int16_t v24, int16_t v25, int16_t v26, int16_t v27,
+                int16_t v28, int16_t v29, int16_t v30, int16_t v31,
+                int16_t v32, int16_t v33, int16_t v34, int16_t v35,
+                int16_t v36, int16_t v37, int16_t v38, int16_t v39,
+                int16_t v40, int16_t v41, int16_t v42, int16_t v43,
+                int16_t v44, int16_t v45, int16_t v46, int16_t v47,
+                int16_t v48, int16_t v49, int16_t v50, int16_t v51,
+                int16_t v52, int16_t v53, int16_t v54, int16_t v55,
+                int16_t v56, int16_t v57, int16_t v58, int16_t v59,
+                int16_t v60, int16_t v61, int16_t v62, int16_t v63)
+        : vec64<int16_t>(v0, v1, v2, v3, v4, v5, v6, v7,  // forwards all 64 values to the base element-wise constructor
+                         v8, v9, v10, v11, v12, v13, v14, v15,
+                         v16, v17, v18, v19, v20, v21, v22, v23,
+                         v24, v25, v26, v27, v28, v29, v30, v31,
+                         v32, v33, v34, v35, v36, v37, v38, v39, 
+                         v40, v41, v42, v43, v44, v45, v46, v47, 
+                         v48, v49, v50, v51, v52, v53, v54, v55, 
+                         v56, v57, v58, v59, v60, v61, v62, v63) { }
+
+} POST_ALIGN(32);
+
+PRE_ALIGN(64) struct __vec64_i32  : public vec64<int32_t> {  // 64 int32_t lanes; POST_ALIGN requests 64-byte alignment on non-MSVC builds
+    __vec64_i32() { }
+    __vec64_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, 
+                int32_t v4, int32_t v5, int32_t v6, int32_t v7,
+                int32_t v8, int32_t v9, int32_t v10, int32_t v11, 
+                int32_t v12, int32_t v13, int32_t v14, int32_t v15,
+                int32_t v16, int32_t v17, int32_t v18, int32_t v19,
+                int32_t v20, int32_t v21, int32_t v22, int32_t v23,
+                int32_t v24, int32_t v25, int32_t v26, int32_t v27,
+                int32_t v28, int32_t v29, int32_t v30, int32_t v31,
+                int32_t v32, int32_t v33, int32_t v34, int32_t v35,
+                int32_t v36, int32_t v37, int32_t v38, int32_t v39,
+                int32_t v40, int32_t v41, int32_t v42, int32_t v43,
+                int32_t v44, int32_t v45, int32_t v46, int32_t v47,
+                int32_t v48, int32_t v49, int32_t v50, int32_t v51,
+                int32_t v52, int32_t v53, int32_t v54, int32_t v55,
+                int32_t v56, int32_t v57, int32_t v58, int32_t v59,
+                int32_t v60, int32_t v61, int32_t v62, int32_t v63)
+        : vec64<int32_t>(v0, v1, v2, v3, v4, v5, v6, v7,  // forwards all 64 values to the base element-wise constructor
+                         v8, v9, v10, v11, v12, v13, v14, v15,
+                         v16, v17, v18, v19, v20, v21, v22, v23,
+                         v24, v25, v26, v27, v28, v29, v30, v31,
+                         v32, v33, v34, v35, v36, v37, v38, v39, 
+                         v40, v41, v42, v43, v44, v45, v46, v47, 
+                         v48, v49, v50, v51, v52, v53, v54, v55, 
+                         v56, v57, v58, v59, v60, v61, v62, v63) { }
+
+} POST_ALIGN(64);
+
+static inline int32_t __extract_element(__vec64_i32, int);  // forward declaration only; no definition is given at this point in the file
+
+PRE_ALIGN(128) struct __vec64_i64  : public vec64<int64_t> {  // 64 int64_t lanes; POST_ALIGN requests 128-byte alignment on non-MSVC builds
+    __vec64_i64() { }
+    __vec64_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, 
+                int64_t v4, int64_t v5, int64_t v6, int64_t v7,
+                int64_t v8, int64_t v9, int64_t v10, int64_t v11, 
+                int64_t v12, int64_t v13, int64_t v14, int64_t v15,
+                int64_t v16, int64_t v17, int64_t v18, int64_t v19,
+                int64_t v20, int64_t v21, int64_t v22, int64_t v23,
+                int64_t v24, int64_t v25, int64_t v26, int64_t v27,
+                int64_t v28, int64_t v29, int64_t v30, int64_t v31,
+                int64_t v32, int64_t v33, int64_t v34, int64_t v35,
+                int64_t v36, int64_t v37, int64_t v38, int64_t v39,
+                int64_t v40, int64_t v41, int64_t v42, int64_t v43,
+                int64_t v44, int64_t v45, int64_t v46, int64_t v47,
+                int64_t v48, int64_t v49, int64_t v50, int64_t v51,
+                int64_t v52, int64_t v53, int64_t v54, int64_t v55,
+                int64_t v56, int64_t v57, int64_t v58, int64_t v59,
+                int64_t v60, int64_t v61, int64_t v62, int64_t v63)
+        : vec64<int64_t>(v0, v1, v2, v3, v4, v5, v6, v7,  // forwards all 64 values to the base element-wise constructor
+                         v8, v9, v10, v11, v12, v13, v14, v15,
+                         v16, v17, v18, v19, v20, v21, v22, v23,
+                         v24, v25, v26, v27, v28, v29, v30, v31,
+                         v32, v33, v34, v35, v36, v37, v38, v39, 
+                         v40, v41, v42, v43, v44, v45, v46, v47, 
+                         v48, v49, v50, v51, v52, v53, v54, v55, 
+                         v56, v57, v58, v59, v60, v61, v62, v63) { }
+
+} POST_ALIGN(128);
+
+///////////////////////////////////////////////////////////////////////////
+// macros...
+
+#define UNARY_OP(TYPE, NAME, OP) /* defines NAME(v): OP applied to each of the 64 lanes */ \
+static FORCEINLINE TYPE NAME(TYPE v) {              \
+    TYPE result;                                    \
+    for (int lane = 0; lane < 64; ++lane)           \
+        result.v[lane] = OP(v.v[lane]);             \
+    return result;                                  \
+}
+
+#define BINARY_OP(TYPE, NAME, OP) /* defines NAME(a, b): lane-wise a OP b over 64 lanes */ \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {                  \
+    TYPE result;                                                \
+    for (int lane = 0; lane < 64; ++lane)                       \
+        result.v[lane] = a.v[lane] OP b.v[lane];                \
+    return result;                                              \
+}
+
+#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) /* like BINARY_OP, but both operands are first cast to CAST */ \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {                      \
+    TYPE result;                                                    \
+    for (int lane = 0; lane < 64; ++lane)                           \
+        result.v[lane] = (CAST)(a.v[lane]) OP (CAST)(b.v[lane]);    \
+    return result;                                                  \
+}
+
+#define BINARY_OP_FUNC(TYPE, NAME, FUNC) /* defines NAME(a, b): FUNC(a[i], b[i]) for each of the 64 lanes */ \
+static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {                      \
+    TYPE result;                                                    \
+    for (int lane = 0; lane < 64; ++lane)                           \
+        result.v[lane] = FUNC(a.v[lane], b.v[lane]);                \
+    return result;                                                  \
+}
+
+#define CMP_OP(TYPE, CAST, NAME, OP) /* defines NAME(a, b): mask bit i is ((CAST)a[i] OP (CAST)b[i]) */ \
+static FORCEINLINE __vec64_i1 NAME(TYPE a, TYPE b) {                \
+    uint64_t bits = 0;                                              \
+    for (int lane = 0; lane < 64; ++lane)                           \
+        bits |= uint64_t((CAST)(a.v[lane]) OP (CAST)(b.v[lane])) << lane; \
+    __vec64_i1 result;                                              \
+    result.v = bits;                                                \
+    return result;                                                  \
+}
+
+#define INSERT_EXTRACT(VTYPE, STYPE) /* element accessors that view a VTYPE as an array of STYPE */ \
+static FORCEINLINE STYPE __extract_element(VTYPE v, int index) {      \
+    return *((STYPE *)&v + index);                                    \
+}                                                                     \
+static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
+    *((STYPE *)v + index) = val;                                      \
+}
+
+#define LOAD_STORE(VTYPE, STYPE) /* element-by-element __load/__store; the align argument is not used */ \
+static FORCEINLINE VTYPE __load(VTYPE *p, int align) { \
+    const STYPE *src = (STYPE *)p;                     \
+    VTYPE loaded;                                      \
+    for (int lane = 0; lane < 64; ++lane)              \
+        loaded.v[lane] = src[lane];                    \
+    return loaded;                                     \
+}                                                      \
+static FORCEINLINE void __store(VTYPE *p, VTYPE v, int align) {    \
+    STYPE *dst = (STYPE *)p;                           \
+    for (int lane = 0; lane < 64; ++lane)              \
+        dst[lane] = v.v[lane];                         \
+}
+
+#define REDUCE_ADD(TYPE, VTYPE, NAME) /* defines NAME(v): sum of all 64 lanes, accumulated in TYPE */ \
+static FORCEINLINE TYPE NAME(VTYPE v) {         \
+    TYPE total = v.v[0];                        \
+    for (int lane = 1; lane < 64; ++lane)       \
+        total = total + v.v[lane];              \
+    return total;                               \
+}
+
+#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) /* defines NAME(v): the lane value selected by OP across all lanes */ \
+static FORCEINLINE TYPE NAME(VTYPE v) {                         \
+    TYPE ret = v.v[0];                                          \
+    for (int i = 1; i < 64; ++i)   /* keep ret while (ret OP lane) holds, otherwise take the lane */ \
+        ret = (ret OP (TYPE)v.v[i]) ? ret : (TYPE)v.v[i];       \
+    return ret;                                                 \
+}
+
+#define SELECT(TYPE) /* two __select overloads for TYPE: per-lane mask and single bool */ \
+static FORCEINLINE TYPE __select(__vec64_i1 mask, TYPE a, TYPE b) { \
+    TYPE ret;                                                       \
+    for (int i = 0; i < 64; ++i)   /* mask bit i set: lane from a; clear: lane from b */ \
+        ret.v[i] = (mask.v & (1ull<<i)) ? a.v[i] : b.v[i];          \
+    return ret;                                                     \
+}                                                                   \
+static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) {       \
+    return cond ? a : b;           /* picks one whole vector */     \
+}
+
+#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) /* defines NAME(a, b): the same count b is applied to every lane */ \
+static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {                   \
+   TYPE ret;                                                        \
+   for (int i = 0; i < 64; ++i)     /* each lane is cast to CAST before it is shifted */ \
+       ret.v[i] = (CAST)(a.v[i]) OP b;                              \
+   return ret;                                                      \
+}
+
+#define SMEAR(VTYPE, NAME, STYPE) /* defines __smear_NAME: every lane set to the scalar v */ \
+static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) {  \
+    VTYPE ret;                     /* retType is not read by this body */ \
+    for (int i = 0; i < 64; ++i)                                   \
+        ret.v[i] = v;                                              \
+    return ret;                                                    \
+}
+
+#define BROADCAST(VTYPE, NAME, STYPE) /* defines __broadcast_NAME: one lane of v copied to all lanes; STYPE is unused */ \
+static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) {   \
+    VTYPE ret;                                        \
+    for (int i = 0; i < 64; ++i)   /* index is masked into 0..63 */ \
+        ret.v[i] = v.v[index & 63];                 \
+    return ret;                                       \
+}                                  /* the trailing backslash continues the macro onto the following blank line */ \
+
+#define ROTATE(VTYPE, NAME, STYPE) /* defines __rotate_NAME: lanes of v rotated by index; STYPE is unused */ \
+static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) {   \
+    VTYPE ret;                                        \
+    for (int i = 0; i < 64; ++i)   /* result lane i comes from source lane (i+index) & 63 */ \
+        ret.v[i] = v.v[(i+index) & 63];               \
+    return ret;                                       \
+}                                  /* the trailing backslash continues the macro onto the following blank line */ \
+
+#define SHUFFLES(VTYPE, NAME, STYPE) /* defines __shuffle_NAME (one source) and __shuffle2_NAME (two sources); STYPE is unused */ \
+static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec64_i32 index) {   \
+    VTYPE ret;                                        \
+    for (int i = 0; i < 64; ++i)   /* each index lane is masked into 0..63 */ \
+        ret.v[i] = v.v[__extract_element(index, i) & 63];       \
+    return ret;                                       \
+}                                                     \
+static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec64_i32 index) {     \
+    VTYPE ret;                                        \
+    for (int i = 0; i < 64; ++i) {                    \
+        int ii = __extract_element(index, i) & 127;   /* 0..63 reads v0, 64..127 reads v1 */ \
+        ret.v[i] = (ii < 64) ? v0.v[ii] : v1.v[ii-64];  \
+    }                                                 \
+    return ret;                                       \
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+INSERT_EXTRACT(__vec1_i8, int8_t)    // __extract_element/__insert_element overloads for the __vec1_* types
+INSERT_EXTRACT(__vec1_i16, int16_t)
+INSERT_EXTRACT(__vec1_i32, int32_t)
+INSERT_EXTRACT(__vec1_i64, int64_t)
+INSERT_EXTRACT(__vec1_f, float)
+INSERT_EXTRACT(__vec1_d, double)
+
+///////////////////////////////////////////////////////////////////////////
+// mask ops
+
+static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) {
+    return static_cast<uint64_t>(mask.v);  // the mask bit word, returned as a uint64_t
+}
+
+static FORCEINLINE __vec64_i1 __equal(__vec64_i1 a, __vec64_i1 b) {
+    __vec64_i1 same;
+    same.v = ~(a.v ^ b.v);  // a bit is set where a and b agree (both set or both clear)
+    return same;
+}
+
+static FORCEINLINE __vec64_i1 __and(__vec64_i1 a, __vec64_i1 b) {
+    __vec64_i1 both;
+    both.v = b.v & a.v;  // bit set only where it is set in a and in b
+    return both;
+}
+
+static FORCEINLINE __vec64_i1 __xor(__vec64_i1 a, __vec64_i1 b) {
+    __vec64_i1 differ;
+    differ.v = b.v ^ a.v;  // bit set where exactly one of a, b has it set
+    return differ;
+}
+
+static FORCEINLINE __vec64_i1 __or(__vec64_i1 a, __vec64_i1 b) {
+    __vec64_i1 either;
+    either.v = b.v | a.v;  // bit set where a or b (or both) has it set
+    return either;
+}
+
+static FORCEINLINE __vec64_i1 __not(__vec64_i1 v) {
+    __vec64_i1 flipped;
+    flipped.v = ~v.v;  // every bit of the mask word inverted
+    return flipped;
+}
+
+static FORCEINLINE __vec64_i1 __and_not1(__vec64_i1 a, __vec64_i1 b) {
+    __vec64_i1 out;
+    out.v = b.v & ~a.v;  // the FIRST operand is the one that gets complemented
+    return out;
+}
+
+static FORCEINLINE __vec64_i1 __and_not2(__vec64_i1 a, __vec64_i1 b) {
+    __vec64_i1 out;
+    out.v = ~b.v & a.v;  // the SECOND operand is the one that gets complemented
+    return out;
+}
+
+static FORCEINLINE __vec64_i1 __select(__vec64_i1 mask, __vec64_i1 a,
+                                       __vec64_i1 b) {
+    __vec64_i1 picked;
+    picked.v = (~mask.v & b.v) | (mask.v & a.v);  // mask bit set: bit from a; clear: bit from b
+    return picked;
+}
+
+static FORCEINLINE __vec64_i1 __select(bool cond, __vec64_i1 a, __vec64_i1 b) {
+    if (cond) return a; else return b;  // one bool chooses a whole mask
+}
+
+static FORCEINLINE bool __extract_element(__vec64_i1 vec, int index) {
+    return (vec.v & (1ull << index)) != 0;  // true when the bit for lane `index` is set
+}
+
+static FORCEINLINE void __insert_element(__vec64_i1 *vec, int index, bool val) {
+    // Set the bit for lane `index` when val is true, clear it otherwise.
+    if (val)
+        vec->v |= (1ull << index);
+    else
+        vec->v &= ~(1ull << index);
+}
+
+static FORCEINLINE __vec64_i1 __load(__vec64_i1 *p, int align) {
+    // Read the full mask word (a uint16_t view would drop lanes 16..63); align is unused.
+    __vec64_i1 r;
+    r.v = p->v;
+    return r;
+}
+
+static FORCEINLINE void __store(__vec64_i1 *p, __vec64_i1 v, int align) {
+    // Write the full mask word (a uint16_t view would store only lanes 0..15); align is unused.
+    p->v = v.v;
+}
+
+static FORCEINLINE __vec64_i1 __smear_i1(__vec64_i1, int v) {  // the first parameter is unnamed and never read
+    return __vec64_i1(v, v, v, v, v, v, v, v, // v is passed 64 times to a constructor defined elsewhere; presumably one argument per lane - TODO confirm
+                      v, v, v, v, v, v, v, v,
+                      v, v, v, v, v, v, v, v,
+                      v, v, v, v, v, v, v, v,
+                      v, v, v, v, v, v, v, v,
+                      v, v, v, v, v, v, v, v,
+                      v, v, v, v, v, v, v, v,
+                      v, v, v, v, v, v, v, v);
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// int8: __vec64_i8 operations, produced by instantiating the helper macros
+
+BINARY_OP(__vec64_i8, __add, +)
+BINARY_OP(__vec64_i8, __sub, -)
+BINARY_OP(__vec64_i8, __mul, *)
+
+BINARY_OP(__vec64_i8, __or, |)
+BINARY_OP(__vec64_i8, __and, &)
+BINARY_OP(__vec64_i8, __xor, ^)
+BINARY_OP(__vec64_i8, __shl, <<)
+
+BINARY_OP_CAST(__vec64_i8, uint8_t, __udiv, /)  // u*/s* pairs differ only in the cast type passed
+BINARY_OP_CAST(__vec64_i8, int8_t,  __sdiv, /)
+
+BINARY_OP_CAST(__vec64_i8, uint8_t, __urem, %)
+BINARY_OP_CAST(__vec64_i8, int8_t,  __srem, %)
+BINARY_OP_CAST(__vec64_i8, uint8_t, __lshr, >>)
+BINARY_OP_CAST(__vec64_i8, int8_t,  __ashr, >>)
+
+SHIFT_UNIFORM(__vec64_i8, uint8_t, __lshr, >>)  // overloads that shift every lane by one int32_t count
+SHIFT_UNIFORM(__vec64_i8, int8_t, __ashr, >>)
+SHIFT_UNIFORM(__vec64_i8, int8_t, __shl, <<)
+
+CMP_OP(__vec64_i8, int8_t,  __equal, ==)  // comparisons return a __vec64_i1 mask
+CMP_OP(__vec64_i8, int8_t,  __not_equal, !=)
+CMP_OP(__vec64_i8, uint8_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i8, int8_t,  __signed_less_equal, <=)
+CMP_OP(__vec64_i8, uint8_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i8, int8_t,  __signed_greater_equal, >=)
+CMP_OP(__vec64_i8, uint8_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i8, int8_t,  __signed_less_than, <)
+CMP_OP(__vec64_i8, uint8_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i8, int8_t,  __signed_greater_than, >)
+
+SELECT(__vec64_i8)  // select, lane access, smear/broadcast/rotate/shuffle and load/store
+INSERT_EXTRACT(__vec64_i8, int8_t)
+SMEAR(__vec64_i8, i8, int8_t)
+BROADCAST(__vec64_i8, i8, int8_t)
+ROTATE(__vec64_i8, i8, int8_t)
+SHUFFLES(__vec64_i8, i8, int8_t)
+LOAD_STORE(__vec64_i8, int8_t)
+
+///////////////////////////////////////////////////////////////////////////
+// int16: __vec64_i16 operations, produced by instantiating the helper macros
+
+BINARY_OP(__vec64_i16, __add, +)
+BINARY_OP(__vec64_i16, __sub, -)
+BINARY_OP(__vec64_i16, __mul, *)
+
+BINARY_OP(__vec64_i16, __or, |)
+BINARY_OP(__vec64_i16, __and, &)
+BINARY_OP(__vec64_i16, __xor, ^)
+BINARY_OP(__vec64_i16, __shl, <<)
+
+BINARY_OP_CAST(__vec64_i16, uint16_t, __udiv, /)  // u*/s* pairs differ only in the cast type passed
+BINARY_OP_CAST(__vec64_i16, int16_t,  __sdiv, /)
+
+BINARY_OP_CAST(__vec64_i16, uint16_t, __urem, %)
+BINARY_OP_CAST(__vec64_i16, int16_t,  __srem, %)
+BINARY_OP_CAST(__vec64_i16, uint16_t, __lshr, >>)
+BINARY_OP_CAST(__vec64_i16, int16_t,  __ashr, >>)
+
+SHIFT_UNIFORM(__vec64_i16, uint16_t, __lshr, >>)  // overloads that shift every lane by one int32_t count
+SHIFT_UNIFORM(__vec64_i16, int16_t, __ashr, >>)
+SHIFT_UNIFORM(__vec64_i16, int16_t, __shl, <<)
+
+CMP_OP(__vec64_i16, int16_t,  __equal, ==)  // comparisons return a __vec64_i1 mask
+CMP_OP(__vec64_i16, int16_t,  __not_equal, !=)
+CMP_OP(__vec64_i16, uint16_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i16, int16_t,  __signed_less_equal, <=)
+CMP_OP(__vec64_i16, uint16_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i16, int16_t,  __signed_greater_equal, >=)
+CMP_OP(__vec64_i16, uint16_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i16, int16_t,  __signed_less_than, <)
+CMP_OP(__vec64_i16, uint16_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i16, int16_t,  __signed_greater_than, >)
+
+SELECT(__vec64_i16)  // select, lane access, smear/broadcast/rotate/shuffle and load/store
+INSERT_EXTRACT(__vec64_i16, int16_t)
+SMEAR(__vec64_i16, i16, int16_t)
+BROADCAST(__vec64_i16, i16, int16_t)
+ROTATE(__vec64_i16, i16, int16_t)
+SHUFFLES(__vec64_i16, i16, int16_t)
+LOAD_STORE(__vec64_i16, int16_t)
+
+///////////////////////////////////////////////////////////////////////////
+// int32: __vec64_i32 operations, produced by instantiating the helper macros
+
+BINARY_OP(__vec64_i32, __add, +)
+BINARY_OP(__vec64_i32, __sub, -)
+BINARY_OP(__vec64_i32, __mul, *)
+
+BINARY_OP(__vec64_i32, __or, |)
+BINARY_OP(__vec64_i32, __and, &)
+BINARY_OP(__vec64_i32, __xor, ^)
+BINARY_OP(__vec64_i32, __shl, <<)
+
+BINARY_OP_CAST(__vec64_i32, uint32_t, __udiv, /)  // u*/s* pairs differ only in the cast type passed
+BINARY_OP_CAST(__vec64_i32, int32_t,  __sdiv, /)
+
+BINARY_OP_CAST(__vec64_i32, uint32_t, __urem, %)
+BINARY_OP_CAST(__vec64_i32, int32_t,  __srem, %)
+BINARY_OP_CAST(__vec64_i32, uint32_t, __lshr, >>)
+BINARY_OP_CAST(__vec64_i32, int32_t,  __ashr, >>)
+
+SHIFT_UNIFORM(__vec64_i32, uint32_t, __lshr, >>)  // overloads that shift every lane by one int32_t count
+SHIFT_UNIFORM(__vec64_i32, int32_t, __ashr, >>)
+SHIFT_UNIFORM(__vec64_i32, int32_t, __shl, <<)
+
+CMP_OP(__vec64_i32, int32_t,  __equal, ==)  // comparisons return a __vec64_i1 mask
+CMP_OP(__vec64_i32, int32_t,  __not_equal, !=)
+CMP_OP(__vec64_i32, uint32_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i32, int32_t,  __signed_less_equal, <=)
+CMP_OP(__vec64_i32, uint32_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i32, int32_t,  __signed_greater_equal, >=)
+CMP_OP(__vec64_i32, uint32_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i32, int32_t,  __signed_less_than, <)
+CMP_OP(__vec64_i32, uint32_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i32, int32_t,  __signed_greater_than, >)
+
+SELECT(__vec64_i32)  // select, lane access, smear/broadcast/rotate/shuffle and load/store
+INSERT_EXTRACT(__vec64_i32, int32_t)
+SMEAR(__vec64_i32, i32, int32_t)
+BROADCAST(__vec64_i32, i32, int32_t)
+ROTATE(__vec64_i32, i32, int32_t)
+SHUFFLES(__vec64_i32, i32, int32_t)
+LOAD_STORE(__vec64_i32, int32_t)
+
+///////////////////////////////////////////////////////////////////////////
+// int64: __vec64_i64 operations, produced by instantiating the helper macros
+
+BINARY_OP(__vec64_i64, __add, +)
+BINARY_OP(__vec64_i64, __sub, -)
+BINARY_OP(__vec64_i64, __mul, *)
+
+BINARY_OP(__vec64_i64, __or, |)
+BINARY_OP(__vec64_i64, __and, &)
+BINARY_OP(__vec64_i64, __xor, ^)
+BINARY_OP(__vec64_i64, __shl, <<)
+
+BINARY_OP_CAST(__vec64_i64, uint64_t, __udiv, /)  // u*/s* pairs differ only in the cast type passed
+BINARY_OP_CAST(__vec64_i64, int64_t,  __sdiv, /)
+
+BINARY_OP_CAST(__vec64_i64, uint64_t, __urem, %)
+BINARY_OP_CAST(__vec64_i64, int64_t,  __srem, %)
+BINARY_OP_CAST(__vec64_i64, uint64_t, __lshr, >>)
+BINARY_OP_CAST(__vec64_i64, int64_t,  __ashr, >>)
+
+SHIFT_UNIFORM(__vec64_i64, uint64_t, __lshr, >>)  // overloads that shift every lane by one int32_t count
+SHIFT_UNIFORM(__vec64_i64, int64_t, __ashr, >>)
+SHIFT_UNIFORM(__vec64_i64, int64_t, __shl, <<)
+
+CMP_OP(__vec64_i64, int64_t,  __equal, ==)  // comparisons return a __vec64_i1 mask
+CMP_OP(__vec64_i64, int64_t,  __not_equal, !=)
+CMP_OP(__vec64_i64, uint64_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i64, int64_t,  __signed_less_equal, <=)
+CMP_OP(__vec64_i64, uint64_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i64, int64_t,  __signed_greater_equal, >=)
+CMP_OP(__vec64_i64, uint64_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i64, int64_t,  __signed_less_than, <)
+CMP_OP(__vec64_i64, uint64_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i64, int64_t,  __signed_greater_than, >)
+
+SELECT(__vec64_i64)  // select, lane access, smear/broadcast/rotate/shuffle and load/store
+INSERT_EXTRACT(__vec64_i64, int64_t)
+SMEAR(__vec64_i64, i64, int64_t)
+BROADCAST(__vec64_i64, i64, int64_t)
+ROTATE(__vec64_i64, i64, int64_t)
+SHUFFLES(__vec64_i64, i64, int64_t)
+LOAD_STORE(__vec64_i64, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// float: per-lane arithmetic and comparisons for __vec64_f
+
+BINARY_OP(__vec64_f, __add, +)
+BINARY_OP(__vec64_f, __sub, -)
+BINARY_OP(__vec64_f, __mul, *)
+BINARY_OP(__vec64_f, __div, /)
+
+CMP_OP(__vec64_f, float, __equal, ==)  // each comparison yields a __vec64_i1 with one bit per lane
+CMP_OP(__vec64_f, float, __not_equal, !=)
+CMP_OP(__vec64_f, float, __less_than, <)
+CMP_OP(__vec64_f, float, __less_equal, <=)
+CMP_OP(__vec64_f, float, __greater_than, >)
+CMP_OP(__vec64_f, float, __greater_equal, >=)
+
+static FORCEINLINE __vec64_i1 __ordered(__vec64_f a, __vec64_f b) {
+    __vec64_i1 mask;
+    mask.v = 0;
+    for (int lane = 0; lane < 64; ++lane)  // x == x fails only for NaN, so a set bit means neither lane is NaN
+        if ((a.v[lane] == a.v[lane]) && (b.v[lane] == b.v[lane])) mask.v |= (1ull << lane);
+    return mask;
+}
+
+#if 0  // NOTE(review): disabled fragment, never compiled; looks like a leftover reminder about FRem ("__frem") - confirm
+      case Instruction::FRem: intrinsic = "__frem"; break;
+#endif
+
+SELECT(__vec64_f)  // select, lane access, smear/broadcast/rotate/shuffle and load/store for __vec64_f
+INSERT_EXTRACT(__vec64_f, float)
+SMEAR(__vec64_f, float, float)
+BROADCAST(__vec64_f, float, float)
+ROTATE(__vec64_f, float, float)
+SHUFFLES(__vec64_f, float, float)
+LOAD_STORE(__vec64_f, float)
+
+static FORCEINLINE float __exp_uniform_float(float v) {
+    return expf(v);  // scalar version: forwards directly to expf
+}
+
+static FORCEINLINE __vec64_f __exp_varying_float(__vec64_f v) {
+    __vec64_f result;
+    for (int lane = 0; lane < 64; lane++)  // expf applied to each lane independently
+        result.v[lane] = expf(v.v[lane]);
+    return result;
+}
+
+static FORCEINLINE float __log_uniform_float(float v) {
+    return logf(v);  // scalar version: forwards directly to logf
+}
+
+static FORCEINLINE __vec64_f __log_varying_float(__vec64_f v) {
+    __vec64_f result;
+    for (int lane = 0; lane < 64; lane++)  // logf applied to each lane independently
+        result.v[lane] = logf(v.v[lane]);
+    return result;
+}
+
+static FORCEINLINE float __pow_uniform_float(float a, float b) {
+    return powf(a, b);  // scalar version: a is the base, b the exponent, as passed to powf
+}
+
+static FORCEINLINE __vec64_f __pow_varying_float(__vec64_f a, __vec64_f b) {
+    __vec64_f result;
+    for (int lane = 0; lane < 64; lane++)  // lane of a is the base, matching lane of b the exponent
+        result.v[lane] = powf(a.v[lane], b.v[lane]);
+    return result;
+}
+
+static FORCEINLINE int __intbits(float v) {
+    union {  // store through the float member, read the same storage back as an int
+        int asInt;
+        float asFloat;
+    } pun;
+    pun.asFloat = v;
+    return pun.asInt;
+}
+
+static FORCEINLINE float __floatbits(int v) {
+    union {  // store through the int member, read the same storage back as a float
+        int asInt;
+        float asFloat;
+    } pun;
+    pun.asInt = v;
+    return pun.asFloat;
+}
+
+static FORCEINLINE float __half_to_float_uniform(int16_t h) {  // builds a float bit pattern from the 16 bits of h
+    static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
+
+    int32_t o = ((int32_t)(h & 0x7fff)) << 13;     // exponent/mantissa bits (sign bit masked off)
+    uint32_t exp = shifted_exp & o;   // just the exponent
+    o += (127 - 15) << 23;        // exponent adjust (adds 127 - 15 to the exponent field)
+
+    // handle exponent special cases
+    if (exp == shifted_exp) // Inf/NaN? (all exponent bits set)
+        o += (128 - 16) << 23;    // extra exp adjust
+    else if (exp == 0) { // Zero/Denormal? (all exponent bits clear)
+        o += 1 << 23;             // extra exp adjust
+        o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize via a float subtraction
+    }
+
+    o |= ((int32_t)(h & 0x8000)) << 16;    // sign bit, moved from bit 15 to bit 31
+    return __floatbits(o);
+}
+
+
+static FORCEINLINE __vec64_f __half_to_float_varying(__vec64_i16 v) {
+    __vec64_f widened;
+    for (int lane = 0; lane < 64; lane++)  // the scalar conversion, applied lane by lane
+        widened.v[lane] = __half_to_float_uniform(v.v[lane]);
+    return widened;
+}
+
+
+static FORCEINLINE int16_t __float_to_half_uniform(float f) {  // packs f into a 16-bit pattern returned as int16_t
+    uint32_t sign_mask = 0x80000000u;
+    int32_t o;
+
+    int32_t fint = __intbits(f);
+    int32_t sign = fint & sign_mask;  // isolate the sign bit
+    fint ^= sign;                     // fint now holds the magnitude bits only
+
+    int32_t f32infty = 255 << 23;
+    o = (fint > f32infty) ? 0x7e00 : 0x7c00; // magnitude above the Inf pattern (NaN) gives 0x7e00, otherwise 0x7c00
+
+    // (De)normalized number or zero
+    // update fint unconditionally to save the blending; we don't need it
+    // anymore for the Inf/NaN case anyway.
+    const uint32_t round_mask = ~0xfffu; 
+    const int32_t magic = 15 << 23;
+    const int32_t f16infty = 31 << 23;
+
+    int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask;
+    fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
+
+    if (fint < f32infty)  // finite input: replace the Inf/NaN default chosen above
+        o = fint2 >> 13; // Take the bits!
+
+    return (o | (sign >> 16));  // sign moved from bit 31 to bit 15; the result is narrowed to int16_t
+}
+
+
+static FORCEINLINE __vec64_i16 __float_to_half_varying(__vec64_f v) {
+    __vec64_i16 packed;
+    for (int lane = 0; lane < 64; lane++)  // the scalar conversion, applied lane by lane
+        packed.v[lane] = __float_to_half_uniform(v.v[lane]);
+    return packed;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// double: per-lane arithmetic and comparisons for __vec64_d
+
+BINARY_OP(__vec64_d, __add, +)
+BINARY_OP(__vec64_d, __sub, -)
+BINARY_OP(__vec64_d, __mul, *)
+BINARY_OP(__vec64_d, __div, /)
+
+CMP_OP(__vec64_d, double, __equal, ==)  // each comparison yields a __vec64_i1 with one bit per lane
+CMP_OP(__vec64_d, double, __not_equal, !=)
+CMP_OP(__vec64_d, double, __less_than, <)
+CMP_OP(__vec64_d, double, __less_equal, <=)
+CMP_OP(__vec64_d, double, __greater_than, >)
+CMP_OP(__vec64_d, double, __greater_equal, >=)
+
+static FORCEINLINE __vec64_i1 __ordered(__vec64_d a, __vec64_d b) {
+    __vec64_i1 mask;
+    mask.v = 0;
+    for (int lane = 0; lane < 64; ++lane)  // x == x fails only for NaN, so a set bit means neither lane is NaN
+        if ((a.v[lane] == a.v[lane]) && (b.v[lane] == b.v[lane])) mask.v |= (1ull << lane);
+    return mask;
+}
+
+#if 0  // NOTE(review): disabled fragment, never compiled; looks like a leftover reminder about FRem ("__frem") - confirm
+      case Instruction::FRem: intrinsic = "__frem"; break;
+#endif
+
+SELECT(__vec64_d)  // select, lane access, smear/broadcast/rotate/shuffle and load/store for __vec64_d
+INSERT_EXTRACT(__vec64_d, double)
+SMEAR(__vec64_d, double, double)
+BROADCAST(__vec64_d, double, double)
+ROTATE(__vec64_d, double, double)
+SHUFFLES(__vec64_d, double, double)
+LOAD_STORE(__vec64_d, double)
+
+///////////////////////////////////////////////////////////////////////////
+// casts
+
+
+#define CAST(TO, STO, FROM, SFROM, FUNC) /* defines FUNC(TO, FROM val): per-lane conversion of val into a TO */ \
+static FORCEINLINE TO FUNC(TO, FROM val) {      \
+    TO ret;                        /* the first, unnamed TO parameter is never read */ \
+    for (int i = 0; i < 64; ++i)                \
+        ret.v[i] = (STO)((SFROM)(val.v[i]));    /* lane cast to SFROM first, then to STO */ \
+    return ret;                                 \
+}
+
+// sign extension conversions: both cast types are signed, so widening keeps each lane's sign
+CAST(__vec64_i64, int64_t, __vec64_i32, int32_t, __cast_sext)
+CAST(__vec64_i64, int64_t, __vec64_i16, int16_t, __cast_sext)
+CAST(__vec64_i64, int64_t, __vec64_i8,  int8_t,  __cast_sext)
+CAST(__vec64_i32, int32_t, __vec64_i16, int16_t, __cast_sext)
+CAST(__vec64_i32, int32_t, __vec64_i8,  int8_t,  __cast_sext)
+CAST(__vec64_i16, int16_t, __vec64_i8,  int8_t,  __cast_sext)
+
+#define CAST_SEXT_I1(TYPE) /* defines __cast_sext(TYPE, mask): set mask bit gives an all-ones lane, clear gives 0 */ \
+static FORCEINLINE TYPE __cast_sext(TYPE, __vec64_i1 v) {  \
+    TYPE ret;                                         \
+    for (int i = 0; i < 64; ++i) {                    \
+        ret.v[i] = 0;                                 \
+        if (v.v & (1ull << i))                        \
+            ret.v[i] = ~ret.v[i];   /* complement of 0: every bit of the lane set */ \
+    }                                                 \
+    return ret;                                       \
+}
+
+CAST_SEXT_I1(__vec64_i8)  // mask-to-integer sign extension for each integer vector width
+CAST_SEXT_I1(__vec64_i16)
+CAST_SEXT_I1(__vec64_i32)
+CAST_SEXT_I1(__vec64_i64)
+
+// zero extension: both cast types are unsigned, so widening fills the new high bits with zeros
+CAST(__vec64_i64, uint64_t, __vec64_i32, uint32_t, __cast_zext)
+CAST(__vec64_i64, uint64_t, __vec64_i16, uint16_t, __cast_zext)
+CAST(__vec64_i64, uint64_t, __vec64_i8,  uint8_t,  __cast_zext)
+CAST(__vec64_i32, uint32_t, __vec64_i16, uint16_t, __cast_zext)
+CAST(__vec64_i32, uint32_t, __vec64_i8,  uint8_t,  __cast_zext)
+CAST(__vec64_i16, uint16_t, __vec64_i8,  uint8_t,  __cast_zext)
+
+#define CAST_ZEXT_I1(TYPE) /* defines __cast_zext(TYPE, mask): set mask bit gives lane value 1, clear gives 0 */ \
+static FORCEINLINE TYPE __cast_zext(TYPE, __vec64_i1 v) {  \
+    TYPE ret;                                         \
+    for (int i = 0; i < 64; ++i)                      \
+        ret.v[i] = (v.v & (1ull << i)) ? 1 : 0;       \
+    return ret;                                       \
+}
+
+CAST_ZEXT_I1(__vec64_i8)  // mask-to-integer zero extension for each integer vector width
+CAST_ZEXT_I1(__vec64_i16)
+CAST_ZEXT_I1(__vec64_i32)
+CAST_ZEXT_I1(__vec64_i64)
+
+// truncations: each lane is converted to the narrower signed element type
+CAST(__vec64_i32, int32_t, __vec64_i64, int64_t, __cast_trunc)
+CAST(__vec64_i16, int16_t, __vec64_i64, int64_t, __cast_trunc)
+CAST(__vec64_i8,  int8_t,  __vec64_i64, int64_t, __cast_trunc)
+CAST(__vec64_i16, int16_t, __vec64_i32, int32_t, __cast_trunc)
+CAST(__vec64_i8,  int8_t,  __vec64_i32, int32_t, __cast_trunc)
+CAST(__vec64_i8,  int8_t,  __vec64_i16, int16_t, __cast_trunc)
+
+// signed int to float/double: the lane is read as the signed type before conversion
+CAST(__vec64_f, float, __vec64_i8,   int8_t,  __cast_sitofp)
+CAST(__vec64_f, float, __vec64_i16,  int16_t, __cast_sitofp)
+CAST(__vec64_f, float, __vec64_i32,  int32_t, __cast_sitofp)
+CAST(__vec64_f, float, __vec64_i64,  int64_t, __cast_sitofp)
+CAST(__vec64_d, double, __vec64_i8,  int8_t,  __cast_sitofp)
+CAST(__vec64_d, double, __vec64_i16, int16_t, __cast_sitofp)
+CAST(__vec64_d, double, __vec64_i32, int32_t, __cast_sitofp)
+CAST(__vec64_d, double, __vec64_i64, int64_t, __cast_sitofp)
+
+// unsigned int to float/double: the lane is first cast to the unsigned type of the same width
+CAST(__vec64_f, float, __vec64_i8,   uint8_t,  __cast_uitofp)
+CAST(__vec64_f, float, __vec64_i16,  uint16_t, __cast_uitofp)
+CAST(__vec64_f, float, __vec64_i32,  uint32_t, __cast_uitofp)
+CAST(__vec64_f, float, __vec64_i64,  uint64_t, __cast_uitofp)
+CAST(__vec64_d, double, __vec64_i8,  uint8_t,  __cast_uitofp)
+CAST(__vec64_d, double, __vec64_i16, uint16_t, __cast_uitofp)
+CAST(__vec64_d, double, __vec64_i32, uint32_t, __cast_uitofp)
+CAST(__vec64_d, double, __vec64_i64, uint64_t, __cast_uitofp)
+
+static FORCEINLINE __vec64_f __cast_uitofp(__vec64_f, __vec64_i1 v) {
+    __vec64_f asFloat;  // the unnamed first parameter is never read
+    for (int lane = 0; lane < 64; lane++)  // set mask bit gives 1, clear gives 0
+        if (v.v & (1ull << lane)) asFloat.v[lane] = 1.; else asFloat.v[lane] = 0.;
+    return asFloat;
+}
+
+// float/double to signed int: each lane is converted with a cast to the signed element type
+CAST(__vec64_i8,  int8_t,  __vec64_f, float, __cast_fptosi)
+CAST(__vec64_i16, int16_t, __vec64_f, float, __cast_fptosi)
+CAST(__vec64_i32, int32_t, __vec64_f, float, __cast_fptosi)
+CAST(__vec64_i64, int64_t, __vec64_f, float, __cast_fptosi)
+CAST(__vec64_i8,  int8_t,  __vec64_d, double, __cast_fptosi)
+CAST(__vec64_i16, int16_t, __vec64_d, double, __cast_fptosi)
+CAST(__vec64_i32, int32_t, __vec64_d, double, __cast_fptosi)
+CAST(__vec64_i64, int64_t, __vec64_d, double, __cast_fptosi)
+
+// float/double to unsigned int: each lane is converted with a cast to the unsigned element type
+CAST(__vec64_i8,  uint8_t,  __vec64_f, float, __cast_fptoui)
+CAST(__vec64_i16, uint16_t, __vec64_f, float, __cast_fptoui)
+CAST(__vec64_i32, uint32_t, __vec64_f, float, __cast_fptoui)
+CAST(__vec64_i64, uint64_t, __vec64_f, float, __cast_fptoui)
+CAST(__vec64_i8,  uint8_t,  __vec64_d, double, __cast_fptoui)
+CAST(__vec64_i16, uint16_t, __vec64_d, double, __cast_fptoui)
+CAST(__vec64_i32, uint32_t, __vec64_d, double, __cast_fptoui)
+CAST(__vec64_i64, uint64_t, __vec64_d, double, __cast_fptoui)
+
+// float/double conversions: narrowing (double to float) and widening (float to double)
+CAST(__vec64_f, float,  __vec64_d, double, __cast_fptrunc)
+CAST(__vec64_d, double, __vec64_f, float,  __cast_fpext)
+
+typedef union {  // overlapping storage for the element types that CAST_BITS converts between
+    int32_t i32;
+    float f;
+    int64_t i64;
+    double d;
+} BitcastUnion;
+
+#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) /* defines __cast_bits(TO, FROM): per-lane reinterpretation through BitcastUnion */ \
+static FORCEINLINE TO __cast_bits(TO, FROM val) {   \
+    TO r;                                           \
+    for (int i = 0; i < 64; ++i) {                  \
+        BitcastUnion u;                             \
+        u.FROM_ELT = val.v[i];      /* write through the source member ... */ \
+        r.v[i] = u.TO_ELT;          /* ... and read back through the destination member */ \
+    }                                               \
+    return r;                                       \
+}
+
+CAST_BITS(__vec64_f,   f,   __vec64_i32, i32)  // pairs use the float/i32 and double/i64 union members
+CAST_BITS(__vec64_i32, i32, __vec64_f,   f)
+CAST_BITS(__vec64_d,   d,   __vec64_i64, i64)
+CAST_BITS(__vec64_i64, i64, __vec64_d,   d)
+
+#define CAST_BITS_SCALAR(TO, FROM) /* defines scalar __cast_bits(TO, FROM v): reinterprets v through a local union */ \
+static FORCEINLINE TO __cast_bits(TO, FROM v) {     \
+    union {                                         \
+    TO to;                                          \
+    FROM from;                                      \
+    } u;                                            \
+    u.from = v;                     /* the unnamed TO parameter is never read */ \
+    return u.to;                                    \
+}
+
+CAST_BITS_SCALAR(uint32_t, float)  // scalar overloads between float and 32-bit ints, double and 64-bit ints
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+
+static FORCEINLINE void __fastmath() {  // empty body: calling this has no effect in this implementation
+}
+
+static FORCEINLINE float __round_uniform_float(float v) {
+    return roundf(v);  // scalar float: forwards to roundf
+}
+
+static FORCEINLINE float __floor_uniform_float(float v)  {
+    return floorf(v);  // scalar float: forwards to floorf
+}
+
+static FORCEINLINE float __ceil_uniform_float(float v) {
+    return ceilf(v);  // scalar float: forwards to ceilf
+}
+
+static FORCEINLINE double __round_uniform_double(double v) {
+    return round(v);  // scalar double: forwards to round
+}
+
+static FORCEINLINE double __floor_uniform_double(double v) {
+    return floor(v);  // scalar double: forwards to floor
+}
+
+static FORCEINLINE double __ceil_uniform_double(double v) {
+    return ceil(v);  // scalar double: forwards to ceil
+}
+
+UNARY_OP(__vec64_f, __round_varying_float, roundf)  // vector versions; UNARY_OP is defined earlier in the file, presumably applying the function per lane
+UNARY_OP(__vec64_f, __floor_varying_float, floorf)
+UNARY_OP(__vec64_f, __ceil_varying_float, ceilf)
+UNARY_OP(__vec64_d, __round_varying_double, round)
+UNARY_OP(__vec64_d, __floor_varying_double, floor)
+UNARY_OP(__vec64_d, __ceil_varying_double, ceil)
+
+// min/max: scalar helpers; each returns a when the comparison holds and b otherwise (so b whenever a float/double compare involves NaN)
+// NOTE(review): the uint32/uint64 variants compare as unsigned but are declared to return the signed type of the same width - confirm intended
+static FORCEINLINE float __min_uniform_float(float a, float b) { return (a<b) ? a : b; }
+static FORCEINLINE float __max_uniform_float(float a, float b) { return (a>b) ? a : b; }
+static FORCEINLINE double __min_uniform_double(double a, double b) { return (a<b) ? a : b; }
+static FORCEINLINE double __max_uniform_double(double a, double b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_int32(int32_t a, int32_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_uint32(uint32_t a, uint32_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_int64(int64_t a, int64_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_uint64(uint64_t a, uint64_t b) { return (a>b) ? a : b; }
+
+
+BINARY_OP_FUNC(__vec64_f, __max_varying_float, __max_uniform_float)  // vector min/max built on the scalar helpers; BINARY_OP_FUNC is defined earlier in the file
+BINARY_OP_FUNC(__vec64_f, __min_varying_float, __min_uniform_float)
+BINARY_OP_FUNC(__vec64_d, __max_varying_double, __max_uniform_double)
+BINARY_OP_FUNC(__vec64_d, __min_varying_double, __min_uniform_double)
+
+BINARY_OP_FUNC(__vec64_i32, __max_varying_int32, __max_uniform_int32)
+BINARY_OP_FUNC(__vec64_i32, __min_varying_int32, __min_uniform_int32)
+BINARY_OP_FUNC(__vec64_i32, __max_varying_uint32, __max_uniform_uint32)
+BINARY_OP_FUNC(__vec64_i32, __min_varying_uint32, __min_uniform_uint32)
+
+BINARY_OP_FUNC(__vec64_i64, __max_varying_int64, __max_uniform_int64)
+BINARY_OP_FUNC(__vec64_i64, __min_varying_int64, __min_uniform_int64)
+BINARY_OP_FUNC(__vec64_i64, __max_varying_uint64, __max_uniform_uint64)
+BINARY_OP_FUNC(__vec64_i64, __min_varying_uint64, __min_uniform_uint64)
+
+// sqrt/rsqrt/rcp: scalar helpers first, then the vector versions that reuse them
+
+static FORCEINLINE float __rsqrt_uniform_float(float v) {
+    return 1.f / sqrtf(v);  // computed with a full sqrtf and a divide
+}
+
+static FORCEINLINE float __rcp_uniform_float(float v) {
+    return 1.f / v;  // computed with a plain divide
+}
+
+static FORCEINLINE float __sqrt_uniform_float(float v) {
+    return sqrtf(v);  // forwards to sqrtf
+}
+
+static FORCEINLINE double __sqrt_uniform_double(double v) {
+    return sqrt(v);  // forwards to sqrt
+}
+
+UNARY_OP(__vec64_f, __rcp_varying_float, __rcp_uniform_float)  // UNARY_OP is defined earlier in the file
+UNARY_OP(__vec64_f, __rsqrt_varying_float, __rsqrt_uniform_float)
+UNARY_OP(__vec64_f, __sqrt_varying_float, __sqrt_uniform_float)
+UNARY_OP(__vec64_d, __sqrt_varying_double, __sqrt_uniform_double)
+
+///////////////////////////////////////////////////////////////////////////
+// bit ops
+
+static FORCEINLINE int32_t __popcnt_int32(uint32_t v) {
+    // Tally the lowest bit, then shift it out, until no set bits remain.
+    int ones = 0;
+    while (v != 0) { ones += (v & 1); v >>= 1; }
+    return ones;
+}
+
+static FORCEINLINE int32_t __popcnt_int64(uint64_t v) {
+    // Tally the lowest bit, then shift it out, until no set bits remain.
+    int ones = 0;
+    while (v != 0) { ones += (v & 1); v >>= 1; }
+    return ones;
+}
+
+static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) {
+    // An all-zero input has no set bit to stop at; report the full width.
+    if (v == 0)
+        return 32;
+
+    // Shift right until a set bit reaches position 0, counting the shifts.
+    int shifts = 0;
+    for (; (v & 1) == 0; v >>= 1)
+        ++shifts;
+    return shifts;
+}
+
+static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) {
+    // An all-zero input has no set bit to stop at; report the full width.
+    if (v == 0)
+        return 64;
+
+    // Shift right until a set bit reaches position 0, counting the shifts.
+    int shifts = 0;
+    for (; (v & 1) == 0; v >>= 1)
+        ++shifts;
+    return shifts;
+}
+
+static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) {
+    if (v == 0)
+        return 32;  // no bit set: all 32 positions count as leading zeros
+    // Shift left until bit 31 is set, counting the shifts that were needed.
+    int count = 0;
+    while ((v & (1u << 31)) == 0) {  // unsigned mask: never shifts a 1 into the sign bit of an int
+        ++count;
+        v <<= 1;
+    }
+    return count;
+}
+
+static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) {
+    // An all-zero input has no set bit to stop at; report the full width.
+    if (v == 0)
+        return 64;
+
+    // Shift left until bit 63 is set, counting the shifts.
+    int shifts = 0;
+    for (; (v & (1ull<<63)) == 0; v <<= 1)
+        ++shifts;
+    return shifts;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// reductions
+
+REDUCE_ADD(float, __vec64_f, __reduce_add_float)  // horizontal reductions over all 64 lanes: sum, min (<) and max (>)
+REDUCE_MINMAX(float, __vec64_f, __reduce_min_float, <)
+REDUCE_MINMAX(float, __vec64_f, __reduce_max_float, >)
+
+REDUCE_ADD(double, __vec64_d, __reduce_add_double)
+REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <)
+REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >)
+
+REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_int32)  // the int32 sum accumulates in, and returns, uint32_t
+REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <)
+REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_max_int32, >)
+
+REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_uint32)
+REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_min_uint32, <)
+REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_max_uint32, >)
+
+REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_int64)  // the int64 sum accumulates in, and returns, uint64_t
+REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_min_int64, <)
+REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_max_int64, >)
+
+REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_uint64)
+REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_min_uint64, <)
+REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >)
+
+///////////////////////////////////////////////////////////////////////////
+// masked load/store
+
+static FORCEINLINE __vec64_i8 __masked_load_8(void *p, __vec64_i1 mask) {
+    // Lanes whose mask bit is set are read from p; other result lanes are never written here.
+    int8_t *src = (int8_t *)p;
+    __vec64_i8 loaded;
+    for (int lane = 0; lane < 64; lane++)
+        if (mask.v & (1ull << lane))
+            loaded.v[lane] = src[lane];
+    return loaded;
+}
+
+static FORCEINLINE __vec64_i16 __masked_load_16(void *p, __vec64_i1 mask) {
+    // Lanes whose mask bit is set are read from p; other result lanes are never written here.
+    int16_t *src = (int16_t *)p;
+    __vec64_i16 loaded;
+    for (int lane = 0; lane < 64; lane++)
+        if (mask.v & (1ull << lane))
+            loaded.v[lane] = src[lane];
+    return loaded;
+}
+
+static FORCEINLINE __vec64_i32 __masked_load_32(void *p, __vec64_i1 mask) {
+    // Lanes whose mask bit is set are read from p; other result lanes are never written here.
+    int32_t *src = (int32_t *)p;
+    __vec64_i32 loaded;
+    for (int lane = 0; lane < 64; lane++)
+        if (mask.v & (1ull << lane))
+            loaded.v[lane] = src[lane];
+    return loaded;
+}
+
+static FORCEINLINE __vec64_i64 __masked_load_64(void *p, __vec64_i1 mask) {
+    // Lanes whose mask bit is set are read from p; other result lanes are never written here.
+    int64_t *src = (int64_t *)p;
+    __vec64_i64 loaded;
+    for (int lane = 0; lane < 64; lane++)
+        if (mask.v & (1ull << lane))
+            loaded.v[lane] = src[lane];
+    return loaded;
+}
+
+// Writes lane i of val to ptr[i] for every lane whose bit is set in mask.v; other slots are untouched.
+static FORCEINLINE void __masked_store_8(void *p, __vec64_i8 val, __vec64_i1 mask) {
+    int8_t *dst = (int8_t *)p;
+    for (int lane = 0; lane < 64; ++lane)
+        if ((mask.v & (1ull << lane)) != 0)
+            dst[lane] = val.v[lane];
+}
+
+// Writes lane i of val to ptr[i] for every lane whose bit is set in mask.v; other slots are untouched.
+static FORCEINLINE void __masked_store_16(void *p, __vec64_i16 val, __vec64_i1 mask) {
+    int16_t *dst = (int16_t *)p;
+    for (int lane = 0; lane < 64; ++lane)
+        if ((mask.v & (1ull << lane)) != 0)
+            dst[lane] = val.v[lane];
+}
+
+// Writes lane i of val to ptr[i] for every lane whose bit is set in mask.v; other slots are untouched.
+static FORCEINLINE void __masked_store_32(void *p, __vec64_i32 val, __vec64_i1 mask) {
+    int32_t *dst = (int32_t *)p;
+    for (int lane = 0; lane < 64; ++lane)
+        if ((mask.v & (1ull << lane)) != 0)
+            dst[lane] = val.v[lane];
+}
+
+// Writes lane i of val to ptr[i] for every lane whose bit is set in mask.v; other slots are untouched.
+static FORCEINLINE void __masked_store_64(void *p, __vec64_i64 val, __vec64_i1 mask) {
+    int64_t *dst = (int64_t *)p;
+    for (int lane = 0; lane < 64; ++lane)
+        if ((mask.v & (1ull << lane)) != 0)
+            dst[lane] = val.v[lane];
+}
+
+// Blend flavour of the masked 8-bit store; it forwards directly to __masked_store_8.
+static FORCEINLINE void __masked_store_blend_8(void *p, __vec64_i8 val, __vec64_i1 mask) {
+    __masked_store_8(p, val, mask);
+}
+
+// Blend flavour of the masked 16-bit store; it forwards directly to __masked_store_16.
+static FORCEINLINE void __masked_store_blend_16(void *p, __vec64_i16 val, __vec64_i1 mask) {
+    __masked_store_16(p, val, mask);
+}
+
+// Blend flavour of the masked 32-bit store; it forwards directly to __masked_store_32.
+static FORCEINLINE void __masked_store_blend_32(void *p, __vec64_i32 val, __vec64_i1 mask) {
+    __masked_store_32(p, val, mask);
+}
+
+// Blend flavour of the masked 64-bit store; it forwards directly to __masked_store_64.
+static FORCEINLINE void __masked_store_blend_64(void *p, __vec64_i64 val, __vec64_i1 mask) {
+    __masked_store_64(p, val, mask);
+}
+
+///////////////////////////////////////////////////////////////////////////
+// gather/scatter
+
+// offsets * offsetScale is in bytes (for all of these).  The product is formed in 64 bits so a
+// negative 32-bit offset (assuming int32_t lanes) is not wrapped to unsigned by the uint32_t scale.
+#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                  \
+static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset,    \
+                              uint32_t scale, OTYPE constOffset, \
+                              __vec64_i1 mask) {                        \
+    VTYPE ret;                                                          \
+    int8_t *base = (int8_t *)b;                                         \
+    for (int i = 0; i < 64; ++i)                                        \
+        if ((mask.v & (1ull << i)) != 0) {                                 \
+            STYPE *ptr = (STYPE *)(base +                               \
+                (int64_t)scale * varyingOffset.v[i] + constOffset.v[i]); \
+            ret.v[i] = *ptr;                                            \
+        }                                                               \
+    return ret;                                                         \
+}
+    
+
+GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_base_offsets32_i8)
+GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_base_offsets64_i8)
+GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_base_offsets32_i16)
+GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_base_offsets64_i16)
+GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_base_offsets32_i32)
+GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_base_offsets64_i32)
+GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_base_offsets32_i64)
+GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_base_offsets64_i64)
+
+#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)                        \
+/* Each active lane i reads one STYPE through the address in ptrs.v[i]. */ \
+static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) {             \
+    VTYPE gathered;                                                        \
+    for (int lane = 0; lane < 64; ++lane) {                                \
+        if ((mask.v & (1ull << lane)) != 0)                                \
+            gathered.v[lane] = *(STYPE *)ptrs.v[lane];                     \
+    }                                                                      \
+    return gathered;                                                       \
+}
+
+GATHER_GENERAL(__vec64_i8, int8_t, __vec64_i32, __gather32_i8)
+GATHER_GENERAL(__vec64_i8, int8_t, __vec64_i64, __gather64_i8)
+GATHER_GENERAL(__vec64_i16, int16_t, __vec64_i32, __gather32_i16)
+GATHER_GENERAL(__vec64_i16, int16_t, __vec64_i64, __gather64_i16)
+GATHER_GENERAL(__vec64_i32, int32_t, __vec64_i32, __gather32_i32)
+GATHER_GENERAL(__vec64_i32, int32_t, __vec64_i64, __gather64_i32)
+GATHER_GENERAL(__vec64_i64, int64_t, __vec64_i32, __gather32_i64)
+GATHER_GENERAL(__vec64_i64, int64_t, __vec64_i64, __gather64_i64)
+
+// scatter: each active lane stores to base + scale * varyingOffset + constOffset (bytes).  The
+// product is formed in 64 bits so a negative 32-bit offset is not wrapped by the uint32_t scale.
+#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                 \
+static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset,     \
+                             uint32_t scale, OTYPE constOffset,         \
+                             VTYPE val, __vec64_i1 mask) {              \
+    int8_t *base = (int8_t *)b;                                         \
+    for (int i = 0; i < 64; ++i)                                        \
+        if ((mask.v & (1ull << i)) != 0) {                                 \
+            STYPE *ptr = (STYPE *)(base +                               \
+                (int64_t)scale * varyingOffset.v[i] + constOffset.v[i]); \
+            *ptr = val.v[i];                                            \
+        }                                                               \
+}
+    
+
+SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_base_offsets32_i8)
+SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_base_offsets64_i8)
+SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_base_offsets32_i16)
+SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_base_offsets64_i16)
+SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_base_offsets32_i32)
+SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_base_offsets64_i32)
+SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_base_offsets32_i64)
+SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_base_offsets64_i64)
+
+#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)                 \
+static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec64_i1 mask) {  \
+    /* Each active lane i writes val.v[i] through the address in ptrs.v[i]. */ \
+    for (int i = 0; i < 64; ++i)                                     \
+        if ((mask.v & (1ull << i)) != 0) {                              \
+            STYPE *ptr = (STYPE *)ptrs.v[i];                         \
+            *ptr = val.v[i];                                         \
+        }                                                            \
+}
+
+SCATTER_GENERAL(__vec64_i8, int8_t, __vec64_i32, __scatter32_i8)
+SCATTER_GENERAL(__vec64_i8, int8_t, __vec64_i64, __scatter64_i8)
+SCATTER_GENERAL(__vec64_i16, int16_t, __vec64_i32, __scatter32_i16)
+SCATTER_GENERAL(__vec64_i16, int16_t, __vec64_i64, __scatter64_i16)
+SCATTER_GENERAL(__vec64_i32, int32_t, __vec64_i32, __scatter32_i32)
+SCATTER_GENERAL(__vec64_i32, int32_t, __vec64_i64, __scatter64_i32)
+SCATTER_GENERAL(__vec64_i64, int64_t, __vec64_i32, __scatter32_i64)
+SCATTER_GENERAL(__vec64_i64, int64_t, __vec64_i64, __scatter64_i64)
+
+///////////////////////////////////////////////////////////////////////////
+// packed load/store
+
+// Reads consecutive values from ptr into the active lanes of *val, in lane order, and
+// returns how many values were read; lanes with a clear mask bit are not written.
+static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec64_i32 *val, __vec64_i1 mask) {
+    const int32_t *next = ptr;
+    for (int lane = 0; lane < 64; ++lane) {
+        if ((mask.v & (1ull << lane)) != 0)
+            val->v[lane] = *next++;
+    }
+    const int loaded = (int)(next - ptr);
+    return loaded;
+}
+
+
+// Writes the active lanes of val to consecutive slots starting at ptr, in lane order,
+// and returns how many values were written.
+static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec64_i32 val, __vec64_i1 mask) {
+    int32_t *out = ptr;
+    for (int lane = 0; lane < 64; ++lane) {
+        if ((mask.v & (1ull << lane)) != 0)
+            *out++ = val.v[lane];
+    }
+    const int stored = (int)(out - ptr);
+    return stored;
+}
+
+// Unsigned-pointer overload: reads consecutive values from ptr into the active lanes of
+// *val, in lane order, and returns how many values were read.
+static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec64_i32 *val,
+                                                __vec64_i1 mask) {
+    const uint32_t *next = ptr;
+    for (int lane = 0; lane < 64; ++lane) {
+        if ((mask.v & (1ull << lane)) != 0)
+            val->v[lane] = *next++;
+    }
+    const int loaded = (int)(next - ptr);
+    return loaded;
+}
+
+
+// Unsigned-pointer overload: writes the active lanes of val to consecutive slots
+// starting at ptr, in lane order, and returns how many values were written.
+static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec64_i32 val,
+                                                 __vec64_i1 mask) {
+    uint32_t *out = ptr;
+    for (int lane = 0; lane < 64; ++lane) {
+        if ((mask.v & (1ull << lane)) != 0)
+            *out++ = val.v[lane];
+    }
+    const int stored = (int)(out - ptr);
+    return stored;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// aos/soa
+
+static FORCEINLINE void __soa_to_aos3_float(__vec64_f v0, __vec64_f v1,
+                                            __vec64_f v2, float *ptr) {
+    for (int lane = 0; lane < 64; ++lane) {  // lane i fills ptr[3*i .. 3*i+2]
+        ptr[3 * lane + 0] = __extract_element(v0, lane);
+        ptr[3 * lane + 1] = __extract_element(v1, lane);
+        ptr[3 * lane + 2] = __extract_element(v2, lane);
+    }
+}
+
+static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec64_f *out0,
+                                            __vec64_f *out1, __vec64_f *out2) {
+    for (int lane = 0; lane < 64; ++lane) {  // lane i is fed from ptr[3*i .. 3*i+2]
+        __insert_element(out0, lane, ptr[3 * lane + 0]);
+        __insert_element(out1, lane, ptr[3 * lane + 1]);
+        __insert_element(out2, lane, ptr[3 * lane + 2]);
+    }
+}
+
+static FORCEINLINE void __soa_to_aos4_float(__vec64_f v0, __vec64_f v1,
+                                            __vec64_f v2, __vec64_f v3, float *ptr) {
+    for (int lane = 0; lane < 64; ++lane) {  // lane i fills ptr[4*i .. 4*i+3]
+        ptr[4 * lane + 0] = __extract_element(v0, lane);
+        ptr[4 * lane + 1] = __extract_element(v1, lane);
+        ptr[4 * lane + 2] = __extract_element(v2, lane);
+        ptr[4 * lane + 3] = __extract_element(v3, lane);
+    }
+}
+
+static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec64_f *out0,
+                                            __vec64_f *out1, __vec64_f *out2, __vec64_f *out3) {
+    for (int lane = 0; lane < 64; ++lane) {  // lane i is fed from ptr[4*i .. 4*i+3]
+        __insert_element(out0, lane, ptr[4 * lane + 0]);
+        __insert_element(out1, lane, ptr[4 * lane + 1]);
+        __insert_element(out2, lane, ptr[4 * lane + 2]);
+        __insert_element(out3, lane, ptr[4 * lane + 3]);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////
+// prefetch
+
+// The body is empty, so this prefetch request does nothing here.
+static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *) {}
+
+// The body is empty, so this prefetch request does nothing here.
+static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *) {}
+
+// The body is empty, so this prefetch request does nothing here.
+static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *) {}
+
+// The body is empty, so this prefetch request does nothing here.
+static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *) {}
+
+///////////////////////////////////////////////////////////////////////////
+// atomics
+
+static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) {
+#if !defined(_MSC_VER)
+    return __sync_fetch_and_add(p, v);  // yields the value *p held before the add
+#else
+    return InterlockedAdd((LONG volatile *)p, v) - v;  // call yields the sum, so undo the add
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) {
+#if !defined(_MSC_VER)
+    return __sync_fetch_and_sub(p, v);  // yields the value *p held before the subtract
+#else
+    return InterlockedAdd((LONG volatile *)p, -v) + v;  // adds -v, then recovers the prior value
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) {
+#if !defined(_MSC_VER)
+    return __sync_fetch_and_and(p, v);  // atomic *p &= v, yielding the prior value
+#else
+    return InterlockedAnd((LONG volatile *)p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) {
+#if !defined(_MSC_VER)
+    return __sync_fetch_and_or(p, v);  // atomic *p |= v, yielding the prior value
+#else
+    return InterlockedOr((LONG volatile *)p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) {
+#if !defined(_MSC_VER)
+    return __sync_fetch_and_xor(p, v);  // atomic *p ^= v, yielding the prior value
+#else
+    return InterlockedXor((LONG volatile *)p, v);
+#endif
+}
+
+static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) {  // signed minimum; returns the value read before the update
+    int32_t old, min;
+    do {
+        old = *((volatile int32_t *)p);  // snapshot *p, viewed as a signed value
+        min = (old < (int32_t)v) ? old : (int32_t)v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, min) == false);  // retry when *p no longer equals the snapshot
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) {  // signed maximum; returns the value read before the update
+    int32_t old, max;
+    do {
+        old = *((volatile int32_t *)p);  // snapshot *p, viewed as a signed value
+        max = (old > (int32_t)v) ? old : (int32_t)v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, max) == false);  // retry when *p no longer equals the snapshot
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) {  // unsigned minimum; returns the value read before the update
+    uint32_t old, min;
+    do {
+        old = *((volatile uint32_t *)p);  // snapshot the current value of *p
+        min = (old < v) ? old : v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, min) == false);  // retry when *p no longer equals the snapshot
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) {  // unsigned maximum; returns the value read before the update
+    uint32_t old, max;
+    do {
+        old = *((volatile uint32_t *)p);  // snapshot the current value of *p
+        max = (old > v) ? old : v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, max) == false);  // retry when *p no longer equals the snapshot
+#endif
+    return old;
+}
+
+static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) {
+#if !defined(_MSC_VER)
+    return __sync_lock_test_and_set(p, v);  // stores v and yields the previous contents
+#else
+    return InterlockedExchange((LONG volatile *)p, v);
+#endif
+}
+
+// Stores newval into *p only if *p equals cmpval; returns what the primitive reports as the old value.
+static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, uint32_t newval) {
+#if !defined(_MSC_VER)
+    return __sync_val_compare_and_swap(p, cmpval, newval);
+#else
+    return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval);  // note the swapped argument order
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) {
+#if !defined(_MSC_VER)
+    return __sync_fetch_and_add(p, v);  // yields the value *p held before the add
+#else
+    return InterlockedAdd64((LONGLONG volatile *)p, v) - v;  // call yields the sum, so undo the add
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) {
+#if !defined(_MSC_VER)
+    return __sync_fetch_and_sub(p, v);  // yields the value *p held before the subtract
+#else
+    return InterlockedAdd64((LONGLONG volatile *)p, -v) + v;  // adds -v, then recovers the prior value
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) {  // atomic *p &= v; returns the prior value
+#ifdef _MSC_VER
+    return InterlockedAnd64((LONGLONG volatile *)p, v);  // already the original value: no "- v" fix-up
+#else
+    return __sync_fetch_and_and(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) {  // atomic *p |= v; returns the prior value
+#ifdef _MSC_VER
+    return InterlockedOr64((LONGLONG volatile *)p, v);  // already the original value: no "- v" fix-up
+#else
+    return __sync_fetch_and_or(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) {  // atomic *p ^= v; returns the prior value
+#ifdef _MSC_VER
+    return InterlockedXor64((LONGLONG volatile *)p, v);  // already the original value: no "- v" fix-up
+#else
+    return __sync_fetch_and_xor(p, v);
+#endif
+}
+
+static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) {  // signed minimum; returns the value read before the update
+    int64_t old, min;
+    do {
+        old = *((volatile int64_t *)p);  // snapshot *p, viewed as a signed value
+        min = (old < (int64_t)v) ? old : (int64_t)v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, min) == false);  // retry when *p no longer equals the snapshot
+#endif
+    return old;
+}
+
+static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) {  // signed maximum; returns the value read before the update
+    int64_t old, max;
+    do {
+        old = *((volatile int64_t *)p);  // snapshot *p, viewed as a signed value
+        max = (old > (int64_t)v) ? old : (int64_t)v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, max) == false);  // retry when *p no longer equals the snapshot
+#endif
+    return old;
+}
+
+static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) {  // unsigned minimum; returns the value read before the update
+    uint64_t old, min;
+    do {
+        old = *((volatile uint64_t *)p);  // snapshot the current value of *p
+        min = (old < v) ? old : v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, min) == false);  // retry when *p no longer equals the snapshot
+#endif
+    return old;
+}
+
+static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) {  // unsigned maximum; returns the value read before the update
+    uint64_t old, max;
+    do {
+        old = *((volatile uint64_t *)p);  // snapshot the current value of *p
+        max = (old > v) ? old : v;
+#ifdef _MSC_VER
+    } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old);
+#else
+    } while (__sync_bool_compare_and_swap(p, old, max) == false);  // retry when *p no longer equals the snapshot
+#endif
+    return old;
+}
+
+static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) {
+#if !defined(_MSC_VER)
+    return __sync_lock_test_and_set(p, v);  // stores v and yields the previous contents
+#else
+    return InterlockedExchange64((LONGLONG volatile *)p, v);
+#endif
+}
+
+// Stores newval into *p only if *p equals cmpval; returns what the primitive reports as the old value.
+static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, uint64_t newval) {
+#if !defined(_MSC_VER)
+    return __sync_val_compare_and_swap(p, cmpval, newval);
+#else
+    return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval);  // note the swapped argument order
+#endif
+}
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index c6299893..9f301bb7 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -224,8 +224,8 @@ CAST_BITS_SCALAR(double, int64_t)
 ///////////////////////////////////////////////////////////////////////////
 // mask ops
 
-static FORCEINLINE uint32_t __movmsk(__vec4_i1 mask) {
-    return _mm_movemask_ps(mask.v);
+static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) {
+    return (uint64_t)_mm_movemask_ps(mask.v);
 }
 
 static FORCEINLINE __vec4_i1 __equal(__vec4_i1 a, __vec4_i1 b) {
@@ -266,6 +266,10 @@ static FORCEINLINE void __store(__vec4_i1 *p, __vec4_i1 value, int align) {
     _mm_storeu_ps((float *)(&p->v), value.v);
 }
 
+static FORCEINLINE __vec4_i1 __smear_i1(__vec4_i1, int v) {  // first parameter is unnamed and unused in the body
+    return __vec4_i1(v, v, v, v);  // the same v is passed for all four constructor arguments
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
 
@@ -489,7 +493,7 @@ static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) {
     ((int8_t *)v)[index] = val;
 }
 
-static FORCEINLINE __vec4_i8 __smear_i8(int8_t v) {
+static FORCEINLINE __vec4_i8 __smear_i8(__vec4_i8, int8_t v) {
     return _mm_set1_epi8(v);
 }
 
@@ -748,7 +752,7 @@ static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val)
     ((int16_t *)v)[index] = val;
 }
 
-static FORCEINLINE __vec4_i16 __smear_i16(int16_t v) {
+static FORCEINLINE __vec4_i16 __smear_i16(__vec4_i16, int16_t v) {
     return _mm_set1_epi16(v);
 }
 
@@ -985,7 +989,7 @@ static FORCEINLINE __vec4_i32 __select(__vec4_i1 mask, __vec4_i32 a, __vec4_i32
                                           _mm_castsi128_ps(a.v), mask.v));
 }
 
-static FORCEINLINE __vec4_i32 __smear_i32(int32_t v) {
+static FORCEINLINE __vec4_i32 __smear_i32(__vec4_i32, int32_t v) {
     return _mm_set1_epi32(v);
 }
 
@@ -1246,7 +1250,7 @@ static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64
     return __vec4_i64(_mm_castpd_si128(r0), _mm_castpd_si128(r1));
 }
 
-static FORCEINLINE __vec4_i64 __smear_i64(int64_t v) {
+static FORCEINLINE __vec4_i64 __smear_i64(__vec4_i64, int64_t v) {
     return __vec4_i64(v, v, v, v);
 }
 
@@ -1350,7 +1354,7 @@ static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) {
     return _mm_blendv_ps(b.v, a.v, mask.v);
 }
 
-static FORCEINLINE __vec4_f __smear_float(float v) {
+static FORCEINLINE __vec4_f __smear_float(__vec4_f, float v) {
     return _mm_set1_ps(v);
 }
 
@@ -1482,7 +1486,7 @@ static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) {
     return __vec4_d(r0, r1);
 }
 
-static FORCEINLINE __vec4_d __smear_double(double v) {
+static FORCEINLINE __vec4_d __smear_double(__vec4_d, double v) {
     return __vec4_d(_mm_set1_pd(v), _mm_set1_pd(v));
 }
 
@@ -1582,11 +1586,13 @@ static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i8 val) {
 }
 
 static FORCEINLINE __vec4_i8 __cast_sext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(0xff), __smear_i8(0));
+    return __select(v, __smear_i8(__vec4_i8(), 0xff), 
+                       __smear_i8(__vec4_i8(), 0));
 }
 
 static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(0xffff), __smear_i16(0));
+    return __select(v, __smear_i16(__vec4_i16(), 0xffff),
+                       __smear_i16(__vec4_i16(), 0));
 }
 
 static FORCEINLINE __vec4_i32 __cast_sext(__vec4_i32, __vec4_i1 v) {
@@ -1646,11 +1652,12 @@ static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i8 val) {
 }
 
 static FORCEINLINE __vec4_i8 __cast_zext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(1), __smear_i8(0));
+    return __select(v, __smear_i8(__vec4_i8(), 1), __smear_i8(__vec4_i8(), 0));
 }
 
 static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(1), __smear_i16(0));
+    return __select(v, __smear_i16(__vec4_i16(), 1), 
+                       __smear_i16(__vec4_i16(), 0));
 }
 
 static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
@@ -1658,7 +1665,7 @@ static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
 }
 
 static FORCEINLINE __vec4_i64 __cast_zext(__vec4_i64, __vec4_i1 v) {
-    return __select(v, __smear_i64(1), __smear_i64(0));
+    return __select(v, __smear_i64(__vec4_i64(), 1), __smear_i64(__vec4_i64(), 0));
 }
 
 // truncations
@@ -1818,11 +1825,11 @@ static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i64 val) {
 }
 
 static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i1 v) {
-    return __select(v, __smear_float(1.), __smear_float(0.));
+    return __select(v, __smear_float(__vec4_f(), 1.), __smear_float(__vec4_f(), 0.));
 }
 
 static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i1 v) {
-    return __select(v, __smear_double(1.), __smear_double(0.));
+    return __select(v, __smear_double(__vec4_d(), 1.), __smear_double(__vec4_d(), 0.));
 }
 
 // float/double to signed int
@@ -2613,8 +2620,8 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
-    offsets = __select(mask, offsets, __smear_i32(0));
-    constOffset = __select(mask, constOffset, __smear_i32(0));
+    offsets = __select(mask, offsets, __smear_i32(__vec4_i32(), 0));
+    constOffset = __select(mask, constOffset, __smear_i32(__vec4_i32(), 0));
 
     int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
     RetScalar *ptr = (RetScalar *)(p + offset);
@@ -2671,8 +2678,8 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
-    offsets = __select(mask, offsets, __smear_i64(0));
-    constOffset = __select(mask, constOffset, __smear_i64(0));
+    offsets = __select(mask, offsets, __smear_i64(__vec4_i64(), 0));
+    constOffset = __select(mask, constOffset, __smear_i64(__vec4_i64(), 0));
 
     int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
     RetScalar *ptr = (RetScalar *)(p + offset);
@@ -2756,8 +2763,8 @@ __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
     __m128i r = _mm_set_epi32(0, 0, 0, 0);
 #if 1
     // "Fast gather"...
-    offsets = __select(mask, offsets, __smear_i32(0));
-    constOffset = __select(mask, constOffset, __smear_i32(0));
+    offsets = __select(mask, offsets, __smear_i32(__vec4_i32(), 0));
+    constOffset = __select(mask, constOffset, __smear_i32(__vec4_i32(), 0));
 
     int offset = scale * _mm_extract_epi32(offsets.v, 0) +
         _mm_extract_epi32(constOffset.v, 0);
diff --git a/examples/timing.h b/examples/timing.h
index f61fbce8..7d746d45 100644
--- a/examples/timing.h
+++ b/examples/timing.h
@@ -43,9 +43,15 @@ extern "C" {
 #endif /* __cplusplus */
     __inline__ uint64_t rdtsc() {
         uint32_t low, high;
+#ifdef __x86_64
         __asm__ __volatile__ (
             "xorl %%eax,%%eax \n    cpuid"
             ::: "%rax", "%rbx", "%rcx", "%rdx" );
+#else
+        __asm__ __volatile__ (
+            "xorl %%eax,%%eax \n    cpuid"
+            ::: "%eax", "%ebx", "%ecx", "%edx" );
+#endif
         __asm__ __volatile__ (
                               "rdtsc" : "=a" (low), "=d" (high));
         return (uint64_t)high << 32 | low;
diff --git a/expr.cpp b/expr.cpp
index 17541012..7c270f46 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -68,6 +68,7 @@
 #include <llvm/ExecutionEngine/GenericValue.h>
 #include <llvm/Support/InstIterator.h>
 
+
 /////////////////////////////////////////////////////////////////////////////////////
 // Expr
 
@@ -147,7 +148,7 @@ lMaybeIssuePrecisionWarning(const AtomicType *toAtomicType,
 
 static Expr *
 lArrayToPointer(Expr *expr) {
-    Assert(expr && dynamic_cast<const ArrayType *>(expr->GetType()));
+    AssertPos(expr->pos, expr && CastType<ArrayType>(expr->GetType()));
 
     Expr *zero = new ConstExpr(AtomicType::UniformInt32, 0, expr->pos);
     Expr *index = new IndexExpr(expr, zero, expr->pos);
@@ -188,7 +189,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
             bool failureOk, const char *errorMsgBase, SourcePos pos) {
     /* This function is way too long and complex.  Is type conversion stuff
        always this messy, or can this be cleaned up somehow? */
-    Assert(failureOk || errorMsgBase != NULL);
+    AssertPos(pos, failureOk || errorMsgBase != NULL);
 
     if (toType == NULL || fromType == NULL)
         return false;
@@ -211,14 +212,30 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
         return false;
     }
 
-    if (dynamic_cast<const FunctionType *>(fromType)) {
-        if (!failureOk)
-            Error(pos, "Can't convert function type \"%s\" to \"%s\" for %s.",
-                  fromType->GetString().c_str(),
-                  toType->GetString().c_str(), errorMsgBase);
-        return false;
+    if (CastType<FunctionType>(fromType)) {
+        if (CastType<PointerType>(toType) != NULL) {
+            // Convert function type to pointer to function type
+            if (expr != NULL) {
+                Expr *aoe = new AddressOfExpr(*expr, (*expr)->pos);
+                if (lDoTypeConv(aoe->GetType(), toType, &aoe, failureOk,
+                                errorMsgBase, pos)) {
+                    *expr = aoe;
+                    return true;
+                }
+            }
+            else
+                return lDoTypeConv(PointerType::GetUniform(fromType), toType, NULL,
+                                   failureOk, errorMsgBase, pos);
+        }
+        else {
+            if (!failureOk)
+                Error(pos, "Can't convert function type \"%s\" to \"%s\" for %s.",
+                      fromType->GetString().c_str(),
+                      toType->GetString().c_str(), errorMsgBase);
+            return false;
+        }
     }
-    if (dynamic_cast<const FunctionType *>(toType)) {
+    if (CastType<FunctionType>(toType)) {
         if (!failureOk)
             Error(pos, "Can't convert from type \"%s\" to function type \"%s\" "
                   "for %s.", fromType->GetString().c_str(),
@@ -236,31 +253,36 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
         return false;
     }
 
-    const ArrayType *toArrayType = dynamic_cast<const ArrayType *>(toType);
-    const ArrayType *fromArrayType = dynamic_cast<const ArrayType *>(fromType);
-    const VectorType *toVectorType = dynamic_cast<const VectorType *>(toType);
-    const VectorType *fromVectorType = dynamic_cast<const VectorType *>(fromType);
-    const StructType *toStructType = dynamic_cast<const StructType *>(toType);
-    const StructType *fromStructType = dynamic_cast<const StructType *>(fromType);
-    const EnumType *toEnumType = dynamic_cast<const EnumType *>(toType);
-    const EnumType *fromEnumType = dynamic_cast<const EnumType *>(fromType);
-    const AtomicType *toAtomicType = dynamic_cast<const AtomicType *>(toType);
-    const AtomicType *fromAtomicType = dynamic_cast<const AtomicType *>(fromType);
-    const PointerType *fromPointerType = dynamic_cast<const PointerType *>(fromType);
-    const PointerType *toPointerType = dynamic_cast<const PointerType *>(toType);
+    const ArrayType *toArrayType = CastType<ArrayType>(toType);
+    const ArrayType *fromArrayType = CastType<ArrayType>(fromType);
+    const VectorType *toVectorType = CastType<VectorType>(toType);
+    const VectorType *fromVectorType = CastType<VectorType>(fromType);
+    const StructType *toStructType = CastType<StructType>(toType);
+    const StructType *fromStructType = CastType<StructType>(fromType);
+    const EnumType *toEnumType = CastType<EnumType>(toType);
+    const EnumType *fromEnumType = CastType<EnumType>(fromType);
+    const AtomicType *toAtomicType = CastType<AtomicType>(toType);
+    const AtomicType *fromAtomicType = CastType<AtomicType>(fromType);
+    const PointerType *fromPointerType = CastType<PointerType>(fromType);
+    const PointerType *toPointerType = CastType<PointerType>(toType);
 
     // Do this early, since for the case of a conversion like
     // "float foo[10]" -> "float * uniform foo", we have what's seemingly
     // a varying to uniform conversion (but not really)
     if (fromArrayType != NULL && toPointerType != NULL) {
+        // can convert any array to a void pointer (both uniform and
+        // varying).
+        if (PointerType::IsVoidPointer(toPointerType))
+            goto typecast_ok;
+
         // array to pointer to array element type
         const Type *eltType = fromArrayType->GetElementType();
         if (toPointerType->GetBaseType()->IsConstType())
             eltType = eltType->GetAsConstType();
-        if (Type::Equal(toPointerType, 
-                        new PointerType(eltType,
-                                        toPointerType->GetVariability(),
-                                        toPointerType->IsConstType())))
+
+        PointerType pt(eltType, toPointerType->GetVariability(),
+                       toPointerType->IsConstType());
+        if (Type::Equal(toPointerType, &pt))
             goto typecast_ok;
         else {
             if (!failureOk)
@@ -281,7 +303,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
     }
 
     if (fromPointerType != NULL) {
-        if (dynamic_cast<const AtomicType *>(toType) != NULL &&
+        if (CastType<AtomicType>(toType) != NULL &&
             toType->IsBoolType())
             // Allow implicit conversion of pointers to bools
             goto typecast_ok;
@@ -323,8 +345,8 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
                  !Type::Equal(fromPointerType->GetBaseType()->GetAsConstType(), 
                               toPointerType->GetBaseType())) {
             if (!failureOk)
-                Error(pos, "Can't convert between incompatible pointer types "
-                      "\"%s\" and \"%s\" for %s.",
+                Error(pos, "Can't convert from pointer type \"%s\" to "
+                      "incompatible pointer type \"%s\" for %s.",
                       fromPointerType->GetString().c_str(),
                       toPointerType->GetString().c_str(), errorMsgBase);
             return false;
@@ -370,11 +392,13 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
 
     // Convert from type T -> const T; just return a TypeCast expr, which
     // can handle this
-    if (Type::Equal(toType, fromType->GetAsConstType()))
+    if (Type::EqualIgnoringConst(toType, fromType) &&
+        toType->IsConstType() == true &&
+        fromType->IsConstType() == false)
         goto typecast_ok;
     
-    if (dynamic_cast<const ReferenceType *>(fromType)) {
-        if (dynamic_cast<const ReferenceType *>(toType)) {
+    if (CastType<ReferenceType>(fromType)) {
+        if (CastType<ReferenceType>(toType)) {
             // Convert from a reference to a type to a const reference to a type;
             // this is handled by TypeCastExpr
             if (Type::Equal(toType->GetReferenceTarget(),
@@ -382,9 +406,9 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
                 goto typecast_ok;
 
             const ArrayType *atFrom = 
-                dynamic_cast<const ArrayType *>(fromType->GetReferenceTarget());
+                CastType<ArrayType>(fromType->GetReferenceTarget());
             const ArrayType *atTo = 
-                dynamic_cast<const ArrayType *>(toType->GetReferenceTarget());
+                CastType<ArrayType>(toType->GetReferenceTarget());
 
             if (atFrom != NULL && atTo != NULL && 
                 Type::Equal(atFrom->GetElementType(), atTo->GetElementType())) {
@@ -414,7 +438,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
                                    failureOk, errorMsgBase, pos);
         }
     }
-    else if (dynamic_cast<const ReferenceType *>(toType)) {
+    else if (CastType<ReferenceType>(toType)) {
         // T -> reference T
         if (expr != NULL) {
             Expr *rExpr = new ReferenceExpr(*expr, pos);
@@ -425,9 +449,10 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
             }
             return false;
         }
-        else
-            return lDoTypeConv(new ReferenceType(fromType), toType, NULL, 
-                               failureOk, errorMsgBase, pos);
+        else {
+            ReferenceType rt(fromType);
+            return lDoTypeConv(&rt, toType, NULL, failureOk, errorMsgBase, pos);
+        }
     }
     else if (Type::Equal(toType, fromType->GetAsNonConstType()))
         // convert: const T -> T (as long as T isn't a reference)
@@ -440,7 +465,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
                         fromArrayType->GetElementType())) {
             // the case of different element counts should have returned
             // successfully earlier, yes??
-            Assert(toArrayType->GetElementCount() != fromArrayType->GetElementCount());
+            AssertPos(pos, toArrayType->GetElementCount() != fromArrayType->GetElementCount());
             goto typecast_ok;
         }
         else if (Type::Equal(toArrayType->GetElementType(), 
@@ -496,7 +521,7 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
 
     // enum -> atomic (integer, generally...) is always ok
     if (fromEnumType != NULL) {
-        Assert(toAtomicType != NULL || toVectorType != NULL);
+        AssertPos(pos, toAtomicType != NULL || toVectorType != NULL);
         goto typecast_ok;
     }
 
@@ -571,8 +596,8 @@ bool
 PossiblyResolveFunctionOverloads(Expr *expr, const Type *type) {
     FunctionSymbolExpr *fse = NULL;
     const FunctionType *funcType = NULL;
-    if (dynamic_cast<const PointerType *>(type) != NULL &&
-        (funcType = dynamic_cast<const FunctionType *>(type->GetBaseType())) &&
+    if (CastType<PointerType>(type) != NULL &&
+        (funcType = CastType<FunctionType>(type->GetBaseType())) &&
         (fse = dynamic_cast<FunctionSymbolExpr *>(expr)) != NULL) {
         // We're initializing a function pointer with a function symbol,
         // which in turn may represent an overloaded function.  So we need
@@ -616,19 +641,23 @@ InitSymbol(llvm::Value *ptr, const Type *symType, Expr *initExpr,
         // instead we'll make a constant static global that holds the
         // constant value and emit a memcpy to put its value into the
         // pointer we have.
-        LLVM_TYPE_CONST llvm::Type *llvmType = symType->LLVMType(g->ctx);
+        llvm::Type *llvmType = symType->LLVMType(g->ctx);
         if (llvmType == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return;
         }
 
-        llvm::Value *constPtr = 
-            new llvm::GlobalVariable(*m->module, llvmType, true /* const */, 
-                                     llvm::GlobalValue::InternalLinkage,
-                                     constValue, "const_initializer");
-        llvm::Value *size = g->target.SizeOf(llvmType, 
-                                             ctx->GetCurrentBasicBlock());
-        ctx->MemcpyInst(ptr, constPtr, size);
+        if (Type::IsBasicType(symType))
+            ctx->StoreInst(constValue, ptr);
+        else {
+            llvm::Value *constPtr = 
+                new llvm::GlobalVariable(*m->module, llvmType, true /* const */, 
+                                         llvm::GlobalValue::InternalLinkage,
+                                         constValue, "const_initializer");
+            llvm::Value *size = g->target.SizeOf(llvmType, 
+                                                 ctx->GetCurrentBasicBlock());
+            ctx->MemcpyInst(ptr, constPtr, size);
+        }
 
         return;
     }
@@ -667,7 +696,7 @@ InitSymbol(llvm::Value *ptr, const Type *symType, Expr *initExpr,
         return;
     }
 
-    const ReferenceType *rt = dynamic_cast<const ReferenceType *>(symType);
+    const ReferenceType *rt = CastType<ReferenceType>(symType);
     if (rt) {
         if (!Type::Equal(initExpr->GetType(), rt)) {
             Error(initExpr->pos, "Initializer for reference type \"%s\" must have same "
@@ -684,18 +713,17 @@ InitSymbol(llvm::Value *ptr, const Type *symType, Expr *initExpr,
 
     // Handle initiailizers for SOA types as well as for structs, arrays,
     // and vectors.
-    const CollectionType *collectionType = 
-        dynamic_cast<const CollectionType *>(symType);
+    const CollectionType *collectionType = CastType<CollectionType>(symType);
     if (collectionType != NULL || symType->IsSOAType()) {
         int nElements = collectionType ? collectionType->GetElementCount() :
             symType->GetSOAWidth();
 
         std::string name;
-        if (dynamic_cast<const StructType *>(symType) != NULL)
+        if (CastType<StructType>(symType) != NULL)
             name = "struct";
-        else if (dynamic_cast<const ArrayType *>(symType) != NULL) 
+        else if (CastType<ArrayType>(symType) != NULL) 
             name = "array";
-        else if (dynamic_cast<const VectorType *>(symType) != NULL) 
+        else if (CastType<VectorType>(symType) != NULL) 
             name = "vector";
         else if (symType->IsSOAType())
             name = symType->GetVariability().GetString();
@@ -729,12 +757,12 @@ InitSymbol(llvm::Value *ptr, const Type *symType, Expr *initExpr,
                     collectionType ? collectionType->GetElementType(i) : 
                                      symType->GetAsUniformType();
                 if (elementType == NULL) {
-                    Assert(m->errorCount > 0);
+                    AssertPos(pos, m->errorCount > 0);
                     return;
                 }
 
                 llvm::Value *ep;
-                if (dynamic_cast<const StructType *>(symType) != NULL)
+                if (CastType<StructType>(symType) != NULL)
                     ep = ctx->AddElementOffset(ptr, i, NULL, "element");
                 else
                     ep = ctx->GetElementPtrInst(ptr, LLVMInt32(0), LLVMInt32(i), 
@@ -746,9 +774,9 @@ InitSymbol(llvm::Value *ptr, const Type *symType, Expr *initExpr,
                 else {
                     // If we don't have enough initializer values, initialize the
                     // rest as zero.
-                    LLVM_TYPE_CONST llvm::Type *llvmType = elementType->LLVMType(g->ctx);
+                    llvm::Type *llvmType = elementType->LLVMType(g->ctx);
                     if (llvmType == NULL) {
-                        Assert(m->errorCount > 0);
+                        AssertPos(pos, m->errorCount > 0);
                         return;
                     }
 
@@ -779,12 +807,12 @@ lMatchingBoolType(const Type *type) {
     bool uniformTest = type->IsUniformType();
     const AtomicType *boolBase = uniformTest ? AtomicType::UniformBool : 
                                                AtomicType::VaryingBool;
-    const VectorType *vt = dynamic_cast<const VectorType *>(type);
+    const VectorType *vt = CastType<VectorType>(type);
     if (vt != NULL)
         return new VectorType(boolBase, vt->GetElementCount());
     else {
-        Assert(dynamic_cast<const AtomicType *>(type) != NULL ||
-               dynamic_cast<const PointerType *>(type) != NULL);
+        Assert(CastType<AtomicType>(type) != NULL ||
+               CastType<PointerType>(type) != NULL);
         return boolBase;
     }
 }
@@ -794,10 +822,10 @@ lMatchingBoolType(const Type *type) {
 
 static llvm::Constant *
 lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {
-    const AtomicType *atomicType = dynamic_cast<const AtomicType *>(type);
-    const EnumType *enumType = dynamic_cast<const EnumType *>(type);
-    const VectorType *vectorType = dynamic_cast<const VectorType *>(type);
-    const PointerType *pointerType = dynamic_cast<const PointerType *>(type);
+    const AtomicType *atomicType = CastType<AtomicType>(type);
+    const EnumType *enumType = CastType<EnumType>(type);
+    const VectorType *vectorType = CastType<VectorType>(type);
+    const PointerType *pointerType = CastType<PointerType>(type);
 
     // This function is only called with, and only works for atomic, enum,
     // and vector types.
@@ -880,7 +908,7 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {
         // a recursive call to lLLVMConstantValue().
         const Type *baseType = vectorType->GetBaseType();
         llvm::Constant *constElement = lLLVMConstantValue(baseType, ctx, value);
-        LLVM_TYPE_CONST llvm::Type *llvmVectorType = vectorType->LLVMType(ctx);
+        llvm::Type *llvmVectorType = vectorType->LLVMType(ctx);
 
         // Now create a constant version of the corresponding LLVM type that we
         // use to represent the VectorType.
@@ -889,8 +917,8 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {
         // LLVM ArrayTypes leaks into the code here; it feels like this detail
         // should be better encapsulated?
         if (baseType->IsUniformType()) {
-            LLVM_TYPE_CONST llvm::VectorType *lvt = 
-                llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(llvmVectorType);
+            llvm::VectorType *lvt = 
+                llvm::dyn_cast<llvm::VectorType>(llvmVectorType);
             Assert(lvt != NULL);
             std::vector<llvm::Constant *> vals;
             for (unsigned int i = 0; i < lvt->getNumElements(); ++i)
@@ -898,8 +926,8 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {
             return llvm::ConstantVector::get(vals);
         }
         else {
-            LLVM_TYPE_CONST llvm::ArrayType *lat = 
-                llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(llvmVectorType);
+            llvm::ArrayType *lat = 
+                llvm::dyn_cast<llvm::ArrayType>(llvmVectorType);
             Assert(lat != NULL);
             std::vector<llvm::Constant *> vals;
             for (unsigned int i = 0; i < lat->getNumElements(); ++i)
@@ -915,8 +943,8 @@ lMaskForSymbol(Symbol *baseSym, FunctionEmitContext *ctx) {
     if (baseSym == NULL)
         return ctx->GetFullMask();
 
-    if (dynamic_cast<const PointerType *>(baseSym->type) != NULL ||
-        dynamic_cast<const ReferenceType *>(baseSym->type) != NULL)
+    if (CastType<PointerType>(baseSym->type) != NULL ||
+        CastType<ReferenceType>(baseSym->type) != NULL)
         // FIXME: for pointers, we really only want to do this for
         // dereferencing the pointer, not for things like pointer
         // arithmetic, when we may be able to use the internal mask,
@@ -943,8 +971,8 @@ lStoreAssignResult(llvm::Value *value, llvm::Value *ptr, const Type *valueType,
         baseSym != NULL &&
         baseSym->varyingCFDepth == ctx->VaryingCFDepth() &&
         baseSym->storageClass != SC_STATIC &&
-        dynamic_cast<const ReferenceType *>(baseSym->type) == NULL &&
-        dynamic_cast<const PointerType *>(baseSym->type) == NULL) {
+        CastType<ReferenceType>(baseSym->type) == NULL &&
+        CastType<PointerType>(baseSym->type) == NULL) {
         // If the variable is declared at the same varying control flow
         // depth as where it's being assigned, then we don't need to do any
         // masking but can just do the assignment as if all the lanes were
@@ -974,7 +1002,7 @@ lEmitPrePostIncDec(UnaryExpr::Op op, Expr *expr, SourcePos pos,
     // Get both the lvalue and the rvalue of the given expression
     llvm::Value *lvalue = NULL, *rvalue = NULL;
     const Type *lvalueType = NULL;
-    if (dynamic_cast<const ReferenceType *>(type) != NULL) {
+    if (CastType<ReferenceType>(type) != NULL) {
         lvalueType = type;
         type = type->GetReferenceTarget();
         lvalue = expr->GetValue(ctx);
@@ -1004,20 +1032,26 @@ lEmitPrePostIncDec(UnaryExpr::Op op, Expr *expr, SourcePos pos,
     llvm::Value *binop = NULL;
     int delta = (op == UnaryExpr::PreInc || op == UnaryExpr::PostInc) ? 1 : -1;
 
-    if (dynamic_cast<const PointerType *>(type) != NULL) {
+    std::string opName = rvalue->getName().str();
+    if (op == UnaryExpr::PreInc || op == UnaryExpr::PostInc)
+        opName += "_plus1";
+    else
+        opName += "_minus1";
+
+    if (CastType<PointerType>(type) != NULL) {
         const Type *incType = type->IsUniformType() ? AtomicType::UniformInt32 :
             AtomicType::VaryingInt32;
         llvm::Constant *dval = lLLVMConstantValue(incType, g->ctx, delta);
-        binop = ctx->GetElementPtrInst(rvalue, dval, type, "ptr_inc_or_dec");
+        binop = ctx->GetElementPtrInst(rvalue, dval, type, opName.c_str());
     }
     else {
         llvm::Constant *dval = lLLVMConstantValue(type, g->ctx, delta);
         if (type->IsFloatType())
             binop = ctx->BinaryOperator(llvm::Instruction::FAdd, rvalue, 
-                                        dval, "val_inc_or_dec");
+                                        dval, opName.c_str());
         else
             binop = ctx->BinaryOperator(llvm::Instruction::Add, rvalue, 
-                                        dval, "val_inc_or_dec");
+                                        dval, opName.c_str());
     }
 
     // And store the result out to the lvalue
@@ -1046,11 +1080,11 @@ lEmitNegate(Expr *arg, SourcePos pos, FunctionEmitContext *ctx) {
     ctx->SetDebugPos(pos);
     if (type->IsFloatType())
         return ctx->BinaryOperator(llvm::Instruction::FSub, zero, argVal,
-                                   "fnegate");
+                                   LLVMGetName(argVal, "_negate"));
     else {
-        Assert(type->IsIntType());
+        AssertPos(pos, type->IsIntType());
         return ctx->BinaryOperator(llvm::Instruction::Sub, zero, argVal,
-                                   "inegate");
+                                   LLVMGetName(argVal, "_negate"));
     }
 }
 
@@ -1078,11 +1112,11 @@ UnaryExpr::GetValue(FunctionEmitContext *ctx) const {
         return lEmitNegate(expr, pos, ctx);
     case LogicalNot: {
         llvm::Value *argVal = expr->GetValue(ctx);
-        return ctx->NotOperator(argVal, "logicalnot");
+        return ctx->NotOperator(argVal, LLVMGetName(argVal, "_logicalnot"));
     }
     case BitNot: {
         llvm::Value *argVal = expr->GetValue(ctx);
-        return ctx->NotOperator(argVal, "bitnot");
+        return ctx->NotOperator(argVal, LLVMGetName(argVal, "_bitnot"));
     }
     default:
         FATAL("logic error");
@@ -1129,7 +1163,7 @@ UnaryExpr::Optimize() {
         return this;
 
     const Type *type = constExpr->GetType();
-    bool isEnumType = dynamic_cast<const EnumType *>(type) != NULL;
+    bool isEnumType = CastType<EnumType>(type) != NULL;
 
     const Type *baseType = type->GetAsNonConstType()->GetAsUniformType();
     if (Type::Equal(baseType, AtomicType::UniformInt8) ||
@@ -1183,7 +1217,7 @@ UnaryExpr::Optimize() {
             FATAL("unexpected type in UnaryExpr::Optimize() / BitNot case");
     }
     case LogicalNot: {
-        Assert(Type::EqualIgnoringConst(type, AtomicType::UniformBool) || 
+        AssertPos(pos, Type::EqualIgnoringConst(type, AtomicType::UniformBool) || 
                Type::EqualIgnoringConst(type, AtomicType::VaryingBool));
         bool v[ISPC_MAX_NVEC];
         int count = constExpr->AsBool(v);
@@ -1221,7 +1255,7 @@ UnaryExpr::TypeCheck() {
         if (type->IsNumericType())
             return this;
 
-        const PointerType *pt = dynamic_cast<const PointerType *>(type);
+        const PointerType *pt = CastType<PointerType>(type);
         if (pt == NULL) {
             Error(expr->pos, "Can only pre/post increment numeric and "
                   "pointer types, not \"%s\".", type->GetString().c_str());
@@ -1233,12 +1267,17 @@ UnaryExpr::TypeCheck() {
                   type->GetString().c_str());
             return NULL;
         }
+        if (CastType<UndefinedStructType>(pt->GetBaseType())) {
+            Error(expr->pos, "Illegal to pre/post increment pointer to "
+                  "undefined struct type \"%s\".", type->GetString().c_str());
+            return NULL;
+        }
 
         return this;
     }
 
     // don't do this for pre/post increment/decrement
-    if (dynamic_cast<const ReferenceType *>(type)) {
+    if (CastType<ReferenceType>(type)) {
         expr = new RefDerefExpr(expr, pos);
         type = expr->GetType();
     }
@@ -1362,7 +1401,7 @@ lEmitBinaryPointerArith(BinaryExpr::Op op, llvm::Value *value0,
                         llvm::Value *value1, const Type *type0,
                         const Type *type1, FunctionEmitContext *ctx,
                         SourcePos pos) {
-    const PointerType *ptrType = dynamic_cast<const PointerType *>(type0);
+    const PointerType *ptrType = CastType<PointerType>(type0);
 
     switch (op) {
     case BinaryExpr::Add:
@@ -1370,8 +1409,8 @@ lEmitBinaryPointerArith(BinaryExpr::Op op, llvm::Value *value0,
         return ctx->GetElementPtrInst(value0, value1, ptrType, "ptrmath");
         break;
     case BinaryExpr::Sub: {
-        if (dynamic_cast<const PointerType *>(type1) != NULL) {
-            Assert(Type::Equal(type0, type1));
+        if (CastType<PointerType>(type1) != NULL) {
+            AssertPos(pos, Type::Equal(type0, type1));
 
             if (ptrType->IsSlice()) {
                 llvm::Value *p0 = ctx->ExtractInst(value0, 0);
@@ -1382,7 +1421,7 @@ lEmitBinaryPointerArith(BinaryExpr::Op op, llvm::Value *value0,
                                             ctx, pos);
                
                 int soaWidth = ptrType->GetBaseType()->GetSOAWidth();
-                Assert(soaWidth > 0);
+                AssertPos(pos, soaWidth > 0);
                 llvm::Value *soaScale = LLVMIntAsType(soaWidth, 
                                                       majorDelta->getType());
 
@@ -1414,7 +1453,7 @@ lEmitBinaryPointerArith(BinaryExpr::Op op, llvm::Value *value0,
 
             // Now divide by the size of the type that the pointer
             // points to in order to return the difference in elements.
-            LLVM_TYPE_CONST llvm::Type *llvmElementType = 
+            llvm::Type *llvmElementType = 
                 ptrType->GetBaseType()->LLVMType(g->ctx);
             llvm::Value *size = g->target.SizeOf(llvmElementType, 
                                                  ctx->GetCurrentBasicBlock());
@@ -1464,29 +1503,34 @@ static llvm::Value *
 lEmitBinaryArith(BinaryExpr::Op op, llvm::Value *value0, llvm::Value *value1,
                  const Type *type0, const Type *type1,
                  FunctionEmitContext *ctx, SourcePos pos) {
-    const PointerType *ptrType = dynamic_cast<const PointerType *>(type0);
+    const PointerType *ptrType = CastType<PointerType>(type0);
 
     if (ptrType != NULL)
         return lEmitBinaryPointerArith(op, value0, value1, type0, type1,
                                        ctx, pos);
     else {
-        Assert(Type::EqualIgnoringConst(type0, type1));
+        AssertPos(pos, Type::EqualIgnoringConst(type0, type1));
 
         llvm::Instruction::BinaryOps inst;
         bool isFloatOp = type0->IsFloatType();
         bool isUnsignedOp = type0->IsUnsignedType();
 
+        const char *opName = NULL;
         switch (op) {
         case BinaryExpr::Add:
+            opName = "add";
             inst = isFloatOp ? llvm::Instruction::FAdd : llvm::Instruction::Add;
             break;
         case BinaryExpr::Sub:
+            opName = "sub";
             inst = isFloatOp ? llvm::Instruction::FSub : llvm::Instruction::Sub;
             break;
         case BinaryExpr::Mul:
+            opName = "mul";
             inst = isFloatOp ? llvm::Instruction::FMul : llvm::Instruction::Mul;
             break;
         case BinaryExpr::Div:
+            opName = "div";
             if (type0->IsVaryingType() && !isFloatOp)
                 PerformanceWarning(pos, "Division with varying integer types is "
                                    "very inefficient."); 
@@ -1494,6 +1538,7 @@ lEmitBinaryArith(BinaryExpr::Op op, llvm::Value *value0, llvm::Value *value1,
                 (isUnsignedOp ? llvm::Instruction::UDiv : llvm::Instruction::SDiv);
             break;
         case BinaryExpr::Mod:
+            opName = "mod";
             if (type0->IsVaryingType() && !isFloatOp)
                 PerformanceWarning(pos, "Modulus operator with varying types is "
                                    "very inefficient."); 
@@ -1505,7 +1550,7 @@ lEmitBinaryArith(BinaryExpr::Op op, llvm::Value *value0, llvm::Value *value1,
             return NULL;
         }
 
-        return ctx->BinaryOperator(inst, value0, value1, "binop");
+        return ctx->BinaryOperator(inst, value0, value1, LLVMGetName(opName, value0, value1));
     }
 }
 
@@ -1520,27 +1565,34 @@ lEmitBinaryCmp(BinaryExpr::Op op, llvm::Value *e0Val, llvm::Value *e1Val,
     bool isUnsignedOp = type->IsUnsignedType();
 
     llvm::CmpInst::Predicate pred;
+    const char *opName = NULL;
     switch (op) {
     case BinaryExpr::Lt:
+        opName = "less";
         pred = isFloatOp ? llvm::CmpInst::FCMP_OLT : 
             (isUnsignedOp ? llvm::CmpInst::ICMP_ULT : llvm::CmpInst::ICMP_SLT);
         break;
     case BinaryExpr::Gt:
+        opName = "greater";
         pred = isFloatOp ? llvm::CmpInst::FCMP_OGT : 
             (isUnsignedOp ? llvm::CmpInst::ICMP_UGT : llvm::CmpInst::ICMP_SGT);
         break;
     case BinaryExpr::Le:
+        opName = "lessequal";
         pred = isFloatOp ? llvm::CmpInst::FCMP_OLE : 
             (isUnsignedOp ? llvm::CmpInst::ICMP_ULE : llvm::CmpInst::ICMP_SLE);
         break;
     case BinaryExpr::Ge:
+        opName = "greaterequal";
         pred = isFloatOp ? llvm::CmpInst::FCMP_OGE : 
             (isUnsignedOp ? llvm::CmpInst::ICMP_UGE : llvm::CmpInst::ICMP_SGE);
         break;
     case BinaryExpr::Equal:
+        opName = "equal";
         pred = isFloatOp ? llvm::CmpInst::FCMP_OEQ : llvm::CmpInst::ICMP_EQ;
         break;
     case BinaryExpr::NotEqual:
+        opName = "notequal";
         pred = isFloatOp ? llvm::CmpInst::FCMP_ONE : llvm::CmpInst::ICMP_NE;
         break;
     default:
@@ -1550,7 +1602,8 @@ lEmitBinaryCmp(BinaryExpr::Op op, llvm::Value *e0Val, llvm::Value *e1Val,
 
     llvm::Value *cmp = ctx->CmpInst(isFloatOp ? llvm::Instruction::FCmp : 
                                     llvm::Instruction::ICmp,
-                                    pred, e0Val, e1Val, "bincmp");
+                                    pred, e0Val, e1Val, 
+                                    LLVMGetName(opName, e0Val, e1Val));
     // This is a little ugly: CmpInst returns i1 values, but we use vectors
     // of i32s for varying bool values; type convert the result here if
     // needed.
@@ -1579,7 +1632,7 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
 
     const Type *type0 = arg0->GetType(), *type1 = arg1->GetType();
     if (type0 == NULL || type1 == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
@@ -1590,24 +1643,24 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
     // FIXME: not sure what we should do about vector types here...
     bool shortCircuit = (EstimateCost(arg1) > PREDICATE_SAFE_IF_STATEMENT_COST ||
                          SafeToRunWithMaskAllOff(arg1) == false ||
-                         dynamic_cast<const VectorType *>(type0) != NULL ||
-                         dynamic_cast<const VectorType *>(type1) != NULL);
+                         CastType<VectorType>(type0) != NULL ||
+                         CastType<VectorType>(type1) != NULL);
     if (shortCircuit == false) {
         // If one of the operands is uniform but the other is varying,
         // promote the uniform one to varying
         if (type0->IsUniformType() && type1->IsVaryingType()) {
             arg0 = TypeConvertExpr(arg0, AtomicType::VaryingBool, lOpString(op));
-            Assert(arg0 != NULL);
+            AssertPos(pos, arg0 != NULL);
         }
         if (type1->IsUniformType() && type0->IsVaryingType()) {
             arg1 = TypeConvertExpr(arg1, AtomicType::VaryingBool, lOpString(op));
-            Assert(arg1 != NULL);
+            AssertPos(pos, arg1 != NULL);
         }
 
         llvm::Value *value0 = arg0->GetValue(ctx);
         llvm::Value *value1 = arg1->GetValue(ctx);
         if (value0 == NULL || value1 == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
 
@@ -1615,7 +1668,7 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
             return ctx->BinaryOperator(llvm::Instruction::And, value0, value1,
                                        "logical_and");
         else {
-            Assert(op == BinaryExpr::LogicalOr);
+            AssertPos(pos, op == BinaryExpr::LogicalOr);
             return ctx->BinaryOperator(llvm::Instruction::Or, value0, value1, 
                                        "logical_or");
         }
@@ -1623,7 +1676,7 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
 
     // Allocate temporary storage for the return value
     const Type *retType = Type::MoreGeneralType(type0, type1, pos, lOpString(op));
-    LLVM_TYPE_CONST llvm::Type *llvmRetType = retType->LLVMType(g->ctx);
+    llvm::Type *llvmRetType = retType->LLVMType(g->ctx);
     llvm::Value *retPtr = ctx->AllocaInst(llvmRetType, "logical_op_mem");
 
     llvm::BasicBlock *bbSkipEvalValue1 = ctx->CreateBasicBlock("skip_eval_1");
@@ -1633,7 +1686,7 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
     // Evaluate the first operand
     llvm::Value *value0 = arg0->GetValue(ctx);
     if (value0 == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
@@ -1657,7 +1710,7 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
             ctx->BranchInst(bbLogicalDone);
         }
         else {
-            Assert(op == BinaryExpr::LogicalAnd);
+            AssertPos(pos, op == BinaryExpr::LogicalAnd);
 
             // Conversely, for &&, if value0 is false, we skip evaluating
             // value1.
@@ -1679,12 +1732,12 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
         ctx->SetCurrentBasicBlock(bbEvalValue1);
         if (type1->IsUniformType() && retType->IsVaryingType()) {
             arg1 = TypeConvertExpr(arg1, AtomicType::VaryingBool, "logical op");
-            Assert(arg1 != NULL);
+            AssertPos(pos, arg1 != NULL);
         }
 
         llvm::Value *value1 = arg1->GetValue(ctx);
         if (value1 == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
         ctx->StoreInst(value1, retPtr);
@@ -1706,7 +1759,7 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
         // perform logical vector ops with its value.
         if (type1->IsUniformType()) {
             arg1 = TypeConvertExpr(arg1, AtomicType::VaryingBool, "logical op");
-            Assert(arg1 != NULL);
+            AssertPos(pos, arg1 != NULL);
             type1 = arg1->GetType();
         }
 
@@ -1741,7 +1794,7 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
 
             llvm::Value *value1 = arg1->GetValue(ctx);
             if (value1 == NULL) {
-                Assert(m->errorCount > 0);
+                AssertPos(pos, m->errorCount > 0);
                 return NULL;
             }
 
@@ -1759,7 +1812,7 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
             ctx->BranchInst(bbLogicalDone);
         }
         else {
-            Assert(op == BinaryExpr::LogicalAnd);
+            AssertPos(pos, op == BinaryExpr::LogicalAnd);
 
             // If value0 is false for all currently running lanes, the
             // overall result must be false: this corresponds to checking
@@ -1790,7 +1843,7 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
 
             llvm::Value *value1 = arg1->GetValue(ctx);
             if (value1 == NULL) {
-                Assert(m->errorCount > 0);
+                AssertPos(pos, m->errorCount > 0);
                 return NULL;
             }
 
@@ -1822,7 +1875,7 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
 llvm::Value *
 BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
     if (!arg0 || !arg1) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
@@ -1833,7 +1886,7 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
     llvm::Value *value0 = arg0->GetValue(ctx);
     llvm::Value *value1 = arg1->GetValue(ctx);
     if (value0 == NULL || value1 == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
@@ -1889,17 +1942,17 @@ BinaryExpr::GetType() const {
     // and will fail type checking and (int + ptr) should be canonicalized
     // into (ptr + int) by type checking.
     if (op == Add)
-        Assert(dynamic_cast<const PointerType *>(type1) == NULL);
+        AssertPos(pos, CastType<PointerType>(type1) == NULL);
 
     if (op == Comma)
         return arg1->GetType();
 
-    if (dynamic_cast<const PointerType *>(type0) != NULL) {
+    if (CastType<PointerType>(type0) != NULL) {
         if (op == Add)
             // ptr + int -> ptr
             return type0;
         else if (op == Sub) {
-            if (dynamic_cast<const PointerType *>(type1) != NULL) {
+            if (CastType<PointerType>(type1) != NULL) {
                 // ptr - ptr -> ~ptrdiff_t
                 const Type *diffType = (g->target.is32Bit || 
                                         g->opt.force32BitAddressing) ? 
@@ -1914,14 +1967,14 @@ BinaryExpr::GetType() const {
         }
 
         // otherwise fall through for these...
-        Assert(op == Lt || op == Gt || op == Le || op == Ge ||
+        AssertPos(pos, op == Lt || op == Gt || op == Le || op == Ge ||
                op == Equal || op == NotEqual);
     }
 
     const Type *exprType = Type::MoreGeneralType(type0, type1, pos, lOpString(op));
     // I don't think that MoreGeneralType should be able to fail after the
     // checks done in BinaryExpr::TypeCheck().
-    Assert(exprType != NULL);
+    AssertPos(pos, exprType != NULL);
 
     switch (op) {
     case Add:
@@ -2106,7 +2159,7 @@ BinaryExpr::Optimize() {
                 std::vector<Symbol *> rcpFuns;
                 m->symbolTable->LookupFunction("rcp", &rcpFuns);
                 if (rcpFuns.size() > 0) {
-                    Assert(rcpFuns.size() == 2);
+                    AssertPos(pos, rcpFuns.size() == 2);
                     Expr *rcpSymExpr = new FunctionSymbolExpr("rcp", rcpFuns, pos);
                     ExprList *args = new ExprList(arg1, arg1->pos);
                     Expr *rcpCall = new FunctionCallExpr(rcpSymExpr, args, 
@@ -2136,7 +2189,7 @@ BinaryExpr::Optimize() {
     if (constArg0 == NULL || constArg1 == NULL)
         return this;
 
-    Assert(Type::EqualIgnoringConst(arg0->GetType(), arg1->GetType()));
+    AssertPos(pos, Type::EqualIgnoringConst(arg0->GetType(), arg1->GetType()));
     const Type *type = arg0->GetType()->GetAsNonConstType();
     if (Type::Equal(type, AtomicType::UniformFloat) || 
         Type::Equal(type, AtomicType::VaryingFloat)) {
@@ -2181,7 +2234,7 @@ BinaryExpr::Optimize() {
     }
     else if (Type::Equal(type, AtomicType::UniformUInt32) || 
              Type::Equal(type, AtomicType::VaryingUInt32) ||
-             dynamic_cast<const EnumType *>(type) != NULL) {
+             CastType<EnumType>(type) != NULL) {
         uint32_t v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC];
         constArg0->AsUInt32(v0);
         constArg1->AsUInt32(v1);
@@ -2224,23 +2277,23 @@ BinaryExpr::TypeCheck() {
 
     // If either operand is a reference, dereference it before we move
     // forward
-    if (dynamic_cast<const ReferenceType *>(type0) != NULL) {
+    if (CastType<ReferenceType>(type0) != NULL) {
         arg0 = new RefDerefExpr(arg0, arg0->pos);
         type0 = arg0->GetType();
-        Assert(type0 != NULL);
+        AssertPos(pos, type0 != NULL);
     }
-    if (dynamic_cast<const ReferenceType *>(type1) != NULL) {
+    if (CastType<ReferenceType>(type1) != NULL) {
         arg1 = new RefDerefExpr(arg1, arg1->pos);
         type1 = arg1->GetType();
-        Assert(type1 != NULL);
+        AssertPos(pos, type1 != NULL);
     }
 
     // Convert arrays to pointers to their first elements
-    if (dynamic_cast<const ArrayType *>(type0) != NULL) {
+    if (CastType<ArrayType>(type0) != NULL) {
         arg0 = lArrayToPointer(arg0);
         type0 = arg0->GetType();
     }
-    if (dynamic_cast<const ArrayType *>(type1) != NULL) {
+    if (CastType<ArrayType>(type1) != NULL) {
         arg1 = lArrayToPointer(arg1);
         type1 = arg1->GetType();
     }
@@ -2257,8 +2310,8 @@ BinaryExpr::TypeCheck() {
         return NULL;
     }
 
-    const PointerType *pt0 = dynamic_cast<const PointerType *>(type0);
-    const PointerType *pt1 = dynamic_cast<const PointerType *>(type1);
+    const PointerType *pt0 = CastType<PointerType>(type0);
+    const PointerType *pt1 = CastType<PointerType>(type1);
     if (pt0 != NULL && pt1 != NULL && op == Sub) {
         // Pointer subtraction
         if (PointerType::IsVoidPointer(type0)) {
@@ -2271,6 +2324,16 @@ BinaryExpr::TypeCheck() {
                   "on \"%s\" type.", type1->GetString().c_str());
             return NULL;
         }
+        if (CastType<UndefinedStructType>(pt0->GetBaseType())) {
+            Error(pos, "Illegal to perform pointer arithmetic "
+                  "on undefined struct type \"%s\".", pt0->GetString().c_str());
+            return NULL;
+        }
+        if (CastType<UndefinedStructType>(pt1->GetBaseType())) {
+            Error(pos, "Illegal to perform pointer arithmetic "
+                  "on undefined struct type \"%s\".", pt1->GetString().c_str());
+            return NULL;
+        }
 
         const Type *t = Type::MoreGeneralType(type0, type1, pos, "-");
         if (t == NULL)
@@ -2299,13 +2362,18 @@ BinaryExpr::TypeCheck() {
             std::swap(pt0, pt1);
         }
 
-        Assert(pt0 != NULL);
+        AssertPos(pos, pt0 != NULL);
 
         if (PointerType::IsVoidPointer(pt0)) {
             Error(pos, "Illegal to perform pointer arithmetic "
                   "on \"%s\" type.", pt0->GetString().c_str());
             return NULL;
         }
+        if (CastType<UndefinedStructType>(pt0->GetBaseType())) {
+            Error(pos, "Illegal to perform pointer arithmetic "
+                  "on undefined struct type \"%s\".", pt0->GetString().c_str());
+            return NULL;
+        }
 
         const Type *offsetType = g->target.is32Bit ? 
             AtomicType::UniformInt32 : AtomicType::UniformInt64;
@@ -2314,7 +2382,8 @@ BinaryExpr::TypeCheck() {
         if (type1->IsVaryingType()) {
             arg0 = TypeConvertExpr(arg0, type0->GetAsVaryingType(), 
                                    "pointer addition");
-            Assert(arg0 != NULL);
+            offsetType = offsetType->GetAsVaryingType();
+            AssertPos(pos, arg0 != NULL);
         }
 
         arg1 = TypeConvertExpr(arg1, offsetType, lOpString(op));
@@ -2407,20 +2476,20 @@ BinaryExpr::TypeCheck() {
     case Ge:
     case Equal:
     case NotEqual: {
-        const PointerType *pt0 = dynamic_cast<const PointerType *>(type0);
-        const PointerType *pt1 = dynamic_cast<const PointerType *>(type1);
+        const PointerType *pt0 = CastType<PointerType>(type0);
+        const PointerType *pt1 = CastType<PointerType>(type1);
 
         // Convert '0' in expressions where the other expression is a
         // pointer type to a NULL pointer.
         if (pt0 != NULL && lIsAllIntZeros(arg1)) {
             arg1 = new NullPointerExpr(pos);
             type1 = arg1->GetType();
-            pt1 = dynamic_cast<const PointerType *>(type1);
+            pt1 = CastType<PointerType>(type1);
         }
         else if (pt1 != NULL && lIsAllIntZeros(arg0)) {
             arg0 = new NullPointerExpr(pos);
             type0 = arg1->GetType();
-            pt0 = dynamic_cast<const PointerType *>(type0);
+            pt0 = CastType<PointerType>(type0);
         }
 
         if (pt0 == NULL && pt1 == NULL) {
@@ -2462,8 +2531,8 @@ BinaryExpr::TypeCheck() {
             AtomicType::UniformBool : AtomicType::VaryingBool;
 
         const Type *destType0 = NULL, *destType1 = NULL;
-        const VectorType *vtype0 = dynamic_cast<const VectorType *>(type0);
-        const VectorType *vtype1 = dynamic_cast<const VectorType *>(type1);
+        const VectorType *vtype0 = CastType<VectorType>(type0);
+        const VectorType *vtype1 = CastType<VectorType>(type1);
         if (vtype0 && vtype1) {
             int sz0 = vtype0->GetElementCount(), sz1 = vtype1->GetElementCount();
             if (sz0 != sz1) {
@@ -2572,7 +2641,7 @@ lEmitOpAssign(AssignExpr::Op op, Expr *arg0, Expr *arg1, const Type *type,
     llvm::Value *rvalue = arg1->GetValue(ctx);
     ctx->SetDebugPos(pos);
     llvm::Value *mask = lMaskForSymbol(baseSym, ctx);
-    llvm::Value *oldLHS = ctx->LoadInst(lv, mask, lvalueType, "opassign_load");
+    llvm::Value *oldLHS = ctx->LoadInst(lv, mask, lvalueType);
 
     // Map the operator to the corresponding BinaryExpr::Op operator
     BinaryExpr::Op basicop;
@@ -2617,6 +2686,7 @@ lEmitOpAssign(AssignExpr::Op op, Expr *arg0, Expr *arg1, const Type *type,
     }
 
     // And store the result back to the lvalue.
+    ctx->SetDebugPos(arg0->pos);
     lStoreAssignResult(newValue, lv, resultType, lvalueType, ctx, baseSym);
 
     return newValue;
@@ -2651,17 +2721,17 @@ AssignExpr::GetValue(FunctionEmitContext *ctx) const {
         const Type *ptrType = lvalue->GetLValueType();
         const Type *valueType = rvalue->GetType();
         if (ptrType == NULL || valueType == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
 
         llvm::Value *value = rvalue->GetValue(ctx);
         if (value == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
 
-        ctx->SetDebugPos(pos);
+        ctx->SetDebugPos(lvalue->pos);
 
         lStoreAssignResult(value, ptr, valueType, ptrType, ctx, baseSym);
 
@@ -2678,8 +2748,8 @@ AssignExpr::GetValue(FunctionEmitContext *ctx) const {
     case XorAssign:
     case OrAssign: {
         // This should be caught during type checking
-        Assert(!dynamic_cast<const ArrayType *>(type) &&
-               !dynamic_cast<const StructType *>(type));
+        AssertPos(pos, !CastType<ArrayType>(type) &&
+               !CastType<StructType>(type));
         return lEmitOpAssign(op, lvalue, rvalue, type, baseSym, pos, ctx);
     }
     default:
@@ -2726,7 +2796,7 @@ lCheckForConstStructMember(SourcePos pos, const StructType *structType,
             return true;
         }
 
-        const StructType *st = dynamic_cast<const StructType *>(t);
+        const StructType *st = CastType<StructType>(t);
         if (st != NULL && lCheckForConstStructMember(pos, st, initialType))
             return true;
     }
@@ -2740,7 +2810,7 @@ AssignExpr::TypeCheck() {
         return NULL;
 
     bool lvalueIsReference = 
-        dynamic_cast<const ReferenceType *>(lvalue->GetType()) != NULL;
+        CastType<ReferenceType>(lvalue->GetType()) != NULL;
     if (lvalueIsReference)
         lvalue = new RefDerefExpr(lvalue, lvalue->pos);
 
@@ -2751,8 +2821,8 @@ AssignExpr::TypeCheck() {
         // function is overloaded.
         const Type *lvalueType = lvalue->GetType();
         const FunctionType *ftype;
-        if (dynamic_cast<const PointerType *>(lvalueType) == NULL ||
-            (ftype = dynamic_cast<const FunctionType *>(lvalueType->GetBaseType())) == NULL) {
+        if (CastType<PointerType>(lvalueType) == NULL ||
+            (ftype = CastType<FunctionType>(lvalueType->GetBaseType())) == NULL) {
             Error(lvalue->pos, "Can't assign function pointer to type \"%s\".",
                   lvalue->GetType()->GetString().c_str());
             return NULL;
@@ -2771,7 +2841,7 @@ AssignExpr::TypeCheck() {
 
     const Type *lhsType = lvalue->GetType();
     if (lhsType == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
@@ -2781,7 +2851,7 @@ AssignExpr::TypeCheck() {
         return NULL;
     }
 
-    if (dynamic_cast<const PointerType *>(lhsType) != NULL) {
+    if (CastType<PointerType>(lhsType) != NULL) {
         if (op == AddAssign || op == SubAssign) {
             if (PointerType::IsVoidPointer(lhsType)) {
                 Error(pos, "Illegal to perform pointer arithmetic on \"%s\" "
@@ -2803,7 +2873,7 @@ AssignExpr::TypeCheck() {
             return NULL;
         }
     }
-    else if (dynamic_cast<const ArrayType *>(lhsType) != NULL) {
+    else if (CastType<ArrayType>(lhsType) != NULL) {
         Error(lvalue->pos, "Illegal to assign to array type \"%s\".",
               lhsType->GetString().c_str());
         return NULL;
@@ -2823,7 +2893,7 @@ AssignExpr::TypeCheck() {
     }
 
     // Make sure we're not assigning to a struct that has a constant member
-    const StructType *st = dynamic_cast<const StructType *>(lhsType);
+    const StructType *st = CastType<StructType>(lhsType);
     if (st != NULL && lCheckForConstStructMember(pos, st, st))
         return NULL;
 
@@ -2922,7 +2992,7 @@ SelectExpr::GetValue(FunctionEmitContext *ctx) const {
 
     const Type *testType = test->GetType()->GetAsNonConstType();
     // This should be taken care of during typechecking
-    Assert(Type::Equal(testType->GetBaseType(), AtomicType::UniformBool) ||
+    AssertPos(pos, Type::Equal(testType->GetBaseType(), AtomicType::UniformBool) ||
            Type::Equal(testType->GetBaseType(), AtomicType::VaryingBool));
 
     const Type *type = expr1->GetType();
@@ -2959,10 +3029,10 @@ SelectExpr::GetValue(FunctionEmitContext *ctx) const {
         ret->addIncoming(expr2Val, falsePred);
         return ret;
     }
-    else if (dynamic_cast<const VectorType *>(testType) == NULL) {
+    else if (CastType<VectorType>(testType) == NULL) {
         // the test is a varying bool type
         llvm::Value *testVal = test->GetValue(ctx);
-        Assert(testVal->getType() == LLVMTypes::MaskType);
+        AssertPos(pos, testVal->getType() == LLVMTypes::MaskType);
         llvm::Value *oldMask = ctx->GetInternalMask();
         llvm::Value *fullMask = ctx->GetFullMask();
 
@@ -2984,7 +3054,7 @@ SelectExpr::GetValue(FunctionEmitContext *ctx) const {
         // Temporary storage to store the values computed for each
         // expression, if any.  (These stay as uninitialized memory if we
         // short circuit around the corresponding expression.)
-        LLVM_TYPE_CONST llvm::Type *exprType = 
+        llvm::Type *exprType = 
             expr1->GetType()->LLVMType(g->ctx);
         llvm::Value *expr1Ptr = ctx->AllocaInst(exprType);
         llvm::Value *expr2Ptr = ctx->AllocaInst(exprType);
@@ -3023,11 +3093,11 @@ SelectExpr::GetValue(FunctionEmitContext *ctx) const {
         llvm::Value *expr2Val = expr2->GetValue(ctx);
 
         ctx->SetDebugPos(pos);
-        const VectorType *vt = dynamic_cast<const VectorType *>(type);
+        const VectorType *vt = CastType<VectorType>(type);
         // Things that typechecking should have caught
-        Assert(vt != NULL);
-        Assert(dynamic_cast<const VectorType *>(testType) != NULL &&
-               (dynamic_cast<const VectorType *>(testType)->GetElementCount() == 
+        AssertPos(pos, vt != NULL);
+        AssertPos(pos, CastType<VectorType>(testType) != NULL &&
+               (CastType<VectorType>(testType)->GetElementCount() == 
                 vt->GetElementCount()));
 
         // Do an element-wise select  
@@ -3063,11 +3133,11 @@ SelectExpr::GetType() const {
     bool becomesVarying = (testType->IsVaryingType() || expr1Type->IsVaryingType() ||
                            expr2Type->IsVaryingType());
     // if expr1 and expr2 have different vector sizes, typechecking should fail...
-    int testVecSize = dynamic_cast<const VectorType *>(testType) != NULL ?
-        dynamic_cast<const VectorType *>(testType)->GetElementCount() : 0;
-    int expr1VecSize = dynamic_cast<const VectorType *>(expr1Type) != NULL ?
-        dynamic_cast<const VectorType *>(expr1Type)->GetElementCount() : 0;
-    Assert(!(testVecSize != 0 && expr1VecSize != 0 && testVecSize != expr1VecSize));
+    int testVecSize = CastType<VectorType>(testType) != NULL ?
+        CastType<VectorType>(testType)->GetElementCount() : 0;
+    int expr1VecSize = CastType<VectorType>(expr1Type) != NULL ?
+        CastType<VectorType>(expr1Type)->GetElementCount() : 0;
+    AssertPos(pos, !(testVecSize != 0 && expr1VecSize != 0 && testVecSize != expr1VecSize));
     
     int vectorSize = std::max(testVecSize, expr1VecSize);
     return Type::MoreGeneralType(expr1Type, expr2Type, Union(expr1->pos, expr2->pos),
@@ -3112,9 +3182,9 @@ SelectExpr::Optimize() {
         if (constExpr1 == NULL || constExpr2 == NULL)
             return this;
 
-        Assert(Type::Equal(constExpr1->GetType(), constExpr2->GetType()));
+        AssertPos(pos, Type::Equal(constExpr1->GetType(), constExpr2->GetType()));
         const Type *exprType = constExpr1->GetType()->GetAsNonConstType();
-        Assert(exprType->IsVaryingType());
+        AssertPos(pos, exprType->IsVaryingType());
 
         // FIXME: it's annoying to have to have all of this replicated code.
         if (Type::Equal(exprType, AtomicType::VaryingInt32) ||
@@ -3179,12 +3249,12 @@ SelectExpr::TypeCheck() {
     if (!type1 || !type2)
         return NULL;
 
-    if (dynamic_cast<const ArrayType *>(type1)) {
+    if (CastType<ArrayType>(type1)) {
         Error(pos, "Array type \"%s\" can't be used in select expression", 
               type1->GetString().c_str());
         return NULL;
     }
-    if (dynamic_cast<const ArrayType *>(type2)) {
+    if (CastType<ArrayType>(type2)) {
         Error(pos, "Array type \"%s\" can't be used in select expression", 
               type2->GetString().c_str());
         return NULL;
@@ -3198,8 +3268,8 @@ SelectExpr::TypeCheck() {
         return NULL;
     testType = test->GetType();
 
-    int testVecSize = dynamic_cast<const VectorType *>(testType) ?
-        dynamic_cast<const VectorType *>(testType)->GetElementCount() : 0;
+    int testVecSize = CastType<VectorType>(testType) ?
+        CastType<VectorType>(testType)->GetElementCount() : 0;
     const Type *promotedType = 
         Type::MoreGeneralType(type1, type2, Union(expr1->pos, expr2->pos),
                               "select expression", testType->IsVaryingType(), testVecSize);
@@ -3258,11 +3328,11 @@ lGetFunctionType(Expr *func) {
     if (type == NULL)
         return NULL;
 
-    const FunctionType *ftype = dynamic_cast<const FunctionType *>(type);
+    const FunctionType *ftype = CastType<FunctionType>(type);
     if (ftype == NULL) {
         // Not a regular function symbol--is it a function pointer?
-        if (dynamic_cast<const PointerType *>(type) != NULL)
-            ftype = dynamic_cast<const FunctionType *>(type->GetBaseType());
+        if (CastType<PointerType>(type) != NULL)
+            ftype = CastType<FunctionType>(type->GetBaseType());
     }
     return ftype;
 }
@@ -3278,12 +3348,12 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
     llvm::Value *callee = func->GetValue(ctx);
 
     if (callee == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
     const FunctionType *ft = lGetFunctionType(func);
-    Assert(ft != NULL);
+    AssertPos(pos, ft != NULL);
     bool isVoidFunc = Type::Equal(ft->GetReturnType(), AtomicType::Void);
 
     // Automatically convert function call args to references if needed.
@@ -3296,7 +3366,7 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
     // Specifically, this can happen if there's an error earlier during
     // overload resolution.
     if ((int)callargs.size() > ft->GetNumParameters()) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
@@ -3309,9 +3379,9 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
 
         const Type *argLValueType = argExpr->GetLValueType();
         if (argLValueType != NULL &&
-            dynamic_cast<const PointerType *>(argLValueType) != NULL &&
+            CastType<PointerType>(argLValueType) != NULL &&
             argLValueType->IsVaryingType() &&
-            dynamic_cast<const ReferenceType *>(paramType) != NULL) {
+            CastType<ReferenceType>(paramType) != NULL) {
             Error(argExpr->pos, "Illegal to pass a \"varying\" lvalue to a "
                   "reference parameter of type \"%s\".",
                   paramType->GetString().c_str());
@@ -3363,7 +3433,7 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
     llvm::Value *retVal = NULL;
     ctx->SetDebugPos(pos);
     if (ft->isTask) {
-        Assert(launchCountExpr != NULL);
+        AssertPos(pos, launchCountExpr != NULL);
         llvm::Value *launchCount = launchCountExpr->GetValue(ctx);
         if (launchCount != NULL)
             ctx->LaunchInst(callee, argVals, launchCount);
@@ -3400,32 +3470,41 @@ FunctionCallExpr::TypeCheck() {
         return NULL;
 
     std::vector<const Type *> argTypes;
-    std::vector<bool> argCouldBeNULL;
+    std::vector<bool> argCouldBeNULL, argIsConstant;
     for (unsigned int i = 0; i < args->exprs.size(); ++i) {
-        if (args->exprs[i] == NULL)
+        Expr *expr = args->exprs[i];
+
+        if (expr == NULL)
             return NULL;
-        const Type *t = args->exprs[i]->GetType();
+        const Type *t = expr->GetType();
         if (t == NULL)
             return NULL;
+
         argTypes.push_back(t);
-        argCouldBeNULL.push_back(lIsAllIntZeros(args->exprs[i]));
+        argCouldBeNULL.push_back(lIsAllIntZeros(expr) ||
+                                 dynamic_cast<NullPointerExpr *>(expr));
+        argIsConstant.push_back(dynamic_cast<ConstExpr *>(expr) ||
+                                dynamic_cast<NullPointerExpr *>(expr));
     }
 
     FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
     if (fse != NULL) {
         // Regular function call
-
-        if (fse->ResolveOverloads(args->pos, argTypes, &argCouldBeNULL) == false)
+        if (fse->ResolveOverloads(args->pos, argTypes, &argCouldBeNULL,
+                                  &argIsConstant) == false)
             return NULL;
 
         func = ::TypeCheck(fse);
         if (func == NULL)
             return NULL;
 
-        const PointerType *pt = 
-            dynamic_cast<const PointerType *>(func->GetType());
-        const FunctionType *ft = (pt == NULL) ? NULL : 
-            dynamic_cast<const FunctionType *>(pt->GetBaseType());
+        const FunctionType *ft = CastType<FunctionType>(func->GetType());
+        if (ft == NULL) {
+            const PointerType *pt = CastType<PointerType>(func->GetType());
+            ft = (pt == NULL) ? NULL : 
+                CastType<FunctionType>(pt->GetBaseType());
+        }
+
         if (ft == NULL) {
             Error(pos, "Valid function name must be used for function call.");
             return NULL;
@@ -3448,7 +3527,7 @@ FunctionCallExpr::TypeCheck() {
             if (isLaunch)
                 Error(pos, "\"launch\" expression illegal with non-\"task\"-"
                       "qualified function.");
-            Assert(launchCountExpr == NULL);
+            AssertPos(pos, launchCountExpr == NULL);
         }
     }
     else {
@@ -3459,8 +3538,8 @@ FunctionCallExpr::TypeCheck() {
 
         // Make sure we do in fact have a function to call
         const FunctionType *funcType;
-        if (dynamic_cast<const PointerType *>(fptrType) == NULL ||
-            (funcType = dynamic_cast<const FunctionType *>(fptrType->GetBaseType())) == NULL) {
+        if (CastType<PointerType>(fptrType) == NULL ||
+            (funcType = CastType<FunctionType>(fptrType->GetBaseType())) == NULL) {
             Error(func->pos, "Must provide function name or function pointer for "
                   "function call expression.");
             return NULL;
@@ -3492,7 +3571,7 @@ FunctionCallExpr::TypeCheck() {
                 const Type *paramType = funcType->GetParameterType(i);
                 if (CanConvertTypes(argTypes[i], paramType) == false &&
                     !(argCouldBeNULL[i] == true &&
-                      dynamic_cast<const PointerType *>(paramType) != NULL)) {
+                      CastType<PointerType>(paramType) != NULL)) {
                     Error(args->exprs[i]->pos, "Can't convert argument of "
                           "type \"%s\" to type \"%s\" for function call "
                           "argument.", argTypes[i]->GetString().c_str(),
@@ -3504,14 +3583,18 @@ FunctionCallExpr::TypeCheck() {
                 // Otherwise the parameter default saves us.  It should
                 // be there for sure, given the check right above the
                 // for loop.
-                Assert(funcType->GetParameterDefault(i) != NULL);
+                AssertPos(pos, funcType->GetParameterDefault(i) != NULL);
         }
 
-        if (fptrType->IsVaryingType() && 
-            funcType->GetReturnType()->IsUniformType()) {
-            Error(pos, "Illegal to call a varying function pointer that "
-                  "points to a function with a uniform return type.");
-            return NULL;
+        if (fptrType->IsVaryingType()) {
+            const Type *retType = funcType->GetReturnType();
+            if (Type::Equal(retType, AtomicType::Void) == false &&
+                retType->IsUniformType()) {
+                Error(pos, "Illegal to call a varying function pointer that "
+                      "points to a function with a uniform return type \"%s\".",
+                      funcType->GetReturnType()->GetString().c_str());
+                return NULL;
+            }
         }
     }
 
@@ -3530,10 +3613,10 @@ FunctionCallExpr::EstimateCost() const {
     if (type == NULL)
         return 0;
 
-    const PointerType *pt = dynamic_cast<const PointerType *>(type);
+    const PointerType *pt = CastType<PointerType>(type);
     if (pt != NULL)
         type = type->GetBaseType();
-    const FunctionType *ftype = dynamic_cast<const FunctionType *>(type);
+    const FunctionType *ftype = CastType<FunctionType>(type);
 
     if (ftype->costOverride > -1)
         return ftype->costOverride;
@@ -3592,22 +3675,21 @@ ExprList::TypeCheck() {
 llvm::Constant *
 ExprList::GetConstant(const Type *type) const {
     if (exprs.size() == 1 &&
-        (dynamic_cast<const AtomicType *>(type) != NULL ||
-         dynamic_cast<const EnumType *>(type) != NULL ||
-         dynamic_cast<const PointerType *>(type) != NULL))
+        (CastType<AtomicType>(type) != NULL ||
+         CastType<EnumType>(type) != NULL ||
+         CastType<PointerType>(type) != NULL))
         return exprs[0]->GetConstant(type);
 
-    const CollectionType *collectionType = 
-        dynamic_cast<const CollectionType *>(type);
+    const CollectionType *collectionType = CastType<CollectionType>(type);
     if (collectionType == NULL)
         return NULL;
 
     std::string name;
-    if (dynamic_cast<const StructType *>(type) != NULL)
+    if (CastType<StructType>(type) != NULL)
         name = "struct";
-    else if (dynamic_cast<const ArrayType *>(type) != NULL) 
+    else if (CastType<ArrayType>(type) != NULL) 
         name = "array";
-    else if (dynamic_cast<const VectorType *>(type) != NULL) 
+    else if (CastType<VectorType>(type) != NULL) 
         name = "vector";
     else 
         FATAL("Unexpected CollectionType in ExprList::GetConstant()");
@@ -3625,7 +3707,22 @@ ExprList::GetConstant(const Type *type) const {
         if (exprs[i] == NULL)
             return NULL;
         const Type *elementType = collectionType->GetElementType(i);
-        llvm::Constant *c = exprs[i]->GetConstant(elementType);
+
+        Expr *expr = exprs[i];
+        if (dynamic_cast<ExprList *>(expr) == NULL) {
+            // If there's a simple type conversion from the type of this
+            // expression to the type we need, then let the regular type
+            // conversion machinery handle it.
+            expr = TypeConvertExpr(exprs[i], elementType, "initializer list");
+            if (expr == NULL) {
+                AssertPos(pos, m->errorCount > 0);
+                return NULL;
+            }
+            // Re-establish const-ness if possible
+            expr = ::Optimize(expr);
+        }
+
+        llvm::Constant *c = expr->GetConstant(elementType);
         if (c == NULL)
             // If this list element couldn't convert to the right constant
             // type for the corresponding collection member, then give up.
@@ -3637,13 +3734,13 @@ ExprList::GetConstant(const Type *type) const {
     for (int i = (int)exprs.size(); i < collectionType->GetElementCount(); ++i) {
         const Type *elementType = collectionType->GetElementType(i);
         if (elementType == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
 
-        LLVM_TYPE_CONST llvm::Type *llvmType = elementType->LLVMType(g->ctx);
+        llvm::Type *llvmType = elementType->LLVMType(g->ctx);
         if (llvmType == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
 
@@ -3651,29 +3748,25 @@ ExprList::GetConstant(const Type *type) const {
         cv.push_back(c);
     }
 
-    if (dynamic_cast<const StructType *>(type) != NULL) {
-#if defined(LLVM_2_9)
-        return llvm::ConstantStruct::get(*g->ctx, cv, false);
-#else
-        LLVM_TYPE_CONST llvm::StructType *llvmStructType =
-            llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(collectionType->LLVMType(g->ctx));
-        Assert(llvmStructType != NULL);
+    if (CastType<StructType>(type) != NULL) {
+        llvm::StructType *llvmStructType =
+            llvm::dyn_cast<llvm::StructType>(collectionType->LLVMType(g->ctx));
+        AssertPos(pos, llvmStructType != NULL);
         return llvm::ConstantStruct::get(llvmStructType, cv);
-#endif
     }
     else {
-        LLVM_TYPE_CONST llvm::Type *lt = type->LLVMType(g->ctx);
-        LLVM_TYPE_CONST llvm::ArrayType *lat = 
-            llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(lt);
+        llvm::Type *lt = type->LLVMType(g->ctx);
+        llvm::ArrayType *lat = 
+            llvm::dyn_cast<llvm::ArrayType>(lt);
         if (lat != NULL)
             return llvm::ConstantArray::get(lat, cv);
         else {
             // uniform short vector type
-            Assert(type->IsUniformType() &&
-                   dynamic_cast<const VectorType *>(type) != NULL);
-            LLVM_TYPE_CONST llvm::VectorType *lvt = 
-                llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(lt);
-            Assert(lvt != NULL);
+            AssertPos(pos, type->IsUniformType() &&
+                   CastType<VectorType>(type) != NULL);
+            llvm::VectorType *lvt = 
+                llvm::dyn_cast<llvm::VectorType>(lt);
+            AssertPos(pos, lvt != NULL);
 
             // Uniform short vectors are stored as vectors of length
             // rounded up to the native vector width.  So we add additional
@@ -3713,6 +3806,7 @@ IndexExpr::IndexExpr(Expr *a, Expr *i, SourcePos p)
     : Expr(p) {
     baseExpr = a;
     index = i;
+    type = lvalueType = NULL;
 }
 
 
@@ -3739,11 +3833,11 @@ IndexExpr::IndexExpr(Expr *a, Expr *i, SourcePos p)
 static llvm::Value *
 lAddVaryingOffsetsIfNeeded(FunctionEmitContext *ctx, llvm::Value *ptr, 
                            const Type *ptrRefType) {
-    if (dynamic_cast<const ReferenceType *>(ptrRefType) != NULL)
+    if (CastType<ReferenceType>(ptrRefType) != NULL)
         // References are uniform pointers, so no offsetting is needed
         return ptr;
 
-    const PointerType *ptrType = dynamic_cast<const PointerType *>(ptrRefType);
+    const PointerType *ptrType = CastType<PointerType>(ptrRefType);
     Assert(ptrType != NULL);
     if (ptrType->IsUniformType() || ptrType->IsSlice())
         return ptr;
@@ -3779,21 +3873,21 @@ lAddVaryingOffsetsIfNeeded(FunctionEmitContext *ctx, llvm::Value *ptr,
  */
 static bool
 lVaryingStructHasUniformMember(const Type *type, SourcePos pos) {
-    if (dynamic_cast<const VectorType *>(type) != NULL ||
-        dynamic_cast<const ReferenceType *>(type) != NULL)
+    if (CastType<VectorType>(type) != NULL ||
+        CastType<ReferenceType>(type) != NULL)
         return false;
 
-    const StructType *st = dynamic_cast<const StructType *>(type);
+    const StructType *st = CastType<StructType>(type);
     if (st == NULL) {
-        const ArrayType *at = dynamic_cast<const ArrayType *>(type);
+        const ArrayType *at = CastType<ArrayType>(type);
         if (at != NULL)
-            st = dynamic_cast<const StructType *>(at->GetElementType());
+            st = CastType<StructType>(at->GetElementType());
         else {
-            const PointerType *pt = dynamic_cast<const PointerType *>(type);
+            const PointerType *pt = CastType<PointerType>(type);
             if (pt == NULL)
                 return false;
 
-            st = dynamic_cast<const StructType *>(pt->GetBaseType());
+            st = CastType<StructType>(pt->GetBaseType());
         }
 
         if (st == NULL)
@@ -3806,11 +3900,11 @@ lVaryingStructHasUniformMember(const Type *type, SourcePos pos) {
     for (int i = 0; i < st->GetElementCount(); ++i) {
         const Type *eltType = st->GetElementType(i);
         if (eltType == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             continue;
         }
 
-        if (dynamic_cast<const StructType *>(eltType) != NULL) {
+        if (CastType<StructType>(eltType) != NULL) {
             // We know that the enclosing struct is varying at this point,
             // so push that down to the enclosed struct before makign the
             // recursive call.
@@ -3838,7 +3932,7 @@ IndexExpr::GetValue(FunctionEmitContext *ctx) const {
     if (baseExpr == NULL || index == NULL || 
         ((indexType = index->GetType()) == NULL) ||
         ((returnType = GetType()) == NULL)) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
@@ -3853,7 +3947,7 @@ IndexExpr::GetValue(FunctionEmitContext *ctx) const {
 
     llvm::Value *ptr = GetLValue(ctx);
     llvm::Value *mask = NULL;
-    const Type *lvalueType = GetLValueType();
+    const Type *lvType = GetLValueType();
     if (ptr == NULL) {
         // We may be indexing into a temporary that hasn't hit memory, so
         // get the full value and stuff it into temporary alloca'd space so
@@ -3861,7 +3955,7 @@ IndexExpr::GetValue(FunctionEmitContext *ctx) const {
         const Type *baseExprType = baseExpr->GetType();
         llvm::Value *val = baseExpr->GetValue(ctx);
         if (baseExprType == NULL || val == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
         ctx->SetDebugPos(pos);
@@ -3870,31 +3964,33 @@ IndexExpr::GetValue(FunctionEmitContext *ctx) const {
         ctx->StoreInst(val, tmpPtr);
 
         // Get a pointer type to the underlying elements
-        const SequentialType *st = 
-            dynamic_cast<const SequentialType *>(baseExprType);
-        Assert(st != NULL);
-        lvalueType = PointerType::GetUniform(st->GetElementType());
+        const SequentialType *st = CastType<SequentialType>(baseExprType);
+        AssertPos(pos, st != NULL);
+        lvType = PointerType::GetUniform(st->GetElementType());
 
         // And do the indexing calculation into the temporary array in memory
         ptr = ctx->GetElementPtrInst(tmpPtr, LLVMInt32(0), index->GetValue(ctx), 
                                      PointerType::GetUniform(baseExprType));
-        ptr = lAddVaryingOffsetsIfNeeded(ctx, ptr, lvalueType);
+        ptr = lAddVaryingOffsetsIfNeeded(ctx, ptr, lvType);
 
         mask = LLVMMaskAllOn;
     }
     else {
         Symbol *baseSym = GetBaseSymbol();
-        Assert(baseSym != NULL);
+        AssertPos(pos, baseSym != NULL);
         mask = lMaskForSymbol(baseSym, ctx);
     }
 
     ctx->SetDebugPos(pos);
-    return ctx->LoadInst(ptr, mask, lvalueType, "index");
+    return ctx->LoadInst(ptr, mask, lvType);
 }
 
 
 const Type *
 IndexExpr::GetType() const {
+    if (type != NULL)
+        return type;
+
     const Type *baseExprType, *indexType;
     if (!baseExpr || !index || 
         ((baseExprType = baseExpr->GetType()) == NULL) ||
@@ -3902,17 +3998,16 @@ IndexExpr::GetType() const {
         return NULL;
 
     const Type *elementType = NULL;
-    const PointerType *pointerType = 
-        dynamic_cast<const PointerType *>(baseExprType);
+    const PointerType *pointerType = CastType<PointerType>(baseExprType);
     if (pointerType != NULL)
         // ptr[index] -> type that the pointer points to
         elementType = pointerType->GetBaseType();
     else {
         // sequential type[index] -> element type of the sequential type
         const SequentialType *sequentialType = 
-            dynamic_cast<const SequentialType *>(baseExprType->GetReferenceTarget());
+            CastType<SequentialType>(baseExprType->GetReferenceTarget());
         // Typechecking should have caught this...
-        Assert(sequentialType != NULL);
+        AssertPos(pos, sequentialType != NULL);
         elementType = sequentialType->GetElementType();
     }
 
@@ -3931,9 +4026,11 @@ IndexExpr::GetType() const {
     // type.
     if (indexType->IsUniformType() &&
         (pointerType == NULL || pointerType->IsUniformType()))
-        return elementType;
+        type = elementType;
     else
-        return elementType->GetAsVaryingType();
+        type = elementType->GetAsVaryingType();
+
+    return type;
 }
 
 
@@ -3949,10 +4046,10 @@ IndexExpr::GetBaseSymbol() const {
 static llvm::Value *
 lConvertToSlicePointer(FunctionEmitContext *ctx, llvm::Value *ptr,
                        const PointerType *slicePtrType) {
-    LLVM_TYPE_CONST llvm::Type *llvmSlicePtrType = 
+    llvm::Type *llvmSlicePtrType = 
         slicePtrType->LLVMType(g->ctx);
-    LLVM_TYPE_CONST llvm::StructType *sliceStructType =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(llvmSlicePtrType);
+    llvm::StructType *sliceStructType =
+        llvm::dyn_cast<llvm::StructType>(llvmSlicePtrType);
     Assert(sliceStructType != NULL &&
            sliceStructType->getElementType(0) == ptr->getType());
 
@@ -3960,7 +4057,7 @@ lConvertToSlicePointer(FunctionEmitContext *ctx, llvm::Value *ptr,
     // offsets
     llvm::Value *result = llvm::Constant::getNullValue(sliceStructType);
     // And replace the pointer in the struct with the given pointer
-    return ctx->InsertInst(result, ptr, 0);
+    return ctx->InsertInst(result, ptr, 0, LLVMGetName(ptr, "_slice"));
 }
 
 
@@ -3970,8 +4067,7 @@ lConvertToSlicePointer(FunctionEmitContext *ctx, llvm::Value *ptr,
 */
 static void
 lCheckIndicesVersusBounds(const Type *baseExprType, Expr *index) {
-    const SequentialType *seqType = 
-        dynamic_cast<const SequentialType *>(baseExprType);
+    const SequentialType *seqType = CastType<SequentialType>(baseExprType);
     if (seqType == NULL)
         return;
 
@@ -4006,10 +4102,9 @@ lCheckIndicesVersusBounds(const Type *baseExprType, Expr *index) {
 */
 static llvm::Value *
 lConvertPtrToSliceIfNeeded(FunctionEmitContext *ctx, 
-                           llvm::Value *ptr,
-                           const Type **type) {
+                           llvm::Value *ptr, const Type **type) {
     Assert(*type != NULL);
-    const PointerType *ptrType = dynamic_cast<const PointerType *>(*type);
+    const PointerType *ptrType = CastType<PointerType>(*type);
     bool convertToSlice = (ptrType->GetBaseType()->IsSOAType() &&
                            ptrType->IsSlice() == false);
     if (convertToSlice == false)
@@ -4025,23 +4120,23 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const {
     const Type *baseExprType;
     if (baseExpr == NULL || index == NULL || 
         ((baseExprType = baseExpr->GetType()) == NULL)) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
     ctx->SetDebugPos(pos);
     llvm::Value *indexValue = index->GetValue(ctx);
     if (indexValue == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
     ctx->SetDebugPos(pos);
-    if (dynamic_cast<const PointerType *>(baseExprType) != NULL) {
+    if (CastType<PointerType>(baseExprType) != NULL) {
         // We're indexing off of a pointer 
         llvm::Value *basePtrValue = baseExpr->GetValue(ctx);
         if (basePtrValue == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
         ctx->SetDebugPos(pos);
@@ -4051,7 +4146,8 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const {
                                                   &baseExprType);
 
         llvm::Value *ptr = ctx->GetElementPtrInst(basePtrValue, indexValue,
-                                                  baseExprType, "ptr_offset");
+                                                  baseExprType, 
+                                                  LLVMGetName(basePtrValue, "_offset"));
         return lAddVaryingOffsetsIfNeeded(ctx, ptr, GetLValueType());
     }
 
@@ -4059,16 +4155,16 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const {
     // a reference thereuponfore.)
     llvm::Value *basePtr = NULL;
     const PointerType *basePtrType = NULL;
-    if (dynamic_cast<const ArrayType *>(baseExprType) ||
-        dynamic_cast<const VectorType *>(baseExprType)) {
+    if (CastType<ArrayType>(baseExprType) ||
+        CastType<VectorType>(baseExprType)) {
         basePtr = baseExpr->GetLValue(ctx);
-        basePtrType = dynamic_cast<const PointerType *>(baseExpr->GetLValueType());
-        if (baseExpr->GetLValueType()) Assert(basePtrType != NULL);
+        basePtrType = CastType<PointerType>(baseExpr->GetLValueType());
+        if (baseExpr->GetLValueType()) AssertPos(pos, basePtrType != NULL);
     }
     else {
         baseExprType = baseExprType->GetReferenceTarget();
-        Assert(dynamic_cast<const ArrayType *>(baseExprType) ||
-               dynamic_cast<const VectorType *>(baseExprType));
+        AssertPos(pos, CastType<ArrayType>(baseExprType) ||
+               CastType<VectorType>(baseExprType));
         basePtr = baseExpr->GetValue(ctx);
         basePtrType = PointerType::GetUniform(baseExprType);
     }
@@ -4087,13 +4183,16 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const {
     // And do the actual indexing calculation..
     llvm::Value *ptr = 
         ctx->GetElementPtrInst(basePtr, LLVMInt32(0), indexValue, 
-                               basePtrType);
+                               basePtrType, LLVMGetName(basePtr, "_offset"));
     return lAddVaryingOffsetsIfNeeded(ctx, ptr, GetLValueType());
 }
 
 
 const Type *
 IndexExpr::GetLValueType() const {
+    if (lvalueType != NULL)
+        return lvalueType;
+
     const Type *baseExprType, *baseExprLValueType, *indexType;
     if (baseExpr == NULL || index == NULL ||
         ((baseExprType = baseExpr->GetType()) == NULL) ||
@@ -4102,48 +4201,47 @@ IndexExpr::GetLValueType() const {
         return NULL;
 
     // regularize to a PointerType 
-    if (dynamic_cast<const ReferenceType *>(baseExprLValueType) != NULL) {
+    if (CastType<ReferenceType>(baseExprLValueType) != NULL) {
         const Type *refTarget = baseExprLValueType->GetReferenceTarget();
         baseExprLValueType = PointerType::GetUniform(refTarget);
     }
-    Assert(dynamic_cast<const PointerType *>(baseExprLValueType) != NULL);
+    AssertPos(pos, CastType<PointerType>(baseExprLValueType) != NULL);
 
     // Find the type of thing that we're indexing into
     const Type *elementType;
     const SequentialType *st = 
-        dynamic_cast<const SequentialType *>(baseExprLValueType->GetBaseType());
+        CastType<SequentialType>(baseExprLValueType->GetBaseType());
     if (st != NULL)
         elementType = st->GetElementType();
     else {
         const PointerType *pt = 
-            dynamic_cast<const PointerType *>(baseExprLValueType->GetBaseType());
-        Assert(pt != NULL);
+            CastType<PointerType>(baseExprLValueType->GetBaseType());
+        AssertPos(pos, pt != NULL);
         elementType = pt->GetBaseType();
     }
 
     // Are we indexing into a varying type, or are we indexing with a
     // varying pointer?
     bool baseVarying;
-    if (dynamic_cast<const PointerType *>(baseExprType) != NULL)
+    if (CastType<PointerType>(baseExprType) != NULL)
         baseVarying = baseExprType->IsVaryingType();
     else
         baseVarying = baseExprLValueType->IsVaryingType();
 
     // The return type is uniform iff. the base is a uniform pointer / a
     // collection of uniform typed elements and the index is uniform.
-    const PointerType *retType;
     if (baseVarying == false && indexType->IsUniformType())
-        retType = PointerType::GetUniform(elementType);
+        lvalueType = PointerType::GetUniform(elementType);
     else
-        retType = PointerType::GetVarying(elementType);
+        lvalueType = PointerType::GetVarying(elementType);
 
     // Finally, if we're indexing into an SOA type, then the resulting
     // pointer must (currently) be a slice pointer; we don't allow indexing
     // the soa-width-wide structs directly.
     if (elementType->IsSOAType())
-        retType = retType->GetAsSlice();
+        lvalueType = lvalueType->GetAsSlice();
 
-    return retType;
+    return lvalueType;
 }
 
 
@@ -4160,18 +4258,18 @@ IndexExpr::TypeCheck() {
     const Type *indexType;
     if (baseExpr == NULL || index == NULL || 
         ((indexType = index->GetType()) == NULL)) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
     const Type *baseExprType = baseExpr->GetType();
     if (baseExprType == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
-    if (!dynamic_cast<const SequentialType *>(baseExprType->GetReferenceTarget()) &&
-        !dynamic_cast<const PointerType *>(baseExprType)) {
+    if (!CastType<SequentialType>(baseExprType->GetReferenceTarget()) &&
+        !CastType<PointerType>(baseExprType)) {
         Error(pos, "Trying to index into non-array, vector, or pointer "
               "type \"%s\".", baseExprType->GetString().c_str());
         return NULL;
@@ -4207,7 +4305,7 @@ IndexExpr::EstimateCost() const {
     const Type *baseExprType = baseExpr->GetType();
     
     if ((indexType != NULL && indexType->IsVaryingType()) ||
-        (dynamic_cast<const PointerType *>(baseExprType) != NULL &&
+        (CastType<PointerType>(baseExprType) != NULL &&
          baseExprType->IsVaryingType()))
         // be pessimistic; some of these will later turn out to be vector
         // loads/stores, but it's too early for us to know that here.
@@ -4286,6 +4384,9 @@ StructMemberExpr::StructMemberExpr(Expr *e, const char *id, SourcePos p,
 
 const Type *
 StructMemberExpr::GetType() const {
+    if (type != NULL)
+        return type;
+
     // It's a struct, and the result type is the element type, possibly
     // promoted to varying if the struct type / lvalue is varying.
     const Type *exprType, *lvalueType;
@@ -4294,7 +4395,7 @@ StructMemberExpr::GetType() const {
         ((exprType = expr->GetType()) == NULL) ||
         ((structType = getStructType()) == NULL) ||
         ((lvalueType = GetLValueType()) == NULL)) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
@@ -4306,14 +4407,14 @@ StructMemberExpr::GetType() const {
               getCandidateNearMatches().c_str());
         return NULL;
     }
-    Assert(Type::Equal(lvalueType->GetBaseType(), elementType));
+    AssertPos(pos, Type::Equal(lvalueType->GetBaseType(), elementType));
 
-    bool isSlice = (dynamic_cast<const PointerType *>(lvalueType) &&
-                    dynamic_cast<const PointerType *>(lvalueType)->IsSlice());
+    bool isSlice = (CastType<PointerType>(lvalueType) &&
+                    CastType<PointerType>(lvalueType)->IsSlice());
     if (isSlice) {
         // FIXME: not true if we allow bound unif/varying for soa<>
         // structs?...
-        Assert(elementType->IsSOAType());
+        AssertPos(pos, elementType->IsSOAType());
 
         // If we're accessing a member of an soa structure via a uniform
         // slice pointer, then the result type is the uniform variant of
@@ -4328,21 +4429,25 @@ StructMemberExpr::GetType() const {
         // result type must be the varying version of the element type.
         elementType = elementType->GetAsVaryingType();
 
-    return elementType;
+    type = elementType;
+    return type;
 }
 
 
 const Type *
 StructMemberExpr::GetLValueType() const {
+    if (lvalueType != NULL)
+        return lvalueType;
+
     if (expr == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
     const Type *exprLValueType = dereferenceExpr ? expr->GetType() :
         expr->GetLValueType();
     if (exprLValueType == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
@@ -4350,7 +4455,7 @@ StructMemberExpr::GetLValueType() const {
     // varying (and otherwise uniform)
     const PointerType *ptrType =
         (exprLValueType->IsUniformType() ||
-         dynamic_cast<const ReferenceType *>(exprLValueType) != NULL) ?
+         CastType<ReferenceType>(exprLValueType) != NULL) ?
         PointerType::GetUniform(getElementType()) : 
         PointerType::GetVarying(getElementType());
 
@@ -4358,11 +4463,12 @@ StructMemberExpr::GetLValueType() const {
     // needs to be a frozen slice pointer--i.e. any further indexing with
     // the result shouldn't modify the minor slice offset, but it should be
     // left unchanged until we get to a leaf SOA value.
-    if (dynamic_cast<const PointerType *>(exprLValueType) &&
-        dynamic_cast<const PointerType *>(exprLValueType)->IsSlice())
+    if (CastType<PointerType>(exprLValueType) &&
+        CastType<PointerType>(exprLValueType)->IsSlice())
         ptrType = ptrType->GetAsFrozenSlice();
 
-    return ptrType;
+    lvalueType = ptrType;
+    return lvalueType;
 }
 
 
@@ -4403,17 +4509,17 @@ StructMemberExpr::getStructType() const {
         return NULL;
 
     const Type *structType;
-    const ReferenceType *rt = dynamic_cast<const ReferenceType *>(type);
+    const ReferenceType *rt = CastType<ReferenceType>(type);
     if (rt != NULL)
         structType = rt->GetReferenceTarget();
     else {
-        const PointerType *pt = dynamic_cast<const PointerType *>(type);
-        Assert(pt != NULL);
+        const PointerType *pt = CastType<PointerType>(type);
+        AssertPos(pos, pt != NULL);
         structType = pt->GetBaseType();
     }
 
-    const StructType *ret = dynamic_cast<const StructType *>(structType);
-    Assert(ret != NULL);
+    const StructType *ret = CastType<StructType>(structType);
+    AssertPos(pos, ret != NULL);
     return ret;
 }
 
@@ -4445,17 +4551,17 @@ VectorMemberExpr::VectorMemberExpr(Expr *e, const char *id, SourcePos p,
                                    SourcePos idpos, bool derefLValue)
     : MemberExpr(e, id, p, idpos, derefLValue) {
     const Type *exprType = e->GetType();
-    exprVectorType = dynamic_cast<const VectorType *>(exprType);
+    exprVectorType = CastType<VectorType>(exprType);
     if (exprVectorType == NULL) {
-        const PointerType *pt = dynamic_cast<const PointerType *>(exprType);
+        const PointerType *pt = CastType<PointerType>(exprType);
         if (pt != NULL)
-            exprVectorType = dynamic_cast<const VectorType *>(pt->GetBaseType());
+            exprVectorType = CastType<VectorType>(pt->GetBaseType());
         else {
-            Assert(dynamic_cast<const ReferenceType *>(exprType) != NULL);
+            AssertPos(pos, CastType<ReferenceType>(exprType) != NULL);
             exprVectorType = 
-                dynamic_cast<const VectorType *>(exprType->GetReferenceTarget());
+                CastType<VectorType>(exprType->GetReferenceTarget());
         }
-        Assert(exprVectorType != NULL);
+        AssertPos(pos, exprVectorType != NULL);
     }
     memberType = new VectorType(exprVectorType->GetElementType(),
                                 identifier.length());
@@ -4464,25 +4570,28 @@ VectorMemberExpr::VectorMemberExpr(Expr *e, const char *id, SourcePos p,
 
 const Type *
 VectorMemberExpr::GetType() const {
+    if (type != NULL)
+        return type;
+
     // For 1-element expressions, we have the base vector element
     // type.  For n-element expressions, we have a shortvec type
     // with n > 1 elements.  This can be changed when we get
     // type<1> -> type conversions.
-    const Type *type = (identifier.length() == 1) ? 
+    type = (identifier.length() == 1) ? 
         (const Type *)exprVectorType->GetElementType() : 
         (const Type *)memberType;
 
-    const Type *lvalueType = GetLValueType();
-    if (lvalueType != NULL) {
-        bool isSlice = (dynamic_cast<const PointerType *>(lvalueType) &&
-                        dynamic_cast<const PointerType *>(lvalueType)->IsSlice());
+    const Type *lvType = GetLValueType();
+    if (lvType != NULL) {
+        bool isSlice = (CastType<PointerType>(lvType) &&
+                        CastType<PointerType>(lvType)->IsSlice());
         if (isSlice) {
-//CO            Assert(type->IsSOAType());
-            if (lvalueType->IsUniformType())
+//CO            AssertPos(pos, type->IsSOAType());
+            if (lvType->IsUniformType())
                 type = type->GetAsUniformType();
         }
 
-        if (lvalueType->IsVaryingType())
+        if (lvType->IsVaryingType())
             type = type->GetAsVaryingType();
     }
 
@@ -4502,9 +4611,12 @@ VectorMemberExpr::GetLValue(FunctionEmitContext* ctx) const {
 
 const Type *
 VectorMemberExpr::GetLValueType() const {
+    if (lvalueType != NULL)
+        return lvalueType;
+
     if (identifier.length() == 1) {
         if (expr == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
 
@@ -4514,30 +4626,30 @@ VectorMemberExpr::GetLValueType() const {
             return NULL;
 
         const VectorType *vt = NULL;
-        if (dynamic_cast<const ReferenceType *>(exprLValueType) != NULL)
-            vt = dynamic_cast<const VectorType *>(exprLValueType->GetReferenceTarget());
+        if (CastType<ReferenceType>(exprLValueType) != NULL)
+            vt = CastType<VectorType>(exprLValueType->GetReferenceTarget());
         else
-            vt = dynamic_cast<const VectorType *>(exprLValueType->GetBaseType());
-        Assert(vt != NULL);
+            vt = CastType<VectorType>(exprLValueType->GetBaseType());
+        AssertPos(pos, vt != NULL);
 
         // we don't want to report that it's e.g. a pointer to a float<1>,
         // but a pointer to a float, etc.
         const Type *elementType = vt->GetElementType();
-        if (dynamic_cast<const ReferenceType *>(exprLValueType) != NULL)
-            return new ReferenceType(elementType);
+        if (CastType<ReferenceType>(exprLValueType) != NULL)
+            lvalueType = new ReferenceType(elementType);
         else {
             const PointerType *ptrType = exprLValueType->IsUniformType() ?
                 PointerType::GetUniform(elementType) : 
                 PointerType::GetVarying(elementType);
             // FIXME: replicated logic with structmemberexpr.... 
-            if (dynamic_cast<const PointerType *>(exprLValueType) &&
-                dynamic_cast<const PointerType *>(exprLValueType)->IsSlice())
+            if (CastType<PointerType>(exprLValueType) &&
+                CastType<PointerType>(exprLValueType)->IsSlice())
                 ptrType = ptrType->GetAsFrozenSlice();
-            return ptrType;
+            lvalueType = ptrType;
         }
     }
-    else
-        return NULL;
+
+    return lvalueType;
 }
 
 
@@ -4571,14 +4683,14 @@ VectorMemberExpr::GetValue(FunctionEmitContext *ctx) const {
         }
 
         if (basePtr == NULL || basePtrType == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
 
         // Allocate temporary memory to tore the result
         llvm::Value *resultPtr = ctx->AllocaInst(memberType->LLVMType(g->ctx), 
-                                            "vector_tmp");
-
+                                                 "vector_tmp");
+        
         // FIXME: we should be able to use the internal mask here according
         // to the same logic where it's used elsewhere
         llvm::Value *elementMask = ctx->GetFullMask();
@@ -4589,17 +4701,19 @@ VectorMemberExpr::GetValue(FunctionEmitContext *ctx) const {
 
         ctx->SetDebugPos(pos);
         for (size_t i = 0; i < identifier.size(); ++i) {
+            char idStr[2] = { identifier[i], '\0' };
             llvm::Value *elementPtr = ctx->AddElementOffset(basePtr, indices[i],
-                                                            basePtrType);
+                                                            basePtrType,
+                                                            LLVMGetName(basePtr, idStr));
             llvm::Value *elementValue = 
-                ctx->LoadInst(elementPtr, elementMask, elementPtrType, 
-                              "vec_element");
+                ctx->LoadInst(elementPtr, elementMask, elementPtrType);
 
-            llvm::Value *ptmp = ctx->AddElementOffset(resultPtr, i, NULL);
+            const char *resultName = LLVMGetName(resultPtr, idStr);
+            llvm::Value *ptmp = ctx->AddElementOffset(resultPtr, i, NULL, resultName);
             ctx->StoreInst(elementValue, ptmp);
         }
 
-        return ctx->LoadInst(resultPtr, "swizzle_vec");
+        return ctx->LoadInst(resultPtr, LLVMGetName(basePtr, "_swizzle"));
     }
 }
 
@@ -4634,40 +4748,46 @@ MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos,
     if (e == NULL || (exprType = e->GetType()) == NULL)
         return NULL;
 
-    const ReferenceType *referenceType =
-        dynamic_cast<const ReferenceType *>(exprType);
+    const ReferenceType *referenceType = CastType<ReferenceType>(exprType);
     if (referenceType != NULL) {
         e = new RefDerefExpr(e, e->pos);
         exprType = e->GetType();
         Assert(exprType != NULL);
     }
 
-    const PointerType *pointerType = dynamic_cast<const PointerType *>(exprType);
+    const PointerType *pointerType = CastType<PointerType>(exprType);
     if (pointerType != NULL)
         exprType = pointerType->GetBaseType();
 
     if (derefLValue == true && pointerType == NULL) {
-        if (dynamic_cast<const StructType *>(exprType->GetReferenceTarget()) != NULL)
-            Error(p, "Dereference operator \"->\" can't be applied to non-pointer "
+        const Type *targetType = exprType->GetReferenceTarget();
+        if (CastType<StructType>(targetType) != NULL)
+            Error(p, "Member operator \"->\" can't be applied to non-pointer "
                   "type \"%s\".  Did you mean to use \".\"?", 
                   exprType->GetString().c_str());
         else
-            Error(p, "Dereference operator \"->\" can't be applied to non-struct "
+            Error(p, "Member operator \"->\" can't be applied to non-struct "
                   "pointer type \"%s\".", exprType->GetString().c_str());
         return NULL;
     }
     if (derefLValue == false && pointerType != NULL &&
-        dynamic_cast<const StructType *>(pointerType->GetBaseType()) != NULL) {
+        CastType<StructType>(pointerType->GetBaseType()) != NULL) {
             Error(p, "Member operator \".\" can't be applied to pointer "
                   "type \"%s\".  Did you mean to use \"->\"?", 
                   exprType->GetString().c_str());
         return NULL;
     }
 
-    if (dynamic_cast<const StructType *>(exprType) != NULL)
+    if (CastType<StructType>(exprType) != NULL)
         return new StructMemberExpr(e, id, p, idpos, derefLValue);
-    else if (dynamic_cast<const VectorType *>(exprType) != NULL)
+    else if (CastType<VectorType>(exprType) != NULL)
         return new VectorMemberExpr(e, id, p, idpos, derefLValue);
+    else if (CastType<UndefinedStructType>(exprType)) {
+        Error(p, "Member operator \"%s\" can't be applied to declared "
+              "but not defined struct type \"%s\".", derefLValue ? "->" : ".",
+              exprType->GetString().c_str());
+        return NULL;
+    }
     else {
         Error(p, "Member operator \"%s\" can't be used with expression of "
               "\"%s\" type.", derefLValue ? "->" : ".", 
@@ -4683,6 +4803,7 @@ MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos,
     expr = e;
     identifier = id;
     dereferenceExpr = derefLValue;
+    type = lvalueType = NULL;
 }
 
 
@@ -4696,12 +4817,15 @@ MemberExpr::GetValue(FunctionEmitContext *ctx) const {
 
     llvm::Value *mask = NULL;
     if (lvalue == NULL) {
+        if (m->errorCount > 0)
+            return NULL;
+
         // As in the array case, this may be a temporary that hasn't hit
         // memory; get the full value and stuff it into a temporary array
         // so that we can index from there...
         llvm::Value *val = expr->GetValue(ctx);
         if (!val) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
         ctx->SetDebugPos(pos);
@@ -4721,12 +4845,14 @@ MemberExpr::GetValue(FunctionEmitContext *ctx) const {
     }
     else {
         Symbol *baseSym = GetBaseSymbol();
-        Assert(baseSym != NULL);
+        AssertPos(pos, baseSym != NULL);
         mask = lMaskForSymbol(baseSym, ctx);
     }
 
     ctx->SetDebugPos(pos);
-    return ctx->LoadInst(lvalue, mask, lvalueType, "structelement");
+    std::string suffix = std::string("_") + identifier;
+    return ctx->LoadInst(lvalue, mask, lvalueType, 
+                         LLVMGetName(lvalue, suffix.c_str()));
 }
 
 
@@ -4768,7 +4894,12 @@ MemberExpr::GetLValue(FunctionEmitContext *ctx) const {
         expr->GetLValueType();
     ctx->SetDebugPos(pos);
     llvm::Value *ptr = ctx->AddElementOffset(basePtr, elementNumber,
-                                             exprLValueType);
+                                             exprLValueType, 
+                                             basePtr->getName().str().c_str());
+    if (ptr == NULL) {
+        AssertPos(pos, m->errorCount > 0);
+        return NULL;
+    }
 
     ptr = lAddVaryingOffsetsIfNeeded(ctx, ptr, GetLValueType());
 
@@ -4816,8 +4947,7 @@ MemberExpr::Print() const {
  */
 std::string
 MemberExpr::getCandidateNearMatches() const {
-    const StructType *structType = 
-        dynamic_cast<const StructType *>(expr->GetType());
+    const StructType *structType = CastType<StructType>(expr->GetType());
     if (!structType)
         return "";
 
@@ -4845,7 +4975,7 @@ ConstExpr::ConstExpr(const Type *t, int8_t i, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformInt8->GetAsConstType()));
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformInt8->GetAsConstType()));
     int8Val[0] = i;
 }
 
@@ -4854,7 +4984,7 @@ ConstExpr::ConstExpr(const Type *t, int8_t *i, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformInt8->GetAsConstType()) || 
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformInt8->GetAsConstType()) || 
            Type::Equal(type, AtomicType::VaryingInt8->GetAsConstType()));
     for (int j = 0; j < Count(); ++j)
         int8Val[j] = i[j];
@@ -4865,7 +4995,7 @@ ConstExpr::ConstExpr(const Type *t, uint8_t u, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformUInt8->GetAsConstType()));
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformUInt8->GetAsConstType()));
     uint8Val[0] = u;
 }
 
@@ -4874,7 +5004,7 @@ ConstExpr::ConstExpr(const Type *t, uint8_t *u, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformUInt8->GetAsConstType()) || 
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformUInt8->GetAsConstType()) || 
            Type::Equal(type, AtomicType::VaryingUInt8->GetAsConstType()));
     for (int j = 0; j < Count(); ++j)
         uint8Val[j] = u[j];
@@ -4885,7 +5015,7 @@ ConstExpr::ConstExpr(const Type *t, int16_t i, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformInt16->GetAsConstType()));
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformInt16->GetAsConstType()));
     int16Val[0] = i;
 }
 
@@ -4894,7 +5024,7 @@ ConstExpr::ConstExpr(const Type *t, int16_t *i, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformInt16->GetAsConstType()) || 
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformInt16->GetAsConstType()) || 
            Type::Equal(type, AtomicType::VaryingInt16->GetAsConstType()));
     for (int j = 0; j < Count(); ++j)
         int16Val[j] = i[j];
@@ -4905,7 +5035,7 @@ ConstExpr::ConstExpr(const Type *t, uint16_t u, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformUInt16->GetAsConstType()));
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformUInt16->GetAsConstType()));
     uint16Val[0] = u;
 }
 
@@ -4914,7 +5044,7 @@ ConstExpr::ConstExpr(const Type *t, uint16_t *u, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformUInt16->GetAsConstType()) || 
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformUInt16->GetAsConstType()) || 
            Type::Equal(type, AtomicType::VaryingUInt16->GetAsConstType()));
     for (int j = 0; j < Count(); ++j)
         uint16Val[j] = u[j];
@@ -4925,7 +5055,7 @@ ConstExpr::ConstExpr(const Type *t, int32_t i, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformInt32->GetAsConstType()));
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformInt32->GetAsConstType()));
     int32Val[0] = i;
 }
 
@@ -4934,7 +5064,7 @@ ConstExpr::ConstExpr(const Type *t, int32_t *i, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformInt32->GetAsConstType()) || 
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformInt32->GetAsConstType()) || 
            Type::Equal(type, AtomicType::VaryingInt32->GetAsConstType()));
     for (int j = 0; j < Count(); ++j)
         int32Val[j] = i[j];
@@ -4945,8 +5075,8 @@ ConstExpr::ConstExpr(const Type *t, uint32_t u, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformUInt32->GetAsConstType()) ||
-           (dynamic_cast<const EnumType *>(type) != NULL &&
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformUInt32->GetAsConstType()) ||
+           (CastType<EnumType>(type) != NULL &&
             type->IsUniformType()));
     uint32Val[0] = u;
 }
@@ -4956,9 +5086,9 @@ ConstExpr::ConstExpr(const Type *t, uint32_t *u, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformUInt32->GetAsConstType()) || 
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformUInt32->GetAsConstType()) || 
            Type::Equal(type, AtomicType::VaryingUInt32->GetAsConstType()) ||
-           (dynamic_cast<const EnumType *>(type) != NULL));
+           (CastType<EnumType>(type) != NULL));
     for (int j = 0; j < Count(); ++j)
         uint32Val[j] = u[j];
 }
@@ -4968,7 +5098,7 @@ ConstExpr::ConstExpr(const Type *t, float f, SourcePos p)
     : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformFloat->GetAsConstType()));
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformFloat->GetAsConstType()));
     floatVal[0] = f;
 }
 
@@ -4977,7 +5107,7 @@ ConstExpr::ConstExpr(const Type *t, float *f, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformFloat->GetAsConstType()) || 
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformFloat->GetAsConstType()) || 
            Type::Equal(type, AtomicType::VaryingFloat->GetAsConstType()));
     for (int j = 0; j < Count(); ++j)
         floatVal[j] = f[j];
@@ -4988,7 +5118,7 @@ ConstExpr::ConstExpr(const Type *t, int64_t i, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformInt64->GetAsConstType()));
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformInt64->GetAsConstType()));
     int64Val[0] = i;
 }
 
@@ -4997,7 +5127,7 @@ ConstExpr::ConstExpr(const Type *t, int64_t *i, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformInt64->GetAsConstType()) || 
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformInt64->GetAsConstType()) || 
            Type::Equal(type, AtomicType::VaryingInt64->GetAsConstType()));
     for (int j = 0; j < Count(); ++j)
         int64Val[j] = i[j];
@@ -5008,7 +5138,7 @@ ConstExpr::ConstExpr(const Type *t, uint64_t u, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformUInt64->GetAsConstType()));
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformUInt64->GetAsConstType()));
     uint64Val[0] = u;
 }
 
@@ -5017,7 +5147,7 @@ ConstExpr::ConstExpr(const Type *t, uint64_t *u, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformUInt64->GetAsConstType()) || 
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformUInt64->GetAsConstType()) || 
            Type::Equal(type, AtomicType::VaryingUInt64->GetAsConstType()));
     for (int j = 0; j < Count(); ++j)
         uint64Val[j] = u[j];
@@ -5028,7 +5158,7 @@ ConstExpr::ConstExpr(const Type *t, double f, SourcePos p)
     : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformDouble->GetAsConstType()));
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformDouble->GetAsConstType()));
     doubleVal[0] = f;
 }
 
@@ -5037,7 +5167,7 @@ ConstExpr::ConstExpr(const Type *t, double *f, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformDouble->GetAsConstType()) || 
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformDouble->GetAsConstType()) || 
            Type::Equal(type, AtomicType::VaryingDouble->GetAsConstType()));
     for (int j = 0; j < Count(); ++j)
         doubleVal[j] = f[j];
@@ -5048,7 +5178,7 @@ ConstExpr::ConstExpr(const Type *t, bool b, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformBool->GetAsConstType()));
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformBool->GetAsConstType()));
     boolVal[0] = b;
 }
 
@@ -5057,7 +5187,7 @@ ConstExpr::ConstExpr(const Type *t, bool *b, SourcePos p)
   : Expr(p) {
     type = t;
     type = type->GetAsConstType();
-    Assert(Type::Equal(type, AtomicType::UniformBool->GetAsConstType()) || 
+    AssertPos(pos, Type::Equal(type, AtomicType::UniformBool->GetAsConstType()) || 
            Type::Equal(type, AtomicType::VaryingBool->GetAsConstType()));
     for (int j = 0; j < Count(); ++j)
         boolVal[j] = b[j];
@@ -5167,11 +5297,11 @@ ConstExpr::ConstExpr(ConstExpr *old, SourcePos p)
 
 AtomicType::BasicType
 ConstExpr::getBasicType() const {
-    const AtomicType *at = dynamic_cast<const AtomicType *>(type);
+    const AtomicType *at = CastType<AtomicType>(type);
     if (at != NULL)
         return at->basicType;
     else {
-        Assert(dynamic_cast<const EnumType *>(type) != NULL);
+        AssertPos(pos, CastType<EnumType>(type) != NULL);
         return AtomicType::TYPE_UINT32;
     }
 }
@@ -5523,7 +5653,7 @@ ConstExpr::GetConstant(const Type *type) const {
     // Caller shouldn't be trying to stuff a varying value here into a
     // constant type.
     if (type->IsUniformType())
-        Assert(Count() == 1);
+        AssertPos(pos, Count() == 1);
 
     type = type->GetAsNonConstType();
     if (Type::Equal(type, AtomicType::UniformBool) || 
@@ -5582,7 +5712,7 @@ ConstExpr::GetConstant(const Type *type) const {
     }
     else if (Type::Equal(type, AtomicType::UniformUInt32) || 
              Type::Equal(type, AtomicType::VaryingUInt32) ||
-             dynamic_cast<const EnumType *>(type) != NULL) {
+             CastType<EnumType>(type) != NULL) {
         uint32_t uiv[ISPC_MAX_NVEC];
         AsUInt32(uiv, type->IsVaryingType());
         if (type->IsUniformType())
@@ -5626,13 +5756,13 @@ ConstExpr::GetConstant(const Type *type) const {
         else
             return LLVMDoubleVector(dv);
     }
-    else if (dynamic_cast<const PointerType *>(type) != NULL) {
+    else if (CastType<PointerType>(type) != NULL) {
         // The only time we should get here is if we have an integer '0'
         // constant that should be turned into a NULL pointer of the
         // appropriate type.
-        LLVM_TYPE_CONST llvm::Type *llvmType = type->LLVMType(g->ctx);
+        llvm::Type *llvmType = type->LLVMType(g->ctx);
         if (llvmType == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
 
@@ -5741,9 +5871,26 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
                 SourcePos pos) {
     llvm::Value *cast = NULL;
 
+    std::string opName = exprVal->getName().str();
+    switch (toType->basicType) {
+    case AtomicType::TYPE_BOOL: opName += "_to_bool"; break;
+    case AtomicType::TYPE_INT8: opName += "_to_int8"; break;
+    case AtomicType::TYPE_UINT8: opName += "_to_uint8"; break;
+    case AtomicType::TYPE_INT16: opName += "_to_int16"; break;
+    case AtomicType::TYPE_UINT16: opName += "_to_uint16"; break;
+    case AtomicType::TYPE_INT32: opName += "_to_int32"; break;
+    case AtomicType::TYPE_UINT32: opName += "_to_uint32"; break;
+    case AtomicType::TYPE_INT64: opName += "_to_int64"; break;
+    case AtomicType::TYPE_UINT64: opName += "_to_uint64"; break;
+    case AtomicType::TYPE_FLOAT: opName += "_to_float"; break;
+    case AtomicType::TYPE_DOUBLE: opName += "_to_double"; break;
+    default: FATAL("Unimplemented");
+    }
+    const char *cOpName = opName.c_str();
+
     switch (toType->basicType) {
     case AtomicType::TYPE_FLOAT: {
-        LLVM_TYPE_CONST llvm::Type *targetType = 
+        llvm::Type *targetType = 
             fromType->IsUniformType() ? LLVMTypes::FloatType : 
                                         LLVMTypes::FloatVectorType;
         switch (fromType->basicType) {
@@ -5752,34 +5899,34 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                 // If we have a bool vector of i32 elements, first truncate
                 // down to a single bit
-                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
+                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
             // And then do an unisgned int->float cast
             cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
-                                 exprVal, targetType, "bool2float");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT8:
         case AtomicType::TYPE_INT16:
         case AtomicType::TYPE_INT32:
         case AtomicType::TYPE_INT64:
             cast = ctx->CastInst(llvm::Instruction::SIToFP, // signed int to float
-                                 exprVal, targetType, "int2float");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_UINT8:
         case AtomicType::TYPE_UINT16:
         case AtomicType::TYPE_UINT32:
         case AtomicType::TYPE_UINT64:
-            if (fromType->IsVaryingType())
+            if (fromType->IsVaryingType() && g->target.isa != Target::GENERIC)
                 PerformanceWarning(pos, "Conversion from unsigned int to float is slow. "
                                    "Use \"int\" if possible");
             cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to float
-                                 exprVal, targetType, "uint2float");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_FLOAT:
             // No-op cast.
             cast = exprVal;
             break;
         case AtomicType::TYPE_DOUBLE:
-            cast = ctx->FPCastInst(exprVal, targetType, "double2float");
+            cast = ctx->FPCastInst(exprVal, targetType, cOpName);
             break;
         default:
             FATAL("unimplemented");
@@ -5787,7 +5934,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
         break;
     }
     case AtomicType::TYPE_DOUBLE: {
-        LLVM_TYPE_CONST llvm::Type *targetType = 
+        llvm::Type *targetType = 
             fromType->IsUniformType() ? LLVMTypes::DoubleType :
                                         LLVMTypes::DoubleVectorType;
         switch (fromType->basicType) {
@@ -5795,26 +5942,26 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             if (fromType->IsVaryingType() && 
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                 // truncate i32 bool vector values to i1s
-                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
+                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
             cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double
-                                 exprVal, targetType, "bool2double");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT8:
         case AtomicType::TYPE_INT16:
         case AtomicType::TYPE_INT32:
         case AtomicType::TYPE_INT64:
             cast = ctx->CastInst(llvm::Instruction::SIToFP, // signed int
-                                 exprVal, targetType, "int2double");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_UINT8:
         case AtomicType::TYPE_UINT16:
         case AtomicType::TYPE_UINT32:
         case AtomicType::TYPE_UINT64:
             cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
-                                 exprVal, targetType, "uint2double");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_FLOAT:
-            cast = ctx->FPCastInst(exprVal, targetType, "float2double");
+            cast = ctx->FPCastInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_DOUBLE:
             cast = exprVal;
@@ -5825,15 +5972,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
         break;
     }
     case AtomicType::TYPE_INT8: {
-        LLVM_TYPE_CONST llvm::Type *targetType = 
+        llvm::Type *targetType = 
             fromType->IsUniformType() ? LLVMTypes::Int8Type :
                                         LLVMTypes::Int8VectorType;
         switch (fromType->basicType) {
         case AtomicType::TYPE_BOOL:
             if (fromType->IsVaryingType() && 
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            cast = ctx->ZExtInst(exprVal, targetType, "bool2int");
+                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT8:
         case AtomicType::TYPE_UINT8:
@@ -5845,15 +5992,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
         case AtomicType::TYPE_UINT32:
         case AtomicType::TYPE_INT64:
         case AtomicType::TYPE_UINT64:
-            cast = ctx->TruncInst(exprVal, targetType, "int64_to_int8");
+            cast = ctx->TruncInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_FLOAT:
             cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
-                                 exprVal, targetType, "float2int");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_DOUBLE:
             cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
-                                 exprVal, targetType, "double2int");
+                                 exprVal, targetType, cOpName);
             break;
         default:
             FATAL("unimplemented");
@@ -5861,15 +6008,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
         break;
     }
     case AtomicType::TYPE_UINT8: {
-        LLVM_TYPE_CONST llvm::Type *targetType = 
+        llvm::Type *targetType = 
             fromType->IsUniformType() ? LLVMTypes::Int8Type :
                                         LLVMTypes::Int8VectorType;
         switch (fromType->basicType) {
         case AtomicType::TYPE_BOOL:
             if (fromType->IsVaryingType() && 
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            cast = ctx->ZExtInst(exprVal, targetType, "bool2uint");
+                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT8:
         case AtomicType::TYPE_UINT8:
@@ -5881,21 +6028,21 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
         case AtomicType::TYPE_UINT32:
         case AtomicType::TYPE_INT64:
         case AtomicType::TYPE_UINT64:
-            cast = ctx->TruncInst(exprVal, targetType, "int64_to_uint8");
+            cast = ctx->TruncInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_FLOAT:
-            if (fromType->IsVaryingType())
+            if (fromType->IsVaryingType() && g->target.isa != Target::GENERIC)
                 PerformanceWarning(pos, "Conversion from float to unsigned int is slow. "
                                    "Use \"int\" if possible");
             cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
-                                 exprVal, targetType, "float2uint");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_DOUBLE:
-            if (fromType->IsVaryingType())
+            if (fromType->IsVaryingType() && g->target.isa != Target::GENERIC)
                 PerformanceWarning(pos, "Conversion from double to unsigned int is slow. "
                                    "Use \"int\" if possible");
             cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
-                                 exprVal, targetType, "double2uint");
+                                 exprVal, targetType, cOpName);
             break;
         default:
             FATAL("unimplemented");
@@ -5903,21 +6050,21 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
         break;
     }
     case AtomicType::TYPE_INT16: {
-        LLVM_TYPE_CONST llvm::Type *targetType = 
+        llvm::Type *targetType = 
             fromType->IsUniformType() ? LLVMTypes::Int16Type :
                                         LLVMTypes::Int16VectorType;
         switch (fromType->basicType) {
         case AtomicType::TYPE_BOOL:
             if (fromType->IsVaryingType() && 
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            cast = ctx->ZExtInst(exprVal, targetType, "bool2int");
+                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT8:
-            cast = ctx->SExtInst(exprVal, targetType, "int2int16");
+            cast = ctx->SExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_UINT8:
-            cast = ctx->ZExtInst(exprVal, targetType, "uint2uint16");
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT16:
         case AtomicType::TYPE_UINT16:
@@ -5925,17 +6072,17 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             break;
         case AtomicType::TYPE_FLOAT:
             cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
-                                 exprVal, targetType, "float2int");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT32:
         case AtomicType::TYPE_UINT32:
         case AtomicType::TYPE_INT64:
         case AtomicType::TYPE_UINT64:
-            cast = ctx->TruncInst(exprVal, targetType, "int64_to_int16");
+            cast = ctx->TruncInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_DOUBLE:
             cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
-                                 exprVal, targetType, "double2int");
+                                 exprVal, targetType, cOpName);
             break;
         default:
             FATAL("unimplemented");
@@ -5943,45 +6090,45 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
         break;
     }
     case AtomicType::TYPE_UINT16: {
-        LLVM_TYPE_CONST llvm::Type *targetType = 
+        llvm::Type *targetType = 
             fromType->IsUniformType() ? LLVMTypes::Int16Type :
                                         LLVMTypes::Int16VectorType;
         switch (fromType->basicType) {
         case AtomicType::TYPE_BOOL:
             if (fromType->IsVaryingType() && 
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            cast = ctx->ZExtInst(exprVal, targetType, "bool2uint16");
+                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT8:
-            cast = ctx->SExtInst(exprVal, targetType, "uint2uint16");
+            cast = ctx->SExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_UINT8:
-            cast = ctx->ZExtInst(exprVal, targetType, "uint2uint16");
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;            
         case AtomicType::TYPE_INT16:
         case AtomicType::TYPE_UINT16:
             cast = exprVal;
             break;
         case AtomicType::TYPE_FLOAT:
-            if (fromType->IsVaryingType())
+            if (fromType->IsVaryingType() && g->target.isa != Target::GENERIC)
                 PerformanceWarning(pos, "Conversion from float to unsigned int is slow. "
                                    "Use \"int\" if possible");
             cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
-                                 exprVal, targetType, "float2uint");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT32:
         case AtomicType::TYPE_UINT32:
         case AtomicType::TYPE_INT64:
         case AtomicType::TYPE_UINT64:
-            cast = ctx->TruncInst(exprVal, targetType, "int64_to_uint16");
+            cast = ctx->TruncInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_DOUBLE:
-            if (fromType->IsVaryingType())
+            if (fromType->IsVaryingType() && g->target.isa != Target::GENERIC)
                 PerformanceWarning(pos, "Conversion from double to unsigned int is slow. "
                                    "Use \"int\" if possible");
             cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
-                                 exprVal, targetType, "double2uint");
+                                 exprVal, targetType, cOpName);
             break;
         default:
             FATAL("unimplemented");
@@ -5989,23 +6136,23 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
         break;
     }
     case AtomicType::TYPE_INT32: {
-        LLVM_TYPE_CONST llvm::Type *targetType = 
+        llvm::Type *targetType = 
             fromType->IsUniformType() ? LLVMTypes::Int32Type :
                                         LLVMTypes::Int32VectorType;
         switch (fromType->basicType) {
         case AtomicType::TYPE_BOOL:
             if (fromType->IsVaryingType() && 
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            cast = ctx->ZExtInst(exprVal, targetType, "bool2int");
+                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT8:
         case AtomicType::TYPE_INT16:
-            cast = ctx->SExtInst(exprVal, targetType, "int2int32");
+            cast = ctx->SExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_UINT8:
         case AtomicType::TYPE_UINT16:
-            cast = ctx->ZExtInst(exprVal, targetType, "uint2uint32");
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT32:
         case AtomicType::TYPE_UINT32:
@@ -6013,15 +6160,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             break;
         case AtomicType::TYPE_FLOAT:
             cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
-                                 exprVal, targetType, "float2int");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT64:
         case AtomicType::TYPE_UINT64:
-            cast = ctx->TruncInst(exprVal, targetType, "int64_to_int32");
+            cast = ctx->TruncInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_DOUBLE:
             cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
-                                 exprVal, targetType, "double2int");
+                                 exprVal, targetType, cOpName);
             break;
         default:
             FATAL("unimplemented");
@@ -6029,45 +6176,45 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
         break;
     }
     case AtomicType::TYPE_UINT32: {
-        LLVM_TYPE_CONST llvm::Type *targetType = 
+        llvm::Type *targetType = 
             fromType->IsUniformType() ? LLVMTypes::Int32Type :
                                         LLVMTypes::Int32VectorType;
         switch (fromType->basicType) {
         case AtomicType::TYPE_BOOL:
             if (fromType->IsVaryingType() && 
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            cast = ctx->ZExtInst(exprVal, targetType, "bool2uint");
+                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT8:
         case AtomicType::TYPE_INT16:
-            cast = ctx->SExtInst(exprVal, targetType, "uint2uint");
+            cast = ctx->SExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_UINT8:
         case AtomicType::TYPE_UINT16:
-            cast = ctx->ZExtInst(exprVal, targetType, "uint2uint");
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;            
         case AtomicType::TYPE_INT32:
         case AtomicType::TYPE_UINT32:
             cast = exprVal;
             break;
         case AtomicType::TYPE_FLOAT:
-            if (fromType->IsVaryingType())
+            if (fromType->IsVaryingType() && g->target.isa != Target::GENERIC)
                 PerformanceWarning(pos, "Conversion from float to unsigned int is slow. "
                                    "Use \"int\" if possible");
             cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
-                                 exprVal, targetType, "float2uint");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT64:
         case AtomicType::TYPE_UINT64:
-            cast = ctx->TruncInst(exprVal, targetType, "int64_to_uint32");
+            cast = ctx->TruncInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_DOUBLE:
-            if (fromType->IsVaryingType())
+            if (fromType->IsVaryingType() && g->target.isa != Target::GENERIC)
                 PerformanceWarning(pos, "Conversion from double to unsigned int is slow. "
                                    "Use \"int\" if possible");
             cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
-                                 exprVal, targetType, "double2uint");
+                                 exprVal, targetType, cOpName);
             break;
         default:
             FATAL("unimplemented");
@@ -6075,29 +6222,29 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
         break;
     }
     case AtomicType::TYPE_INT64: {
-        LLVM_TYPE_CONST llvm::Type *targetType = 
+        llvm::Type *targetType = 
             fromType->IsUniformType() ? LLVMTypes::Int64Type : 
                                         LLVMTypes::Int64VectorType;
         switch (fromType->basicType) {
         case AtomicType::TYPE_BOOL:
             if (fromType->IsVaryingType() &&
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            cast = ctx->ZExtInst(exprVal, targetType, "bool2int64");
+                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT8:
         case AtomicType::TYPE_INT16:
         case AtomicType::TYPE_INT32:
-            cast = ctx->SExtInst(exprVal, targetType, "int_to_int64");
+            cast = ctx->SExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_UINT8:
         case AtomicType::TYPE_UINT16:
         case AtomicType::TYPE_UINT32:
-            cast = ctx->ZExtInst(exprVal, targetType, "uint_to_int64");
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_FLOAT:
             cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
-                                 exprVal, targetType, "float2int64");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT64:
         case AtomicType::TYPE_UINT64:
@@ -6105,7 +6252,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             break;
         case AtomicType::TYPE_DOUBLE:
             cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
-                                 exprVal, targetType, "double2int64");
+                                 exprVal, targetType, cOpName);
             break;
         default:
             FATAL("unimplemented");
@@ -6113,43 +6260,43 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
         break;
     }
     case AtomicType::TYPE_UINT64: {
-        LLVM_TYPE_CONST llvm::Type *targetType = 
+        llvm::Type *targetType = 
             fromType->IsUniformType() ? LLVMTypes::Int64Type : 
                                         LLVMTypes::Int64VectorType;
         switch (fromType->basicType) {
         case AtomicType::TYPE_BOOL:
             if (fromType->IsVaryingType() && 
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            cast = ctx->ZExtInst(exprVal, targetType, "bool2uint");
+                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName);
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT8:
         case AtomicType::TYPE_INT16:
         case AtomicType::TYPE_INT32:
-            cast = ctx->SExtInst(exprVal, targetType, "int_to_uint64");
+            cast = ctx->SExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_UINT8:
         case AtomicType::TYPE_UINT16:
         case AtomicType::TYPE_UINT32:
-            cast = ctx->ZExtInst(exprVal, targetType, "uint_to_uint64");
+            cast = ctx->ZExtInst(exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_FLOAT:
-            if (fromType->IsVaryingType())
+            if (fromType->IsVaryingType() && g->target.isa != Target::GENERIC)
                 PerformanceWarning(pos, "Conversion from float to unsigned int64 is slow. "
                                    "Use \"int64\" if possible");
             cast = ctx->CastInst(llvm::Instruction::FPToUI, // signed int
-                                 exprVal, targetType, "float2uint");
+                                 exprVal, targetType, cOpName);
             break;
         case AtomicType::TYPE_INT64:
         case AtomicType::TYPE_UINT64:
             cast = exprVal;
             break;
         case AtomicType::TYPE_DOUBLE:
-            if (fromType->IsVaryingType())
+            if (fromType->IsVaryingType() && g->target.isa != Target::GENERIC)
                 PerformanceWarning(pos, "Conversion from double to unsigned int64 is slow. "
                                    "Use \"int64\" if possible");
             cast = ctx->CastInst(llvm::Instruction::FPToUI, // signed int
-                                 exprVal, targetType, "double2uint");
+                                 exprVal, targetType, cOpName);
             break;
         default:
             FATAL("unimplemented");
@@ -6166,7 +6313,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt8(0) : 
                 (llvm::Value *)LLVMInt8Vector((int8_t)0);
             cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
-                                exprVal, zero, "cmpi0");
+                                exprVal, zero, cOpName);
             break;
         }
         case AtomicType::TYPE_INT16:
@@ -6174,7 +6321,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt16(0) : 
                 (llvm::Value *)LLVMInt16Vector((int16_t)0);
             cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
-                                exprVal, zero, "cmpi0");
+                                exprVal, zero, cOpName);
             break;
         }
         case AtomicType::TYPE_INT32:
@@ -6182,14 +6329,14 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt32(0) : 
                 (llvm::Value *)LLVMInt32Vector(0);
             cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
-                                exprVal, zero, "cmpi0");
+                                exprVal, zero, cOpName);
             break;
         }
         case AtomicType::TYPE_FLOAT: {
             llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMFloat(0.f) : 
                 (llvm::Value *)LLVMFloatVector(0.f);
             cast = ctx->CmpInst(llvm::Instruction::FCmp, llvm::CmpInst::FCMP_ONE,
-                                exprVal, zero, "cmpf0");
+                                exprVal, zero, cOpName);
             break;
         }
         case AtomicType::TYPE_INT64:
@@ -6197,14 +6344,14 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt64(0) : 
                 (llvm::Value *)LLVMInt64Vector((int64_t)0);
             cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
-                                exprVal, zero, "cmpi0");
+                                exprVal, zero, cOpName);
             break;
         }
         case AtomicType::TYPE_DOUBLE: {
             llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMDouble(0.) : 
                 (llvm::Value *)LLVMDoubleVector(0.);
             cast = ctx->CmpInst(llvm::Instruction::FCmp, llvm::CmpInst::FCMP_ONE,
-                                exprVal, zero, "cmpd0");
+                                exprVal, zero, cOpName);
             break;
         }
         default:
@@ -6218,7 +6365,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
                 // turn into a vector below, the way it does for everyone
                 // else...
                 cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(),
-                                     "i1bool_to_i32bool");
+                                     LLVMGetName(cast, "to_i32bool"));
             }
         }
         else
@@ -6254,10 +6401,9 @@ lUniformValueToVarying(FunctionEmitContext *ctx, llvm::Value *value,
 
     // for structs/arrays/vectors, just recursively make their elements
     // varying (if needed) and populate the return value.
-    const CollectionType *collectionType = 
-        dynamic_cast<const CollectionType *>(type);
+    const CollectionType *collectionType = CastType<CollectionType>(type);
     if (collectionType != NULL) {
-        LLVM_TYPE_CONST llvm::Type *llvmType = 
+        llvm::Type *llvmType = 
             type->GetAsVaryingType()->LLVMType(g->ctx);
         llvm::Value *retValue = llvm::UndefValue::get(llvmType);
 
@@ -6269,9 +6415,10 @@ lUniformValueToVarying(FunctionEmitContext *ctx, llvm::Value *value,
         return retValue;
     }
 
-    // Otherwise we must have a uniform AtomicType, so smear its value
-    // across the vector lanes.
-    Assert(dynamic_cast<const AtomicType *>(type) != NULL);
+    // Otherwise we must have a uniform atomic or pointer type, so smear
+    // its value across the vector lanes.
+    Assert(CastType<AtomicType>(type) != NULL ||
+           CastType<PointerType>(type) != NULL);
     return ctx->SmearUniform(value);
 }
 
@@ -6283,15 +6430,22 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
 
     ctx->SetDebugPos(pos);
     const Type *toType = GetType(), *fromType = expr->GetType();
-    if (!toType || !fromType || Type::Equal(toType, AtomicType::Void) || 
-        Type::Equal(fromType, AtomicType::Void))
-        // an error should have been issued elsewhere in this case
+    if (toType == NULL || fromType == NULL) {
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
+    }
 
-    const PointerType *fromPointerType = dynamic_cast<const PointerType *>(fromType);
-    const PointerType *toPointerType = dynamic_cast<const PointerType *>(toType);
-    const ArrayType *toArrayType = dynamic_cast<const ArrayType *>(toType);
-    const ArrayType *fromArrayType = dynamic_cast<const ArrayType *>(fromType);
+    if (Type::Equal(toType, AtomicType::Void)) {
+        // emit the code for the expression in case it has side-effects but
+        // then we're done.
+        (void)expr->GetValue(ctx);
+        return NULL;
+    }
+
+    const PointerType *fromPointerType = CastType<PointerType>(fromType);
+    const PointerType *toPointerType = CastType<PointerType>(toType);
+    const ArrayType *toArrayType = CastType<ArrayType>(toType);
+    const ArrayType *fromArrayType = CastType<ArrayType>(fromType);
     if (fromPointerType != NULL) {
         if (toArrayType != NULL) {
             return expr->GetValue(ctx);
@@ -6328,11 +6482,11 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
             }
             else {
                 // Uniform -> varying pointer conversion
-                Assert(fromType->IsUniformType() && toType->IsVaryingType());
+                AssertPos(pos, fromType->IsUniformType() && toType->IsVaryingType());
                 if (fromPointerType->IsSlice()) {
                     // For slice pointers, we need to smear out both the
                     // pointer and the offset vector
-                    Assert(toPointerType->IsSlice());
+                    AssertPos(pos, toPointerType->IsSlice());
                     llvm::Value *ptr = ctx->ExtractInst(value, 0);
                     llvm::Value *offset = ctx->ExtractInst(value, 1);
                     ptr = ctx->PtrToIntInst(ptr);
@@ -6349,13 +6503,13 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
             }
         }
         else {
-            Assert(dynamic_cast<const AtomicType *>(toType) != NULL);
+            AssertPos(pos, CastType<AtomicType>(toType) != NULL);
             if (toType->IsBoolType()) {
                 // convert pointer to bool
-                LLVM_TYPE_CONST llvm::Type *lfu = 
+                llvm::Type *lfu = 
                     fromType->GetAsUniformType()->LLVMType(g->ctx);
-                LLVM_TYPE_CONST llvm::PointerType *llvmFromUnifType = 
-                    llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(lfu);
+                llvm::PointerType *llvmFromUnifType = 
+                    llvm::dyn_cast<llvm::PointerType>(lfu);
 
                 llvm::Value *nullPtrValue = 
                     llvm::ConstantPointerNull::get(llvmFromUnifType);
@@ -6384,7 +6538,7 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
                 if (toType->IsVaryingType() && fromType->IsUniformType())
                     value = ctx->SmearUniform(value);
 
-                LLVM_TYPE_CONST llvm::Type *llvmToType = toType->LLVMType(g->ctx);
+                llvm::Type *llvmToType = toType->LLVMType(g->ctx);
                 if (llvmToType == NULL)
                     return NULL;
                 return ctx->PtrToIntInst(value, llvmToType, "ptr_typecast");
@@ -6401,20 +6555,21 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
         // implicit array to pointer to first element
         Expr *arrayAsPtr = lArrayToPointer(expr);
         if (Type::EqualIgnoringConst(arrayAsPtr->GetType(), toPointerType) == false) {
-            Assert(Type::EqualIgnoringConst(arrayAsPtr->GetType()->GetAsVaryingType(),
+            AssertPos(pos, PointerType::IsVoidPointer(toPointerType) ||
+                   Type::EqualIgnoringConst(arrayAsPtr->GetType()->GetAsVaryingType(),
                                             toPointerType) == true);
             arrayAsPtr = new TypeCastExpr(toPointerType, arrayAsPtr, pos);
             arrayAsPtr = ::TypeCheck(arrayAsPtr);
-            Assert(arrayAsPtr != NULL);
+            AssertPos(pos, arrayAsPtr != NULL);
             arrayAsPtr = ::Optimize(arrayAsPtr);
-            Assert(arrayAsPtr != NULL);
+            AssertPos(pos, arrayAsPtr != NULL);
         }
-        Assert(Type::EqualIgnoringConst(arrayAsPtr->GetType(), toPointerType));
+        AssertPos(pos, Type::EqualIgnoringConst(arrayAsPtr->GetType(), toPointerType));
         return arrayAsPtr->GetValue(ctx);
     }
 
     // This also should be caught during typechecking
-    Assert(!(toType->IsUniformType() && fromType->IsVaryingType()));
+    AssertPos(pos, !(toType->IsUniformType() && fromType->IsVaryingType()));
 
     if (toArrayType != NULL && fromArrayType != NULL) {
         // cast array pointer from [n x foo] to [0 x foo] if needed to be able
@@ -6423,21 +6578,21 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
             (toArrayType->GetElementCount() != fromArrayType->GetElementCount()))
             Warning(pos, "Type-converting array of length %d to length %d",
                     fromArrayType->GetElementCount(), toArrayType->GetElementCount());
-        Assert(Type::EqualIgnoringConst(toArrayType->GetBaseType(),
+        AssertPos(pos, Type::EqualIgnoringConst(toArrayType->GetBaseType(),
                                         fromArrayType->GetBaseType()));
         llvm::Value *v = expr->GetValue(ctx);
-        LLVM_TYPE_CONST llvm::Type *ptype = toType->LLVMType(g->ctx);
+        llvm::Type *ptype = toType->LLVMType(g->ctx);
         return ctx->BitCastInst(v, ptype); //, "array_cast_0size");
     }
 
-    const ReferenceType *toReference = dynamic_cast<const ReferenceType *>(toType);
-    const ReferenceType *fromReference = dynamic_cast<const ReferenceType *>(fromType);
+    const ReferenceType *toReference = CastType<ReferenceType>(toType);
+    const ReferenceType *fromReference = CastType<ReferenceType>(fromType);
     if (toReference && fromReference) {
         const Type *toTarget = toReference->GetReferenceTarget();
         const Type *fromTarget = fromReference->GetReferenceTarget();
 
-        const ArrayType *toArray = dynamic_cast<const ArrayType *>(toTarget);
-        const ArrayType *fromArray = dynamic_cast<const ArrayType *>(fromTarget);
+        const ArrayType *toArray = CastType<ArrayType>(toTarget);
+        const ArrayType *fromArray = CastType<ArrayType>(fromTarget);
         if (toArray && fromArray) {
             // cast array pointer from [n x foo] to [0 x foo] if needed to be able
             // to pass to a function that takes an unsized array as a parameter
@@ -6445,25 +6600,26 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
                (toArray->GetElementCount() != fromArray->GetElementCount()))
                 Warning(pos, "Type-converting array of length %d to length %d",
                         fromArray->GetElementCount(), toArray->GetElementCount());
-            Assert(Type::EqualIgnoringConst(toArray->GetBaseType(),
+            AssertPos(pos, Type::EqualIgnoringConst(toArray->GetBaseType(),
                                             fromArray->GetBaseType()));
             llvm::Value *v = expr->GetValue(ctx);
-            LLVM_TYPE_CONST llvm::Type *ptype = toType->LLVMType(g->ctx);
+            llvm::Type *ptype = toType->LLVMType(g->ctx);
             return ctx->BitCastInst(v, ptype); //, "array_cast_0size");
         }
 
-        Assert(Type::Equal(toTarget, fromTarget) ||
+        AssertPos(pos, Type::Equal(toTarget, fromTarget) ||
                Type::Equal(toTarget, fromTarget->GetAsConstType()));
         return expr->GetValue(ctx);
     }
 
-    const StructType *toStruct = dynamic_cast<const StructType *>(toType);
-    const StructType *fromStruct = dynamic_cast<const StructType *>(fromType);
+    const StructType *toStruct = CastType<StructType>(toType);
+    const StructType *fromStruct = CastType<StructType>(fromType);
     if (toStruct && fromStruct) {
         // The only legal type conversions for structs are to go from a
         // uniform to a varying instance of the same struct type.
-        Assert(toStruct->IsVaryingType() && fromStruct->IsUniformType() &&
-               Type::Equal(toStruct, fromStruct->GetAsVaryingType()));
+        AssertPos(pos, toStruct->IsVaryingType() && fromStruct->IsUniformType() &&
+               Type::EqualIgnoringConst(toStruct, 
+                                        fromStruct->GetAsVaryingType()));
 
         llvm::Value *origValue = expr->GetValue(ctx);
         if (!origValue)
@@ -6471,11 +6627,11 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
         return lUniformValueToVarying(ctx, origValue, fromType);
     }
 
-    const VectorType *toVector = dynamic_cast<const VectorType *>(toType);
-    const VectorType *fromVector = dynamic_cast<const VectorType *>(fromType);
+    const VectorType *toVector = CastType<VectorType>(toType);
+    const VectorType *fromVector = CastType<VectorType>(fromType);
     if (toVector && fromVector) {
         // this should be caught during typechecking
-        Assert(toVector->GetElementCount() == fromVector->GetElementCount());
+        AssertPos(pos, toVector->GetElementCount() == fromVector->GetElementCount());
 
         llvm::Value *exprVal = expr->GetValue(ctx);
         if (!exprVal)
@@ -6504,8 +6660,8 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
     if (!exprVal)
         return NULL;
 
-    const EnumType *fromEnum = dynamic_cast<const EnumType *>(fromType);
-    const EnumType *toEnum = dynamic_cast<const EnumType *>(toType);
+    const EnumType *fromEnum = CastType<EnumType>(fromType);
+    const EnumType *toEnum = CastType<EnumType>(toType);
     if (fromEnum)
         // treat it as an uint32 type for the below and all will be good.
         fromType = fromEnum->IsUniformType() ? AtomicType::UniformUInt32 :
@@ -6515,9 +6671,9 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
         toType = toEnum->IsUniformType() ? AtomicType::UniformUInt32 :
             AtomicType::VaryingUInt32;
 
-    const AtomicType *fromAtomic = dynamic_cast<const AtomicType *>(fromType);
+    const AtomicType *fromAtomic = CastType<AtomicType>(fromType);
     // at this point, coming from an atomic type is all that's left...
-    Assert(fromAtomic != NULL);
+    AssertPos(pos, fromAtomic != NULL);
 
     if (toVector) {
         // scalar -> short vector conversion
@@ -6536,16 +6692,16 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
         if (toType->IsVaryingType() && fromType->IsUniformType())
             exprVal = ctx->SmearUniform(exprVal);
 
-        LLVM_TYPE_CONST llvm::Type *llvmToType = toType->LLVMType(g->ctx);
+        llvm::Type *llvmToType = toType->LLVMType(g->ctx);
         if (llvmToType == NULL)
             return NULL;
 
         return ctx->IntToPtrInst(exprVal, llvmToType, "int_to_ptr");
     }
     else {
-        const AtomicType *toAtomic = dynamic_cast<const AtomicType *>(toType);
+        const AtomicType *toAtomic = CastType<AtomicType>(toType);
         // typechecking should ensure this is the case
-        Assert(toAtomic != NULL);
+        AssertPos(pos, toAtomic != NULL);
 
         return lTypeConvAtomic(ctx, exprVal, toAtomic, fromAtomic, pos);
     }
@@ -6554,14 +6710,14 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const {
 
 const Type *
 TypeCastExpr::GetType() const { 
-    Assert(type->HasUnboundVariability() == false);
+    AssertPos(pos, type->HasUnboundVariability() == false);
     return type; 
 }
 
 
 static const Type *
 lDeconstifyType(const Type *t) {
-    const PointerType *pt = dynamic_cast<const PointerType *>(t);
+    const PointerType *pt = CastType<PointerType>(t);
     if (pt != NULL)
         return new PointerType(lDeconstifyType(pt->GetBaseType()), 
                                pt->GetVariability(), false);
@@ -6589,23 +6745,28 @@ TypeCastExpr::TypeCheck() {
     fromType = lDeconstifyType(fromType);
     toType = lDeconstifyType(toType);
 
-    if (fromType->IsVaryingType() && toType->IsUniformType()) {
+    // Anything can be cast to void...
+    if (Type::Equal(toType, AtomicType::Void))
+        return this;
+
+    if (Type::Equal(fromType, AtomicType::Void) ||
+        (fromType->IsVaryingType() && toType->IsUniformType())) {
         Error(pos, "Can't type cast from type \"%s\" to type \"%s\"",
               fromType->GetString().c_str(), toType->GetString().c_str());
         return NULL;
     }
 
     // First some special cases that we allow only with an explicit type cast
-    const PointerType *fromPtr = dynamic_cast<const PointerType *>(fromType);
-    const PointerType *toPtr = dynamic_cast<const PointerType *>(toType);
+    const PointerType *fromPtr = CastType<PointerType>(fromType);
+    const PointerType *toPtr = CastType<PointerType>(toType);
     if (fromPtr != NULL && toPtr != NULL)
         // allow explicit typecasts between any two different pointer types
         return this;
 
-    const AtomicType *fromAtomic = dynamic_cast<const AtomicType *>(fromType);
-    const AtomicType *toAtomic = dynamic_cast<const AtomicType *>(toType);
-    const EnumType *fromEnum = dynamic_cast<const EnumType *>(fromType);
-    const EnumType *toEnum = dynamic_cast<const EnumType *>(toType);
+    const AtomicType *fromAtomic = CastType<AtomicType>(fromType);
+    const AtomicType *toAtomic = CastType<AtomicType>(toType);
+    const EnumType *fromEnum = CastType<EnumType>(fromType);
+    const EnumType *toEnum = CastType<EnumType>(toType);
     if ((fromAtomic || fromEnum) && (toAtomic || toEnum))
         // Allow explicit casts between all of these
         return this;
@@ -6646,8 +6807,8 @@ TypeCastExpr::Optimize() {
         return this;
 
     const Type *toType = GetType();
-    const AtomicType *toAtomic = dynamic_cast<const AtomicType *>(toType);
-    const EnumType *toEnum = dynamic_cast<const EnumType *>(toType);
+    const AtomicType *toAtomic = CastType<AtomicType>(toType);
+    const EnumType *toEnum = CastType<EnumType>(toType);
     // If we're not casting to an atomic or enum type, we can't do anything
     // here, since ConstExprs can only represent those two types.  (So
     // e.g. we're casting from an int to an int<4>.)
@@ -6749,6 +6910,34 @@ TypeCastExpr::GetBaseSymbol() const {
 }
 
 
+static
+llvm::Constant *
+lConvertPointerConstant(llvm::Constant *c, const Type *constType) {
+    if (c == NULL || constType->IsUniformType())
+        return c;
+
+    // Handle conversion to int and then to vector of int or array of int
+    // (for varying and soa types, respectively)
+    llvm::Constant *intPtr = 
+        llvm::ConstantExpr::getPtrToInt(c, LLVMTypes::PointerIntType);
+    Assert(constType->IsVaryingType() || constType->IsSOAType());
+    int count = constType->IsVaryingType() ? g->target.vectorWidth :
+        constType->GetSOAWidth();
+
+    std::vector<llvm::Constant *> smear;
+    for (int i = 0; i < count; ++i)
+        smear.push_back(intPtr);
+
+    if (constType->IsVaryingType())
+        return llvm::ConstantVector::get(smear);
+    else {
+        llvm::ArrayType *at =
+            llvm::ArrayType::get(LLVMTypes::PointerIntType, count);
+        return llvm::ConstantArray::get(at, smear);
+    }
+}
+
+
 llvm::Constant *
 TypeCastExpr::GetConstant(const Type *constType) const {
     // We don't need to worry about most the basic cases where the type
@@ -6756,11 +6945,18 @@ TypeCastExpr::GetConstant(const Type *constType) const {
     // TypeCastExpr::Optimize() method generally ends up doing the type
     // conversion and returning a ConstExpr, which in turn will have its
     // GetConstant() method called.  However, because ConstExpr currently
-    // can't represent pointer values, we have to handle two cases here:
-    // 1. Null pointers (NULL, 0) valued initializers, and
-    // 2. Converting a uniform function pointer to a varying function
-    //    pointer of the same type.
-    return expr->GetConstant(constType);
+    // can't represent pointer values, we have to handle a few cases
+    // related to pointers here:
+    //
+    // 1. Null pointer (NULL, 0) valued initializers
+    // 2. Converting function types to pointer-to-function types
+    // 3. And converting these from uniform to the varying/soa equivalents.
+    //
+    if (CastType<PointerType>(constType) == NULL)
+        return NULL;
+
+    llvm::Constant *c = expr->GetConstant(constType->GetAsUniformType());
+    return lConvertPointerConstant(c, constType);
 }
 
 
@@ -6776,7 +6972,34 @@ ReferenceExpr::ReferenceExpr(Expr *e, SourcePos p)
 llvm::Value *
 ReferenceExpr::GetValue(FunctionEmitContext *ctx) const {
     ctx->SetDebugPos(pos);
-    return expr ? expr->GetLValue(ctx) : NULL;
+    if (expr == NULL) {
+        AssertPos(pos, m->errorCount > 0);
+        return NULL;
+    }
+    
+    llvm::Value *value = expr->GetLValue(ctx);
+    if (value != NULL)
+        return value;
+
+    // value is NULL if the expression is a temporary; in this case, we'll
+    // allocate storage for it so that we can return the pointer to that...
+    const Type *type;
+    llvm::Type *llvmType;
+    if ((type = expr->GetType()) == NULL ||
+        (llvmType = type->LLVMType(g->ctx)) == NULL) {
+        AssertPos(pos, m->errorCount > 0);
+        return NULL;
+    }
+
+    value = expr->GetValue(ctx);
+    if (value == NULL) {
+        AssertPos(pos, m->errorCount > 0);
+        return NULL;
+    }
+
+    llvm::Value *ptr = ctx->AllocaInst(llvmType);
+    ctx->StoreInst(value, ptr);
+    return ptr;
 }
 
 
@@ -6874,7 +7097,7 @@ DerefExpr::GetValue(FunctionEmitContext *ctx) const {
         ctx->GetFullMask();
 
     ctx->SetDebugPos(pos);
-    return ctx->LoadInst(ptr, mask, type, "deref_load");
+    return ctx->LoadInst(ptr, mask, type);
 }
 
 
@@ -6920,10 +7143,10 @@ const Type *
 PtrDerefExpr::GetType() const {
     const Type *type;
     if (expr == NULL || (type = expr->GetType()) == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
-    Assert(dynamic_cast<const PointerType *>(type) != NULL);
+    AssertPos(pos, CastType<PointerType>(type) != NULL);
 
     if (type->IsUniformType())
         return type->GetBaseType();
@@ -6936,11 +7159,11 @@ Expr *
 PtrDerefExpr::TypeCheck() {
     const Type *type;
     if (expr == NULL || (type = expr->GetType()) == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
-    if (dynamic_cast<const PointerType *>(type) == NULL) {
+    if (CastType<PointerType>(type) == NULL) {
         Error(pos, "Illegal to dereference non-pointer type \"%s\".", 
               type->GetString().c_str());
         return NULL;
@@ -6954,7 +7177,7 @@ int
 PtrDerefExpr::EstimateCost() const {
     const Type *type;
     if (expr == NULL || (type = expr->GetType()) == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return 0;
     }
 
@@ -6991,11 +7214,11 @@ const Type *
 RefDerefExpr::GetType() const {
     const Type *type;
     if (expr == NULL || (type = expr->GetType()) == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
         
-    Assert(dynamic_cast<const ReferenceType *>(type) != NULL);
+    AssertPos(pos, CastType<ReferenceType>(type) != NULL);
     return type->GetReferenceTarget();
 }
 
@@ -7004,14 +7227,14 @@ Expr *
 RefDerefExpr::TypeCheck() {
     const Type *type;
     if (expr == NULL || (type = expr->GetType()) == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
     // We only create RefDerefExprs internally for references in
     // expressions, so we should never create one with a non-reference
     // expression...
-    Assert(dynamic_cast<const ReferenceType *>(type) != NULL);
+    AssertPos(pos, CastType<ReferenceType>(type) != NULL);
 
     return this;
 }
@@ -7053,7 +7276,8 @@ AddressOfExpr::GetValue(FunctionEmitContext *ctx) const {
         return NULL;
 
     const Type *exprType = expr->GetType();
-    if (dynamic_cast<const ReferenceType *>(exprType) != NULL)
+    if (CastType<ReferenceType>(exprType) != NULL ||
+        CastType<FunctionType>(exprType) != NULL)
         return expr->GetValue(ctx);
     else
         return expr->GetLValue(ctx);
@@ -7066,10 +7290,20 @@ AddressOfExpr::GetType() const {
         return NULL;
 
     const Type *exprType = expr->GetType();
-    if (dynamic_cast<const ReferenceType *>(exprType) != NULL)
+    if (CastType<ReferenceType>(exprType) != NULL)
         return PointerType::GetUniform(exprType->GetReferenceTarget());
-    else
-        return expr->GetLValueType();
+
+    const Type *t = expr->GetLValueType();
+    if (t != NULL)
+        return t;
+    else {
+        t = expr->GetType();
+        if (t == NULL) {
+            AssertPos(pos, m->errorCount > 0);
+            return NULL;
+        }
+        return PointerType::GetUniform(t);
+    }
 }
 
 
@@ -7093,7 +7327,22 @@ AddressOfExpr::Print() const {
 
 Expr *
 AddressOfExpr::TypeCheck() {
-    return this;
+    const Type *exprType;
+    if (expr == NULL || (exprType = expr->GetType()) == NULL) {
+        AssertPos(pos, m->errorCount > 0);
+        return NULL;
+    }
+
+    if (CastType<ReferenceType>(exprType) != NULL ||
+        CastType<FunctionType>(exprType) != NULL) {
+        return this;
+    }
+
+    if (expr->GetLValueType() != NULL)
+        return this;
+
+    Error(expr->pos, "Illegal to take address of non-lvalue or function.");
+    return NULL;
 }
 
 
@@ -7109,6 +7358,28 @@ AddressOfExpr::EstimateCost() const {
 }
 
 
+llvm::Constant *
+AddressOfExpr::GetConstant(const Type *type) const {
+    const Type *exprType;
+    if (expr == NULL || (exprType = expr->GetType()) == NULL) {
+        AssertPos(pos, m->errorCount > 0);
+        return NULL;
+    }
+
+    const PointerType *pt = CastType<PointerType>(type);
+    if (pt == NULL)
+        return NULL;
+
+    const FunctionType *ft = CastType<FunctionType>(pt->GetBaseType());
+    if (ft != NULL) {
+        llvm::Constant *c = expr->GetConstant(ft);
+        return lConvertPointerConstant(c, type);
+    }
+    else
+        return NULL;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // SizeOfExpr
 
@@ -7119,8 +7390,7 @@ SizeOfExpr::SizeOfExpr(Expr *e, SourcePos p)
 
 SizeOfExpr::SizeOfExpr(const Type *t, SourcePos p)
     : Expr(p), expr(NULL), type(t) {
-    if (type->HasUnboundVariability())
-        type = type->ResolveUnboundVariability(Variability::Varying);
+    type = type->ResolveUnboundVariability(Variability::Varying);
 }
 
 
@@ -7131,7 +7401,7 @@ SizeOfExpr::GetValue(FunctionEmitContext *ctx) const {
     if (t == NULL)
         return NULL;
 
-    LLVM_TYPE_CONST llvm::Type *llvmType = t->LLVMType(g->ctx);
+    llvm::Type *llvmType = t->LLVMType(g->ctx);
     if (llvmType == NULL)
         return NULL;
 
@@ -7161,6 +7431,14 @@ SizeOfExpr::Print() const {
 
 Expr *
 SizeOfExpr::TypeCheck() {
+    // Can't compute the size of a struct without a definition
+    if (type != NULL &&
+        CastType<UndefinedStructType>(type) != NULL) {
+        Error(pos, "Can't compute the size of declared but not defined "
+              "struct type \"%s\".", type->GetString().c_str());
+        return NULL;
+    }
+
     return this;
 }
 
@@ -7191,7 +7469,9 @@ SymbolExpr::GetValue(FunctionEmitContext *ctx) const {
     if (!symbol || !symbol->storagePtr)
         return NULL;
     ctx->SetDebugPos(pos);
-    return ctx->LoadInst(symbol->storagePtr, symbol->name.c_str());
+
+    std::string loadName = symbol->name + std::string("_load");
+    return ctx->LoadInst(symbol->storagePtr, loadName.c_str());
 }
 
 
@@ -7209,7 +7489,10 @@ SymbolExpr::GetLValueType() const {
     if (symbol == NULL)
         return NULL;
 
-    return PointerType::GetUniform(symbol->type);
+    if (CastType<ReferenceType>(symbol->type) != NULL)
+        return PointerType::GetUniform(symbol->type->GetReferenceTarget());
+    else
+        return PointerType::GetUniform(symbol->type);
 }
 
 
@@ -7236,7 +7519,7 @@ SymbolExpr::Optimize() {
     if (symbol == NULL)
         return NULL;
     else if (symbol->constValue != NULL) {
-        Assert(GetType()->IsConstType());
+        AssertPos(pos, GetType()->IsConstType());
         return new ConstExpr(symbol->constValue, pos);
     }
     else
@@ -7285,8 +7568,7 @@ FunctionSymbolExpr::GetType() const {
         return NULL;
     }
 
-    return matchingFunc ? 
-        new PointerType(matchingFunc->type, Variability::Uniform, true) : NULL;
+    return matchingFunc ? matchingFunc->type : NULL;
 }
 
 
@@ -7336,27 +7618,18 @@ FunctionSymbolExpr::GetConstant(const Type *type) const {
     if (matchingFunc == NULL || matchingFunc->function == NULL)
         return NULL;
 
-    const FunctionType *ft;
-    if (dynamic_cast<const PointerType *>(type) == NULL ||
-        (ft = dynamic_cast<const FunctionType *>(type->GetBaseType())) == NULL)
+    const FunctionType *ft = CastType<FunctionType>(type);
+    if (ft == NULL)
         return NULL;
 
-    LLVM_TYPE_CONST llvm::Type *llvmUnifType = 
-        type->GetAsUniformType()->LLVMType(g->ctx);
-    if (llvmUnifType != matchingFunc->function->getType())
+    if (Type::Equal(type, matchingFunc->type) == false) {
+        Error(pos, "Type of function symbol \"%s\" doesn't match expected type "
+              "\"%s\".", matchingFunc->type->GetString().c_str(),
+              type->GetString().c_str());
         return NULL;
-
-    if (type->IsUniformType())
-        return matchingFunc->function;
-    else {
-        llvm::Constant *intPtr = 
-            llvm::ConstantExpr::getPtrToInt(matchingFunc->function, 
-                                            LLVMTypes::PointerIntType);
-        std::vector<llvm::Constant *> smear;
-        for (int i = 0; i < g->target.vectorWidth; ++i)
-            smear.push_back(intPtr);
-        return llvm::ConstantVector::get(smear);
     }
+
+    return matchingFunc->function;
 }
 
 
@@ -7364,8 +7637,11 @@ static void
 lPrintOverloadCandidates(SourcePos pos, const std::vector<Symbol *> &funcs, 
                          const std::vector<const Type *> &argTypes, 
                          const std::vector<bool> *argCouldBeNULL) {
-    for (unsigned int i = 0; i < funcs.size(); ++i)
-        Error(funcs[i]->pos, "Candidate function:");
+    for (unsigned int i = 0; i < funcs.size(); ++i) {
+        const FunctionType *ft = CastType<FunctionType>(funcs[i]->type);
+        AssertPos(pos, ft != NULL);
+        Error(funcs[i]->pos, "Candidate function: %s.", ft->GetString().c_str());
+    }
 
     std::string passedTypes = "Passed types: (";
     for (unsigned int i = 0; i < argTypes.size(); ++i) {
@@ -7378,283 +7654,221 @@ lPrintOverloadCandidates(SourcePos pos, const std::vector<Symbol *> &funcs,
     Error(pos, "%s", passedTypes.c_str());
 }
 
-             
-/** Helper function used for function overload resolution: returns zero
-    cost if the call argument's type exactly matches the function argument
-    type (modulo a conversion to a const type if needed), otherwise reports
-    failure.
- */ 
-static int
-lExactMatch(const Type *callType, const Type *funcArgType) {
-    if (dynamic_cast<const ReferenceType *>(callType) == NULL)
-        callType = callType->GetAsNonConstType();
-    if (dynamic_cast<const ReferenceType *>(funcArgType) != NULL && 
-        dynamic_cast<const ReferenceType *>(callType) == NULL)
-        callType = new ReferenceType(callType);
 
-    return Type::Equal(callType, funcArgType) ? 0 : -1;
+static bool
+lIsMatchToNonConstReference(const Type *callType, const Type *funcArgType) {
+    return (CastType<ReferenceType>(funcArgType) &&
+            (funcArgType->IsConstType() == false) &&
+            Type::Equal(callType, funcArgType->GetReferenceTarget()));
 }
 
 
-/** Helper function used for function overload resolution: returns a cost
-    of 1 if the call argument type and the function argument type match,
-    modulo conversion to a reference type if needed.
- */
-static int
-lMatchIgnoringReferences(const Type *callType, const Type *funcArgType) {
-    int prev = lExactMatch(callType, funcArgType);
-    if (prev != -1)
-        return prev;
-
-    callType = callType->GetReferenceTarget();
-    if (funcArgType->IsConstType())
-        callType = callType->GetAsConstType();
-
-    return Type::Equal(callType,
-                       funcArgType->GetReferenceTarget()) ? 1 : -1;
+static bool
+lIsMatchToNonConstReferenceUnifToVarying(const Type *callType,
+                                         const Type *funcArgType) {
+    return (CastType<ReferenceType>(funcArgType) &&
+            (funcArgType->IsConstType() == false) &&
+            Type::Equal(callType->GetAsVaryingType(),
+                        funcArgType->GetReferenceTarget()));
 }
 
-/** Helper function used for function overload resolution: returns a cost
-    of 1 if converting the argument to the call type only requires a type
-    conversion that won't lose information.  Otherwise reports failure.
-*/
-static int
-lMatchWithTypeWidening(const Type *callType, const Type *funcArgType) {
-    int prev = lMatchIgnoringReferences(callType, funcArgType);
-    if (prev != -1)
-        return prev;
-
-    const AtomicType *callAt = dynamic_cast<const AtomicType *>(callType);
-    const AtomicType *funcAt = dynamic_cast<const AtomicType *>(funcArgType);
+/** Helper function used for function overload resolution: returns true if
+    converting the call argument type to the function argument type only
+    requires a conversion that won't lose information.  Otherwise returns false.
+  */
+static bool
+lIsMatchWithTypeWidening(const Type *callType, const Type *funcArgType) {
+    const AtomicType *callAt = CastType<AtomicType>(callType);
+    const AtomicType *funcAt = CastType<AtomicType>(funcArgType);
     if (callAt == NULL || funcAt == NULL)
-        return -1;
+        return false;
 
     if (callAt->IsUniformType() != funcAt->IsUniformType())
-        return -1;
+        return false;
 
     switch (callAt->basicType) {
     case AtomicType::TYPE_BOOL:
-        return 1;
+        return true;
     case AtomicType::TYPE_INT8:
     case AtomicType::TYPE_UINT8:
-        return (funcAt->basicType != AtomicType::TYPE_BOOL) ? 1 : -1;
+        return (funcAt->basicType != AtomicType::TYPE_BOOL);
     case AtomicType::TYPE_INT16:
     case AtomicType::TYPE_UINT16:
         return (funcAt->basicType != AtomicType::TYPE_BOOL &&
                 funcAt->basicType != AtomicType::TYPE_INT8 &&
-                funcAt->basicType != AtomicType::TYPE_UINT8) ? 1 : -1;
+                funcAt->basicType != AtomicType::TYPE_UINT8);
     case AtomicType::TYPE_INT32:
     case AtomicType::TYPE_UINT32:
         return (funcAt->basicType == AtomicType::TYPE_INT32 ||
                 funcAt->basicType == AtomicType::TYPE_UINT32 ||
                 funcAt->basicType == AtomicType::TYPE_INT64 ||
-                funcAt->basicType == AtomicType::TYPE_UINT64) ? 1 : -1;
+                funcAt->basicType == AtomicType::TYPE_UINT64);
     case AtomicType::TYPE_FLOAT:
-        return (funcAt->basicType == AtomicType::TYPE_DOUBLE) ? 1 : -1;
+        return (funcAt->basicType == AtomicType::TYPE_DOUBLE);
     case AtomicType::TYPE_INT64:
     case AtomicType::TYPE_UINT64:
         return (funcAt->basicType == AtomicType::TYPE_INT64 ||
-                funcAt->basicType == AtomicType::TYPE_UINT64) ? 1 : -1;
+                funcAt->basicType == AtomicType::TYPE_UINT64);
     case AtomicType::TYPE_DOUBLE:
-        return -1;
+        return false;
     default:
         FATAL("Unhandled atomic type");
-        return -1;
+        return false;
     }
 }
 
 
-/** Helper function used for function overload resolution: returns a cost
-    of 1 if the call argument type and the function argument type match if
-    we only do a uniform -> varying type conversion but otherwise have
-    exactly the same type.
+/** Helper function used for function overload resolution: returns true if
+    the call argument type and the function argument type match if we only
+    do a uniform -> varying type conversion but otherwise have exactly the
+    same type.
  */
-static int
-lMatchIgnoringUniform(const Type *callType, const Type *funcArgType) {
-    int prev = lMatchWithTypeWidening(callType, funcArgType);
-    if (prev != -1)
-        return prev;
-
-    if (dynamic_cast<const ReferenceType *>(callType) == NULL)
-        callType = callType->GetAsNonConstType();
-
+static bool
+lIsMatchWithUniformToVarying(const Type *callType, const Type *funcArgType) {
     return (callType->IsUniformType() && 
             funcArgType->IsVaryingType() &&
-            Type::Equal(callType->GetAsVaryingType(), funcArgType)) ? 1 : -1;
+            Type::EqualIgnoringConst(callType->GetAsVaryingType(), funcArgType));
 }
 
 
-/** Helper function used for function overload resolution: returns a cost
-    of 1 if we can type convert from the call argument type to the function
+/** Helper function used for function overload resolution: returns true if
+    we can type convert from the call argument type to the function
     argument type, but without doing a uniform -> varying conversion.
  */
-static int
-lMatchWithTypeConvSameVariability(const Type *callType,
-                                  const Type *funcArgType) {
-    int prev = lMatchIgnoringUniform(callType, funcArgType);
-    if (prev != -1)
-        return prev;
-
-    if (CanConvertTypes(callType, funcArgType) &&
-        (callType->IsUniformType() == funcArgType->IsUniformType()))
-        return 1;
-    else
-        return -1;
+static bool
+lIsMatchWithTypeConvSameVariability(const Type *callType,
+                                    const Type *funcArgType) {
+    return (CanConvertTypes(callType, funcArgType) &&
+            (callType->GetVariability() == funcArgType->GetVariability()));
 }
 
 
-/** Helper function used for function overload resolution: returns a cost
-    of 1 if there is any type conversion that gets us from the caller
-    argument type to the function argument type.
+/* Returns the set of function overloads that are potential matches, given
+   argCount values being passed as arguments to the function call.
  */
-static int
-lMatchWithTypeConv(const Type *callType, const Type *funcArgType) {
-    int prev = lMatchWithTypeConvSameVariability(callType, funcArgType);
-    if (prev != -1)
-        return prev;
-        
-    return CanConvertTypes(callType, funcArgType) ? 0 : -1;
-}
-
-
-/** Given a set of potential matching functions and their associated cost,
-    return the one with the lowest cost, if unique.  Otherwise, if multiple
-    functions match with the same cost, return NULL.
- */
-static Symbol *
-lGetBestMatch(std::vector<std::pair<int, Symbol *> > &matches) {
-    Assert(matches.size() > 0);
-    int minCost = matches[0].first;
-
-    for (unsigned int i = 1; i < matches.size(); ++i)
-        minCost = std::min(minCost, matches[i].first);
-
-    Symbol *match = NULL;
-    for (unsigned int i = 0; i < matches.size(); ++i) {
-        if (matches[i].first == minCost) {
-            if (match != NULL)
-                // multiple things had the same cost
-                return NULL;
-            else
-                match = matches[i].second;
-        }
-    }
-    return match;
-}
-
-
-/** See if we can find a single function from the set of overload options
-    based on the predicate function passed in.  Returns true if no more
-    tries should be made to find a match, either due to success from
-    finding a single overloaded function that matches or failure due to
-    finding multiple ambiguous matches.
- */
-bool
-FunctionSymbolExpr::tryResolve(int (*matchFunc)(const Type *, const Type *),
-                               SourcePos argPos,
-                               const std::vector<const Type *> &callTypes,
-                               const std::vector<bool> *argCouldBeNULL) {
-    const char *funName = candidateFunctions.front()->name.c_str();
-
-    std::vector<std::pair<int, Symbol *> > matches;
-    std::vector<Symbol *>::iterator iter;
-    for (iter = candidateFunctions.begin(); 
-         iter != candidateFunctions.end(); ++iter) {
-        // Loop over the set of candidate functions and try each one
-        Symbol *candidateFunction = *iter;
+std::vector<Symbol *>
+FunctionSymbolExpr::getCandidateFunctions(int argCount) const {
+    std::vector<Symbol *> ret;
+    for (int i = 0; i < (int)candidateFunctions.size(); ++i) {
         const FunctionType *ft = 
-            dynamic_cast<const FunctionType *>(candidateFunction->type);
-        Assert(ft != NULL);
+            CastType<FunctionType>(candidateFunctions[i]->type);
+        AssertPos(pos, ft != NULL);
 
         // There's no way to match if the caller is passing more arguments
         // than this function instance takes.
-        if ((int)callTypes.size() > ft->GetNumParameters())
+        if (argCount > ft->GetNumParameters())
             continue;
 
-        int i;
-        // Note that we're looping over the caller arguments, not the
-        // function arguments; it may be ok to have more arguments to the
-        // function than are passed, if the function has default argument
-        // values.  This case is handled below.
-        int cost = 0;
-        for (i = 0; i < (int)callTypes.size(); ++i) {
-            // This may happen if there's an error earlier in compilation.
-            // It's kind of a silly to redundantly discover this for each
-            // potential match versus detecting this earlier in the
-            // matching process and just giving up.
-            const Type *paramType = ft->GetParameterType(i);
+        // Not enough arguments, and no default argument value to save us
+        if (argCount < ft->GetNumParameters() &&
+            ft->GetParameterDefault(argCount) == NULL)
+            continue;
 
-            if (callTypes[i] == NULL || paramType == NULL ||
-                dynamic_cast<const FunctionType *>(callTypes[i]) != NULL)
-                return false;
-
-            int argCost = matchFunc(callTypes[i], paramType);
-            if (argCost == -1) {
-                if (argCouldBeNULL != NULL && (*argCouldBeNULL)[i] == true &&
-                    dynamic_cast<const PointerType *>(paramType) != NULL)
-                    // If the passed argument value is zero and this is a
-                    // pointer type, then it can convert to a NULL value of
-                    // that pointer type.
-                    argCost = 0;
-                else
-                    // If the predicate function returns -1, we have failed no
-                    // matter what else happens, so we stop trying
-                    break;
-            }
-            cost += argCost;
-        }
-        if (i == (int)callTypes.size()) {
-            // All of the arguments matched!
-            if (i == ft->GetNumParameters())
-                // And we have exactly as many arguments as the function
-                // wants, so we're done.
-                matches.push_back(std::make_pair(cost, candidateFunction));
-            else if (i < ft->GetNumParameters() && 
-                     ft->GetParameterDefault(i) != NULL)
-                // Otherwise we can still make it if there are default
-                // arguments for the rest of the arguments!  Because in
-                // Module::AddFunction() we have verified that once the
-                // default arguments start, then all of the following ones
-                // have them as well.  Therefore, we just need to check if
-                // the arg we stopped at has a default value and we're
-                // done.
-                matches.push_back(std::make_pair(cost, candidateFunction));
-            // otherwise, we don't have a match
-        }
+        // Success
+        ret.push_back(candidateFunctions[i]);
     }
+    return ret;
+}
 
-    if (matches.size() == 0)
+
+static bool
+lArgIsPointerType(const Type *type) {
+    if (CastType<PointerType>(type) != NULL)
+        return true;
+
+    const ReferenceType *rt = CastType<ReferenceType>(type);
+    if (rt == NULL)
         return false;
-    else if ((matchingFunc = lGetBestMatch(matches)) != NULL)
-        // We have a match!
-        return true;
-    else {
-        Error(pos, "Multiple overloaded instances of function \"%s\" matched.",
-              funName);
 
-        // select the matches that have the lowest cost
-        std::vector<Symbol *> bestMatches;
-        int minCost = matches[0].first;
-        for (unsigned int i = 1; i < matches.size(); ++i)
-            minCost = std::min(minCost, matches[i].first);
-        for (unsigned int i = 0; i < matches.size(); ++i)
-            if (matches[i].first == minCost)
-                bestMatches.push_back(matches[i].second);
+    const Type *t = rt->GetReferenceTarget();
+    return (CastType<PointerType>(t) != NULL);
+}
 
-        // And print a useful error message
-        lPrintOverloadCandidates(argPos, bestMatches, callTypes, argCouldBeNULL);
 
-        // Stop trying to find more matches after an ambigious set of
-        // matches.
-        return true;
+/** This function computes the value of a cost function that represents the
+    cost of calling a function of the given type with arguments of the
+    given types.  If it's not possible to call the function, regardless of
+    any type conversions applied, a cost of -1 is returned.
+ */
+int
+FunctionSymbolExpr::computeOverloadCost(const FunctionType *ftype,
+                                        const std::vector<const Type *> &argTypes,
+                                        const std::vector<bool> *argCouldBeNULL,
+                                        const std::vector<bool> *argIsConstant) {
+    int costSum = 0;
+
+    // In computing the cost function, we only worry about the actual
+    // argument types--using function default parameter values is free for
+    // the purposes here...
+    for (int i = 0; i < (int)argTypes.size(); ++i) {
+        // The cost imposed by this argument will be a multiple of
+        // costScale (argument count + 1, times 128 below for non-constant
+        // arguments).  NOTE(review): the intent is that no number of
+        // cheaper conversions can outweigh one costlier conversion, but the
+        // power-of-two bucket weights below don't guarantee that--confirm.
+        int costScale = argTypes.size() + 1;
+
+        const Type *fargType = ftype->GetParameterType(i);
+        const Type *callType = argTypes[i];
+
+        if (Type::Equal(callType, fargType))
+            // Perfect match: no cost
+            costSum += 0;
+        else if (argCouldBeNULL && (*argCouldBeNULL)[i] &&
+                 lArgIsPointerType(fargType))
+            // Passing NULL to a pointer-typed parameter is also a no-cost
+            // operation
+            costSum += 0;
+        else {
+            // If the argument is a compile-time constant, we'd like its
+            // conversions to cost much less than they would for a
+            // non-constant argument--so scale up the cost when the
+            // argument is not a constant.
+            if (argIsConstant == NULL || (*argIsConstant)[i] == false)
+                costScale *= 128;
+
+            // For convenience, normalize to non-const types (except for
+            // references, where const-ness matters).  For all other types,
+            // we're passing by value anyway, so const doesn't matter.
+            const Type *callTypeNC = callType, *fargTypeNC = fargType;
+            if (CastType<ReferenceType>(callType) == NULL)
+                callTypeNC = callType->GetAsNonConstType();
+            if (CastType<ReferenceType>(fargType) == NULL)
+                fargTypeNC = fargType->GetAsNonConstType();
+                
+            if (Type::Equal(callTypeNC, fargTypeNC))
+                // Exact match (after dealing with references, above)
+                costSum += 1 * costScale;
+            // note: orig fargType for the next two...
+            else if (lIsMatchToNonConstReference(callTypeNC, fargType))
+                costSum += 2 * costScale;
+            else if (lIsMatchToNonConstReferenceUnifToVarying(callTypeNC, fargType))
+                costSum += 4 * costScale;
+            else if (lIsMatchWithTypeWidening(callTypeNC, fargTypeNC))
+                costSum += 8 * costScale;
+            else if (lIsMatchWithUniformToVarying(callTypeNC, fargTypeNC))
+                costSum += 16 * costScale;
+            else if (lIsMatchWithTypeConvSameVariability(callTypeNC, fargTypeNC))
+                costSum += 32 * costScale;
+            else if (CanConvertTypes(callTypeNC, fargTypeNC))
+                costSum += 64 * costScale;
+            else
+                // Failure--no type conversion possible...
+                return -1;
+        }
     }
+
+    return costSum;
 }
 
 
 bool
 FunctionSymbolExpr::ResolveOverloads(SourcePos argPos,
                                      const std::vector<const Type *> &argTypes,
-                                     const std::vector<bool> *argCouldBeNULL) {
+                                     const std::vector<bool> *argCouldBeNULL,
+                                     const std::vector<bool> *argIsConstant) {
+    const char *funName = candidateFunctions.front()->name.c_str();
+
     triedToResolve = true;
 
     // Functions with names that start with "__" should only be various
@@ -7665,45 +7879,67 @@ FunctionSymbolExpr::ResolveOverloads(SourcePos argPos,
     // called.
     bool exactMatchOnly = (name.substr(0,2) == "__");
 
-    // Is there an exact match that doesn't require any argument type
-    // conversion (other than converting type -> reference type)?
-    if (tryResolve(lExactMatch, argPos, argTypes, argCouldBeNULL))
-        return true;
+    // First, find the subset of overload candidates that take the same
+    // number of parameters as we have arguments (including functions that
+    // take more parameters but have defaults starting no later than just
+    // after our last argument).
+    std::vector<Symbol *> actualCandidates = 
+        getCandidateFunctions(argTypes.size());
 
-    if (exactMatchOnly == false) {
-        // Try to find a single match ignoring references
-        if (tryResolve(lMatchIgnoringReferences, argPos, argTypes, 
-                       argCouldBeNULL))
-            return true;
+    int bestMatchCost = 1<<30;
+    std::vector<Symbol *> matches;
+    std::vector<int> candidateCosts;
 
-        // Try to find an exact match via type widening--i.e. int8 ->
-        // int16, etc.--things that don't lose data.
-        if (tryResolve(lMatchWithTypeWidening, argPos, argTypes, argCouldBeNULL))
-            return true;
+    if (actualCandidates.size() == 0)
+        goto failure;
 
-        // Next try to see if there's a match via just uniform -> varying
-        // promotions.
-        if (tryResolve(lMatchIgnoringUniform, argPos, argTypes, argCouldBeNULL))
-            return true;
-
-        // Try to find a match via type conversion, but don't change
-        // unif->varying
-        if (tryResolve(lMatchWithTypeConvSameVariability, argPos, argTypes,
-                       argCouldBeNULL))
-            return true;
-    
-        // Last chance: try to find a match via arbitrary type conversion.
-        if (tryResolve(lMatchWithTypeConv, argPos, argTypes, argCouldBeNULL))
-            return true;
+    // Compute the cost for calling each of the candidate functions
+    for (int i = 0; i < (int)actualCandidates.size(); ++i) {
+        const FunctionType *ft = 
+            CastType<FunctionType>(actualCandidates[i]->type);
+        AssertPos(pos, ft != NULL);
+        candidateCosts.push_back(computeOverloadCost(ft, argTypes,
+                                                     argCouldBeNULL,
+                                                     argIsConstant));
     }
 
-    // failure :-(
-    const char *funName = candidateFunctions.front()->name.c_str();
-    Error(pos, "Unable to find matching overload for call to function \"%s\"%s.",
-          funName, exactMatchOnly ? " only considering exact matches" : "");
-    lPrintOverloadCandidates(argPos, candidateFunctions, argTypes, 
-                             argCouldBeNULL);
-    return false;
+    // Find the best cost, and then the candidate or candidates that have
+    // that cost.
+    for (int i = 0; i < (int)candidateCosts.size(); ++i) {
+        if (candidateCosts[i] != -1 && candidateCosts[i] < bestMatchCost)
+            bestMatchCost = candidateCosts[i];
+    }
+    // None of the candidates matched
+    if (bestMatchCost == (1<<30))
+        goto failure;
+    for (int i = 0; i < (int)candidateCosts.size(); ++i) {
+        if (candidateCosts[i] == bestMatchCost)
+            matches.push_back(actualCandidates[i]);
+    }
+
+    if (matches.size() == 1) {
+        // Only one match: success
+        matchingFunc = matches[0];
+        return true;
+    }
+    else if (matches.size() > 1) {
+        // Multiple matches: ambiguous
+        Error(pos, "Multiple overloaded functions matched call to function "
+              "\"%s\"%s.", funName, 
+              exactMatchOnly ? " only considering exact matches" : "");
+        lPrintOverloadCandidates(argPos, matches, argTypes, argCouldBeNULL);
+        return false;
+    }
+    else {
+        // No matches at all
+ failure:
+        Error(pos, "Unable to find any matching overload for call to function "
+              "\"%s\"%s.", funName, 
+              exactMatchOnly ? " only considering exact matches" : "");
+        lPrintOverloadCandidates(argPos, candidateFunctions, argTypes, 
+                                 argCouldBeNULL);
+        return false;
+    }
 }
 
 
@@ -7784,13 +8020,13 @@ NullPointerExpr::Optimize() {
 
 llvm::Constant *
 NullPointerExpr::GetConstant(const Type *type) const {
-    const PointerType *pt = dynamic_cast<const PointerType *>(type);
+    const PointerType *pt = CastType<PointerType>(type);
     if (pt == NULL)
         return NULL;
 
-    LLVM_TYPE_CONST llvm::Type *llvmType = type->LLVMType(g->ctx);
+    llvm::Type *llvmType = type->LLVMType(g->ctx);
     if (llvmType == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
@@ -7840,7 +8076,7 @@ NewExpr::NewExpr(int typeQual, const Type *t, Expr *init, Expr *count,
         // varying new.
         isVarying = (typeQual == 0) || (typeQual & TYPEQUAL_VARYING);
 
-    if (allocType != NULL && allocType->HasUnboundVariability())
+    if (allocType != NULL)
         allocType = allocType->ResolveUnboundVariability(Variability::Uniform);
 }
 
@@ -7855,7 +8091,7 @@ NewExpr::GetValue(FunctionEmitContext *ctx) const {
     if (countExpr != NULL) {
         countValue = countExpr->GetValue(ctx);
         if (countValue == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return NULL;
         }
     }
@@ -7895,7 +8131,7 @@ NewExpr::GetValue(FunctionEmitContext *ctx) const {
                                       "alloc_size64");
         func = m->module->getFunction("__new_uniform");
     }
-    Assert(func != NULL);
+    AssertPos(pos, func != NULL);
 
     // Make the call for the the actual allocation.
     llvm::Value *ptrValue = ctx->CallInst(func, NULL, allocSize, "new");
@@ -7934,7 +8170,7 @@ NewExpr::GetValue(FunctionEmitContext *ctx) const {
                 // Initialize the memory pointed to by the pointer for the
                 // current lane.
                 ctx->SetCurrentBasicBlock(bbInit);
-                LLVM_TYPE_CONST llvm::Type *ptrType = 
+                llvm::Type *ptrType = 
                     retType->GetAsUniformType()->LLVMType(g->ctx);
                 llvm::Value *ptr = ctx->IntToPtrInst(p, ptrType);
                 InitSymbol(ptr, allocType, initExpr, ctx, pos);
@@ -7950,8 +8186,9 @@ NewExpr::GetValue(FunctionEmitContext *ctx) const {
         // For uniform news, we just need to cast the void * to be a
         // pointer of the return type and to run the code for initializers,
         // if present.
-        LLVM_TYPE_CONST llvm::Type *ptrType = retType->LLVMType(g->ctx);
-        ptrValue = ctx->BitCastInst(ptrValue, ptrType, "cast_new_ptr");
+        llvm::Type *ptrType = retType->LLVMType(g->ctx);
+        ptrValue = ctx->BitCastInst(ptrValue, ptrType, 
+                                    LLVMGetName(ptrValue, "_cast_ptr"));
 
         if (initExpr != NULL)
             InitSymbol(ptrValue, allocType, initExpr, ctx, pos);
@@ -7973,9 +8210,20 @@ NewExpr::GetType() const {
 
 Expr *
 NewExpr::TypeCheck() {
-    // Here we only need to make sure that if we have an expression giving
-    // a number of elements to allocate that it can be converted to an
-    // integer of the appropriate variability.
+    // It's illegal to call new with an undefined struct type
+    if (allocType == NULL) {
+        AssertPos(pos, m->errorCount > 0);
+        return NULL;
+    }
+    if (CastType<UndefinedStructType>(allocType) != NULL) {
+        Error(pos, "Can't dynamically allocate storage for declared "
+              "but not defined type \"%s\".", allocType->GetString().c_str());
+        return NULL;
+    }
+
+    // Otherwise we only need to make sure that if we have an expression
+    // giving a number of elements to allocate that it can be converted to
+    // an integer of the appropriate variability.
     if (countExpr == NULL)
         return this;
 
diff --git a/expr.h b/expr.h
index 5c59ae83..d65bc8c3 100644
--- a/expr.h
+++ b/expr.h
@@ -284,6 +284,10 @@ public:
     int EstimateCost() const;
 
     Expr *baseExpr, *index;
+
+private:
+    mutable const Type *type;
+    mutable const PointerType *lvalueType;
 };
 
 
@@ -320,6 +324,9 @@ public:
         member is found.  (i.e. this is true if the MemberExpr was a '->'
         operator, and is false if it was a '.' operator. */
     bool dereferenceExpr;
+
+protected:
+    mutable const Type *type, *lvalueType;
 };
 
 
@@ -584,6 +591,7 @@ public:
     Expr *TypeCheck();
     Expr *Optimize();
     int EstimateCost() const;
+    llvm::Constant *GetConstant(const Type *type) const;
 
     Expr *expr;
 };
@@ -651,20 +659,26 @@ public:
         function overloading, this method resolves which actual function
         the arguments match best.  If the argCouldBeNULL parameter is
         non-NULL, each element indicates whether the corresponding argument
-        is the number zero, indicating that it could be a NULL pointer.
-        This parameter may be NULL (for cases where overload resolution is
-        being done just given type information without the parameter
-        argument expressions being available.  It returns true on success.
+        is the number zero, indicating that it could be a NULL pointer, and
+        if argIsConstant is non-NULL, each element indicates whether the
+        corresponding argument is a compile-time constant value.  Both of
+        these parameters may be NULL (for cases where overload resolution
+        is being done just given type information, without the argument
+        expressions being available).  This function returns true
+        on success.
      */
     bool ResolveOverloads(SourcePos argPos,
                           const std::vector<const Type *> &argTypes,
-                          const std::vector<bool> *argCouldBeNULL = NULL);
+                          const std::vector<bool> *argCouldBeNULL = NULL,
+                          const std::vector<bool> *argIsConstant = NULL);
     Symbol *GetMatchingFunction();
 
 private:
-    bool tryResolve(int (*matchFunc)(const Type *, const Type *),
-                    SourcePos argPos, const std::vector<const Type *> &argTypes,
-                    const std::vector<bool> *argCouldBeNULL);
+    std::vector<Symbol *> getCandidateFunctions(int argCount) const;
+    static int computeOverloadCost(const FunctionType *ftype,
+                                   const std::vector<const Type *> &argTypes,
+                                   const std::vector<bool> *argCouldBeNULL,
+                            const std::vector<bool> *argIsConstant);
 
     /** Name of the function that is being called. */
     std::string name;
diff --git a/func.cpp b/func.cpp
index c1ca7ee6..4e4e8196 100644
--- a/func.cpp
+++ b/func.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -66,9 +66,8 @@
 #include <llvm/Support/ToolOutputFile.h>
 #include <llvm/Assembly/PrintModulePass.h>
 
-Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
+Function::Function(Symbol *s, Stmt *c) {
     sym = s;
-    args = a;
     code = c;
 
     maskSymbol = m->symbolTable->LookupVariable("__mask");
@@ -101,12 +100,20 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
         printf("\n\n\n");
     }
 
-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
+    const FunctionType *type = CastType<FunctionType>(sym->type);
     Assert(type != NULL);
 
-    for (unsigned int i = 0; i < args.size(); ++i)
-        if (dynamic_cast<const ReferenceType *>(args[i]->type) == NULL)
-            args[i]->parentFunction = this;
+    for (int i = 0; i < type->GetNumParameters(); ++i) {
+        // Hold the name by const reference so c_str() stays valid below.
+        const std::string &paramName = type->GetParameterName(i);
+        Symbol *paramSym = m->symbolTable->LookupVariable(paramName.c_str());
+        Assert(paramSym != NULL ||
+               strncmp(paramName.c_str(), "__anon_parameter_", 17) == 0);
+        args.push_back(paramSym);  // NULL marks an anonymous parameter
+        if (paramSym != NULL &&
+            CastType<ReferenceType>(type->GetParameterType(i)) == NULL)
+            paramSym->parentFunction = this;
+    }
 
     if (type->isTask) {
         threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
@@ -125,7 +132,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
 
 const Type *
 Function::GetReturnType() const {
-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
+    const FunctionType *type = CastType<FunctionType>(sym->type);
     Assert(type != NULL);
     return type->GetReturnType();
 }
@@ -133,7 +140,7 @@ Function::GetReturnType() const {
 
 const FunctionType *
 Function::GetType() const {
-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
+    const FunctionType *type = CastType<FunctionType>(sym->type);
     Assert(type != NULL);
     return type;
 }
@@ -145,7 +152,8 @@ Function::GetType() const {
     'mem2reg' pass will in turn promote to SSA registers..
  */
 static void
-lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol *> &args,
+lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const 
+                     std::vector<Symbol *> &args,
                      FunctionEmitContext *ctx) {
     // We expect the argument structure to come in as a poitner to a
     // structure.  Confirm and figure out its type here.
@@ -157,9 +165,13 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
         llvm::dyn_cast<const llvm::StructType>(pt->getElementType());
 
     // Get the type of the argument we're copying in and its Symbol pointer
-    LLVM_TYPE_CONST llvm::Type *argType = argStructType->getElementType(i);
+    llvm::Type *argType = argStructType->getElementType(i);
     Symbol *sym = args[i];
 
+    if (sym == NULL)
+        // anonymous parameter, so don't worry about it
+        return;
+
     // allocate space to copy the parameter in to
     sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());
 
@@ -170,7 +182,7 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
     // memory
     llvm::Value *ptrval = ctx->LoadInst(ptr, sym->name.c_str());
     ctx->StoreInst(ptrval, sym->storagePtr);
-    ctx->EmitFunctionParameterDebugInfo(sym);
+    ctx->EmitFunctionParameterDebugInfo(sym, i);
 }
 
 
@@ -186,14 +198,14 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
     // value
     maskSymbol->storagePtr = ctx->GetFullMaskPointer();
 
-    // add debugging info for __mask, programIndex, ...
+    // add debugging info for __mask
     maskSymbol->pos = firstStmtPos;
     ctx->EmitVariableDebugInfo(maskSymbol);
 
 #if 0
     llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
 #endif
-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
+    const FunctionType *type = CastType<FunctionType>(sym->type);
     Assert(type != NULL);
     if (type->isTask == true) {
         // For tasks, we there should always be three parmeters: the
@@ -240,13 +252,17 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
         llvm::Function::arg_iterator argIter = function->arg_begin(); 
         for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
             Symbol *sym = args[i];
+            if (sym == NULL)
+                // anonymous function parameter
+                continue;
+
             argIter->setName(sym->name.c_str());
 
             // Allocate stack storage for the parameter and emit code
             // to store the its value there.
             sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
             ctx->StoreInst(argIter, sym->storagePtr);
-            ctx->EmitFunctionParameterDebugInfo(sym);
+            ctx->EmitFunctionParameterDebugInfo(sym, i);
         }
 
         // If the number of actual function arguments is equal to the
@@ -415,11 +431,11 @@ Function::GenerateIR() {
         // If the function is 'export'-qualified, emit a second version of
         // it without a mask parameter and without name mangling so that
         // the application can call it
-        const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
+        const FunctionType *type = CastType<FunctionType>(sym->type);
         Assert(type != NULL);
         if (type->isExported) {
             if (!type->isTask) {
-                LLVM_TYPE_CONST llvm::FunctionType *ftype = 
+                llvm::FunctionType *ftype = 
                     type->LLVMFunctionType(g->ctx);
                 llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
                 std::string functionName = sym->name;
diff --git a/func.h b/func.h
index d0bf0731..6d0527fc 100644
--- a/func.h
+++ b/func.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -43,7 +43,7 @@
 
 class Function {
 public:
-    Function(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code);
+    Function(Symbol *sym, Stmt *code);
 
     const Type *GetReturnType() const;
     const FunctionType *GetType() const;
diff --git a/ispc.cpp b/ispc.cpp
index 49623be4..341206c6 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -54,14 +54,8 @@
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-  #include <llvm/Support/TargetRegistry.h>
-  #include <llvm/Support/TargetSelect.h>
-#else
-  #include <llvm/Target/TargetRegistry.h>
-  #include <llvm/Target/TargetSelect.h>
-  #include <llvm/Target/SubtargetFeature.h>
-#endif
+#include <llvm/Support/TargetRegistry.h>
+#include <llvm/Support/TargetSelect.h>
 #include <llvm/Support/Host.h>
 
 Globals *g;
@@ -70,9 +64,82 @@ Module *m;
 ///////////////////////////////////////////////////////////////////////////
 // Target
 
+#ifndef ISPC_IS_WINDOWS
+static void __cpuid(int info[4], int infoType) {
+    __asm__ __volatile__ ("cpuid"
+                          : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
+                          : "0" (infoType));
+}
+
+/* Save %ebx in case it's the PIC register */
+static void __cpuidex(int info[4], int level, int count) {
+  __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
+                        "cpuid\n\t"
+                        "xchg{l}\t{%%}ebx, %1\n\t"
+                        : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
+                        : "0" (level), "2" (count));
+}
+#endif // ISPC_IS_WINDOWS
+
+
+static const char *
+lGetSystemISA() {
+    int info[4];
+    __cpuid(info, 1);
+
+    if ((info[2] & (1 << 28)) != 0) {
+        // AVX1 for sure. Do we have AVX2?
+        // Call cpuid with eax=7, ecx=0
+        __cpuidex(info, 7, 0);
+        if ((info[1] & (1 << 5)) != 0)
+            return "avx2";
+        else
+            return "avx";
+    }
+    else if ((info[2] & (1 << 19)) != 0)
+        return "sse4";
+    else if ((info[3] & (1 << 26)) != 0)
+        return "sse2";
+    else {
+        fprintf(stderr, "Unable to detect supported SSE/AVX ISA.  Exiting.\n");
+        exit(1);
+    }
+}
+
+
+static const char *supportedCPUs[] = { 
+    "atom", "penryn", "core2", "corei7", "corei7-avx"
+};
+
+
 bool
 Target::GetTarget(const char *arch, const char *cpu, const char *isa,
                   bool pic, Target *t) {
+    if (isa == NULL) {
+        if (cpu != NULL) {
+            // If a CPU was specified explicitly, try to pick the best
+            // possible ISA based on that.
+            if (!strcmp(cpu, "sandybridge") ||
+                !strcmp(cpu, "corei7-avx"))
+                isa = "avx";
+            else if (!strcmp(cpu, "corei7") ||
+                     !strcmp(cpu, "penryn"))
+                isa = "sse4";
+            else
+                isa = "sse2";
+            fprintf(stderr, "Notice: no --target specified on command-line.  "
+                    "Using ISA \"%s\" based on specified CPU \"%s\".\n", isa,
+                    cpu);
+        }
+        else {
+            // No CPU and no ISA, so use CPUID to figure out what this CPU
+            // supports.
+            isa = lGetSystemISA();
+            fprintf(stderr, "Notice: no --target specified on command-line.  "
+                    "Using system ISA \"%s\".\n", isa);
+        }
+    }
+
     if (cpu == NULL) {
         std::string hostCPU = llvm::sys::getHostCPUName();
         if (hostCPU.size() > 0)
@@ -82,19 +149,24 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
             cpu = "generic";
         }
     }
+    else {
+        bool foundCPU = false;
+        for (int i = 0; i < int(sizeof(supportedCPUs) / sizeof(supportedCPUs[0])); 
+             ++i) {
+            if (!strcmp(cpu, supportedCPUs[i])) {
+                foundCPU = true;
+                break;
+            }
+        }
+        if (foundCPU == false) {
+            fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: "
+                    "%s.\n", cpu, SupportedTargetCPUs().c_str());
+            return false;
+        }
+    }
+
     t->cpu = cpu;
 
-    if (isa == NULL) {
-        if (!strcasecmp(cpu, "atom"))
-            isa = "sse2";
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-        else if (!strcasecmp(cpu, "sandybridge") ||
-                 !strcasecmp(cpu, "corei7-avx"))
-            isa = "avx";
-#endif // LLVM_3_0
-        else
-            isa = "sse4";
-    }
     if (arch == NULL)
         arch = "x86-64";
 
@@ -125,13 +197,15 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->arch = arch;
     }
 
+    // This is the case for most of them
+    t->hasHalf = t->hasTranscendentals = false;
+
     if (!strcasecmp(isa, "sse2")) {
         t->isa = Target::SSE2;
         t->nativeVectorWidth = 4;
         t->vectorWidth = 4;
         t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
         t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
     }
     else if (!strcasecmp(isa, "sse2-x2")) {
@@ -140,7 +214,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->vectorWidth = 8;
         t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
         t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
     }
     else if (!strcasecmp(isa, "sse4")) {
@@ -149,7 +222,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->vectorWidth = 4;
         t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
         t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
     }
     else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
@@ -158,7 +230,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->vectorWidth = 8;
         t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
         t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
     }
     else if (!strcasecmp(isa, "generic-4")) {
@@ -166,41 +237,59 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->nativeVectorWidth = 4;
         t->vectorWidth = 4;
         t->maskingIsFree = true;
-        t->allOffMaskIsSafe = true;
         t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
     }
     else if (!strcasecmp(isa, "generic-8")) {
         t->isa = Target::GENERIC;
         t->nativeVectorWidth = 8;
         t->vectorWidth = 8;
         t->maskingIsFree = true;
-        t->allOffMaskIsSafe = true;
         t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
     }
     else if (!strcasecmp(isa, "generic-16")) {
         t->isa = Target::GENERIC;
         t->nativeVectorWidth = 16;
         t->vectorWidth = 16;
         t->maskingIsFree = true;
-        t->allOffMaskIsSafe = true;
         t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
+    }
+    else if (!strcasecmp(isa, "generic-32")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 32;
+        t->vectorWidth = 32;
+        t->maskingIsFree = true;
+        t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
+    }
+    else if (!strcasecmp(isa, "generic-64")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 64;
+        t->vectorWidth = 64;
+        t->maskingIsFree = true;
+        t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
     }
     else if (!strcasecmp(isa, "generic-1")) {
         t->isa = Target::GENERIC;
         t->nativeVectorWidth = 1;
         t->vectorWidth = 1;
         t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
     }
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     else if (!strcasecmp(isa, "avx")) {
         t->isa = Target::AVX;
         t->nativeVectorWidth = 8;
         t->vectorWidth = 8;
         t->attributes = "+avx,+popcnt,+cmov";
         t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
     }
     else if (!strcasecmp(isa, "avx-x2")) {
@@ -209,19 +298,17 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->vectorWidth = 16;
         t->attributes = "+avx,+popcnt,+cmov";
         t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
     }
-#endif // LLVM 3.0+
-#if defined(LLVM_3_1svn)
+#ifndef LLVM_3_0
     else if (!strcasecmp(isa, "avx2")) {
         t->isa = Target::AVX2;
         t->nativeVectorWidth = 8;
         t->vectorWidth = 8;
         t->attributes = "+avx2,+popcnt,+cmov,+f16c";
         t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
+        t->hasHalf = true;
     }
     else if (!strcasecmp(isa, "avx2-x2")) {
         t->isa = Target::AVX2;
@@ -229,10 +316,10 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->vectorWidth = 16;
         t->attributes = "+avx2,+popcnt,+cmov,+f16c";
         t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
+        t->hasHalf = true;
     }
-#endif // LLVM 3.1
+#endif // !LLVM_3_0
     else {
         fprintf(stderr, "Target ISA \"%s\" is unknown.  Choices are: %s\n", 
                 isa, SupportedTargetISAs());
@@ -243,23 +330,23 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         llvm::TargetMachine *targetMachine = t->GetTargetMachine();
         const llvm::TargetData *targetData = targetMachine->getTargetData();
         t->is32Bit = (targetData->getPointerSize() == 4);
+        Assert(t->vectorWidth <= ISPC_MAX_NVEC);
     }
 
     return !error;
 }
 
 
-const char *
+std::string
 Target::SupportedTargetCPUs() {
-    return "atom, barcelona, core2, corei7, "
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-        "corei7-avx, "
-#endif
-        "istanbul, nocona, penryn, "
-#ifdef LLVM_2_9
-        "sandybridge, "
-#endif
-        "westmere";
+    std::string ret;
+    int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]);
+    for (int i = 0; i < count; ++i) {
+        ret += supportedCPUs[i];
+        if (i != count - 1)
+            ret += ", ";
+    }
+    return ret;
 }
 
 
@@ -271,14 +358,11 @@ Target::SupportedTargetArchs() {
 
 const char *
 Target::SupportedTargetISAs() {
-    return "sse2, sse2-x2, sse4, sse4-x2"
-#ifndef LLVM_2_9
-        ", avx, avx-x2"
-#endif // !LLVM_2_9
-#ifdef LLVM_3_1svn
+    return "sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2"
+#ifndef LLVM_3_0
         ", avx2, avx2-x2"
-#endif // LLVM_3_1svn
-        ", generic-4, generic-8, generic-16, generic-1";
+#endif // !LLVM_3_0
+        ", generic-1, generic-4, generic-8, generic-16, generic-32, generic-64";
 }
 
 
@@ -286,10 +370,10 @@ std::string
 Target::GetTripleString() const {
     llvm::Triple triple;
     // Start with the host triple as the default
-#if defined(LLVM_3_1) || defined(LLVM_3_1svn)
-    triple.setTriple(llvm::sys::getDefaultTargetTriple());
-#else
+#ifdef LLVM_3_0
     triple.setTriple(llvm::sys::getHostTriple());
+#else
+    triple.setTriple(llvm::sys::getDefaultTargetTriple());
 #endif
 
     // And override the arch in the host triple based on what the user
@@ -315,30 +399,17 @@ Target::GetTargetMachine() const {
 
     llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ : 
                                                   llvm::Reloc::Default;
-#if defined(LLVM_3_1svn)
-    std::string featuresString = attributes;
-    llvm::TargetOptions options;
-    if (g->opt.fastMath == true)
-        options.UnsafeFPMath = 1;
-    llvm::TargetMachine *targetMachine = 
-        target->createTargetMachine(triple, cpu, featuresString, options,
-                                    relocModel);
-#elif defined(LLVM_3_0)
+#ifdef LLVM_3_0
     std::string featuresString = attributes;
     llvm::TargetMachine *targetMachine = 
         target->createTargetMachine(triple, cpu, featuresString, relocModel);
-#else // LLVM 2.9
-#ifdef ISPC_IS_APPLE
-    relocModel = llvm::Reloc::PIC_;
-#endif // ISPC_IS_APPLE
-    std::string featuresString = cpu + std::string(",") + attributes;
+#else
+    std::string featuresString = attributes;
+    llvm::TargetOptions options;
     llvm::TargetMachine *targetMachine = 
-        target->createTargetMachine(triple, featuresString);
-#ifndef ISPC_IS_WINDOWS
-    targetMachine->setRelocationModel(relocModel);
-#endif // !ISPC_IS_WINDOWS
-#endif // LLVM_2_9
-
+        target->createTargetMachine(triple, cpu, featuresString, options,
+                                    relocModel);
+#endif // !LLVM_3_0
     Assert(targetMachine != NULL);
 
     targetMachine->setAsmVerbosityDefault(true);
@@ -367,7 +438,7 @@ Target::GetISAString() const {
 
 
 static bool
-lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
+lGenericTypeLayoutIndeterminate(llvm::Type *type) {
     if (type->isPrimitiveType() || type->isIntegerTy())
         return false;
 
@@ -376,18 +447,18 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
         type == LLVMTypes::Int1VectorType)
         return true;
 
-    LLVM_TYPE_CONST llvm::ArrayType *at = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
+    llvm::ArrayType *at = 
+        llvm::dyn_cast<llvm::ArrayType>(type);
     if (at != NULL)
         return lGenericTypeLayoutIndeterminate(at->getElementType());
 
-    LLVM_TYPE_CONST llvm::PointerType *pt = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
+    llvm::PointerType *pt = 
+        llvm::dyn_cast<llvm::PointerType>(type);
     if (pt != NULL)
         return false;
 
-    LLVM_TYPE_CONST llvm::StructType *st =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
+    llvm::StructType *st =
+        llvm::dyn_cast<llvm::StructType>(type);
     if (st != NULL) {
         for (int i = 0; i < (int)st->getNumElements(); ++i)
             if (lGenericTypeLayoutIndeterminate(st->getElementType(i)))
@@ -395,29 +466,24 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
         return false;
     }
 
-    Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));
+    Assert(llvm::isa<llvm::VectorType>(type));
     return true;
 }
 
 
 llvm::Value *
-Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type, 
+Target::SizeOf(llvm::Type *type, 
                llvm::BasicBlock *insertAtEnd) {
     if (isa == Target::GENERIC &&
         lGenericTypeLayoutIndeterminate(type)) {
         llvm::Value *index[1] = { LLVMInt32(1) };
-        LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
+        llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
         llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
         llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
         llvm::Instruction *gep = 
             llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "sizeof_gep",
                                             insertAtEnd);
-#else
-        llvm::Instruction *gep =
-            llvm::GetElementPtrInst::Create(voidPtr, &index[0], &index[1],
-                                            "sizeof_gep", insertAtEnd);
-#endif
+
         if (is32Bit || g->opt.force32BitAddressing)
             return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type, 
                                           "sizeof_int", insertAtEnd);
@@ -428,7 +494,9 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
 
     const llvm::TargetData *td = GetTargetMachine()->getTargetData();
     Assert(td != NULL);
-    uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
+    uint64_t bitSize = td->getTypeSizeInBits(type);
+    Assert((bitSize % 8) == 0);
+    uint64_t byteSize = bitSize / 8;
     if (is32Bit || g->opt.force32BitAddressing)
         return LLVMInt32((int32_t)byteSize);
     else
@@ -437,23 +505,18 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
 
 
 llvm::Value *
-Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
+Target::StructOffset(llvm::Type *type, int element,
                      llvm::BasicBlock *insertAtEnd) {
     if (isa == Target::GENERIC && 
         lGenericTypeLayoutIndeterminate(type) == true) {
         llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(element) };
-        LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
+        llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
         llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
         llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
         llvm::Instruction *gep = 
             llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "offset_gep",
                                             insertAtEnd);
-#else
-        llvm::Instruction *gep =
-            llvm::GetElementPtrInst::Create(voidPtr, &indices[0], &indices[2],
-                                            "offset_gep", insertAtEnd);
-#endif
+
         if (is32Bit || g->opt.force32BitAddressing)
             return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type, 
                                           "offset_int", insertAtEnd);
@@ -464,9 +527,12 @@ Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
 
     const llvm::TargetData *td = GetTargetMachine()->getTargetData();
     Assert(td != NULL);
-    LLVM_TYPE_CONST llvm::StructType *structType = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
-    Assert(structType != NULL);
+    llvm::StructType *structType = 
+        llvm::dyn_cast<llvm::StructType>(type);
+    if (structType == NULL || structType->isSized() == false) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     const llvm::StructLayout *sl = td->getStructLayout(structType);
     Assert(sl != NULL);
 
@@ -552,7 +618,9 @@ llvm::DIFile
 SourcePos::GetDIFile() const {
     std::string directory, filename;
     GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
-    return m->diBuilder->createFile(filename, directory);
+    llvm::DIFile ret = m->diBuilder->createFile(filename, directory);
+    Assert(ret.Verify());
+    return ret;
 }
 
 
diff --git a/ispc.h b/ispc.h
index 360b7d99..4cbbce7d 100644
--- a/ispc.h
+++ b/ispc.h
@@ -38,10 +38,10 @@
 #ifndef ISPC_H
 #define ISPC_H
 
-#define ISPC_VERSION "1.2.1dev"
+#define ISPC_VERSION "1.2.3dev"
 
-#if !defined(LLVM_2_9) && !defined(LLVM_3_0) && !defined(LLVM_3_0svn) && !defined(LLVM_3_1svn)
-#error "Only LLVM 2.9, 3.0, and the 3.1 development branch are supported"
+#if !defined(LLVM_3_0) && !defined(LLVM_3_1) && !defined(LLVM_3_2)
+#error "Only LLVM 3.0, 3.1, and the 3.2 development branch are supported"
 #endif
 
 #if defined(_WIN32) || defined(_WIN64)
@@ -58,20 +58,10 @@
 #include <vector>
 #include <string>
 
-#define Assert(expr)                                            \
-    ((void)((expr) ? 0 : __Assert (#expr, __FILE__, __LINE__)))
-#define __Assert(expr, file, line)                                      \
-    ((void)fprintf(stderr, "%s:%u: Assertion failed: \"%s\"\n"          \
-                   "***\n*** Please file a bug report at "              \
-                   "https://github.com/ispc/ispc/issues\n*** (Including as much " \
-                   "information as you can about how to reproduce this error).\n" \
-                   "*** You have apparently encountered a bug in the compiler that " \
-                   "we'd like to fix!\n***\n", file, line, expr), abort(), 0)
-
 /** @def ISPC_MAX_NVEC maximum vector size of any of the compliation
     targets.
  */
-#define ISPC_MAX_NVEC 16
+#define ISPC_MAX_NVEC 64
 
 // Forward declarations of a number of widely-used LLVM types
 namespace llvm {
@@ -92,12 +82,6 @@ namespace llvm {
     class Value;
 }
 
-// llvm::Type *s are no longer const in llvm 3.0
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-#define LLVM_TYPE_CONST
-#else
-#define LLVM_TYPE_CONST const
-#endif
 
 class ArrayType;
 class AST;
@@ -116,6 +100,15 @@ class SymbolTable;
 class Type;
 struct VariableDeclaration;
 
+enum StorageClass {
+    SC_NONE,
+    SC_EXTERN,
+    SC_STATIC,
+    SC_TYPEDEF,
+    SC_EXTERN_C
+};
+
+
 /** @brief Representation of a range of positions in a source file.
 
     This class represents a range of characters in a source file
@@ -142,11 +135,25 @@ struct SourcePos {
     bool operator==(const SourcePos &p2) const;
 };
 
+
 /** Returns a SourcePos that encompasses the extent of both of the given
     extents. */
 SourcePos Union(const SourcePos &p1, const SourcePos &p2);
 
 
+
+// Assert
+
+extern void DoAssert(const char *file, int line, const char *expr);
+extern void DoAssertPos(SourcePos pos, const char *file, int line, const char *expr);
+
+#define Assert(expr)                                            \
+    ((void)((expr) ? 0 : ((void)DoAssert (__FILE__, __LINE__, #expr), 0)))
+
+#define AssertPos(pos, expr)                                     \
+    ((void)((expr) ? 0 : ((void)DoAssertPos (pos, __FILE__, __LINE__, #expr), 0)))
+
+
 /** @brief Structure that defines a compilation target 
 
     This structure defines a compilation target for the ispc compiler.
@@ -164,7 +171,7 @@ struct Target {
 
     /** Returns a comma-delimited string giving the names of the currently
         supported target CPUs. */
-    static const char *SupportedTargetCPUs();
+    static std::string SupportedTargetCPUs();
 
     /** Returns a comma-delimited string giving the names of the currently
         supported target architectures. */
@@ -182,13 +189,13 @@ struct Target {
     const char *GetISAString() const;
 
     /** Returns the size of the given type */
-    llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *SizeOf(llvm::Type *type,
                         llvm::BasicBlock *insertAtEnd);
 
     /** Given a structure type and an element number in the structure,
         returns a value corresponding to the number of bytes from the start
         of the structure where the element is located. */
-    llvm::Value *StructOffset(LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *StructOffset(llvm::Type *type,
                               int element, llvm::BasicBlock *insertAtEnd);
 
     /** llvm Target object representing this target. */
@@ -236,16 +243,18 @@ struct Target {
         natively. */
     bool maskingIsFree;
 
-    /** Is it safe to run code with the mask all if: e.g. on SSE, the fast
-        gather trick assumes that at least one program instance is running
-        (so that it can safely assume that the array base pointer is
-        valid). */
-    bool allOffMaskIsSafe;
-
     /** How many bits are used to store each element of the mask: e.g. this
         is 32 on SSE/AVX, since that matches the HW better, but it's 1 for
         the generic target. */
     int maskBitCount;
+
+    /** Indicates whether the target has native support for float/half
+        conversions. */
+    bool hasHalf;
+
+    /** Indicates whether the target has support for transcendentals (beyond
+        sqrt, which we assume that all of them handle). */
+    bool hasTranscendentals;
 };
 
 
diff --git a/ispc.vcxproj b/ispc.vcxproj
index 6971ce9a..6478df4e 100755
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -29,6 +29,8 @@
     <ClCompile Include="gen-bitcode-generic-4.cpp" />
     <ClCompile Include="gen-bitcode-generic-8.cpp" />
     <ClCompile Include="gen-bitcode-generic-16.cpp" />
+    <ClCompile Include="gen-bitcode-generic-32.cpp" />
+    <ClCompile Include="gen-bitcode-generic-64.cpp" />
     <ClCompile Include="gen-bitcode-sse2.cpp" />
     <ClCompile Include="gen-bitcode-sse2-x2.cpp" />
     <ClCompile Include="gen-bitcode-sse4.cpp" />
@@ -264,6 +266,32 @@
       <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-16.cpp</Message>
     </CustomBuild>
   </ItemGroup>
+  <ItemGroup> <!-- Custom build step: pipes builtins\target-generic-32.ll through m4 and bitcode2cpp.py, writing gen-bitcode-generic-32.cpp (same command for Debug and Release) -->
+    <CustomBuild Include="builtins\target-generic-32.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll &gt; gen-bitcode-generic-32.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-32.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll &gt; gen-bitcode-generic-32.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-32.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-32.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-32.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup> <!-- Custom build step: pipes builtins\target-generic-64.ll through m4 and bitcode2cpp.py, writing gen-bitcode-generic-64.cpp (same command for Debug and Release) -->
+    <CustomBuild Include="builtins\target-generic-64.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll &gt; gen-bitcode-generic-64.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-64.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll &gt; gen-bitcode-generic-64.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-64.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-64.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-64.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
   <ItemGroup>
     <CustomBuild Include="lex.ll">
       <FileType>Document</FileType>
diff --git a/lex.ll b/lex.ll
index 4130372f..026b1b48 100644
--- a/lex.ll
+++ b/lex.ll
@@ -43,6 +43,7 @@
 #include <stdint.h>
 
 static uint64_t lParseBinary(const char *ptr, SourcePos pos, char **endPtr);
+static int lParseInteger(bool dotdotdot);
 static void lCComment(SourcePos *);
 static void lCppComment(SourcePos *);
 static void lHandleCppHash(SourcePos *);
@@ -322,7 +323,8 @@ inline int ispcRand() {
 %option nounistd
 
 WHITESPACE [ \t\r]+
-INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[kMG]?
+INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*
+INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\.
 FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
 HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)
 
@@ -406,53 +408,14 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA
         return TOKEN_IDENTIFIER; 
 }
 
-{INT_NUMBER}+(u|U|l|L)*? { 
+{INT_NUMBER} { 
     RT;
-    int ls = 0, us = 0;
+    return lParseInteger(false);
+}
 
-    char *endPtr = NULL;
-    if (yytext[0] == '0' && yytext[1] == 'b')
-        yylval.intVal = lParseBinary(yytext+2, yylloc, &endPtr);
-    else {
-#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
-        yylval.intVal = _strtoui64(yytext, &endPtr, 0);
-#else
-        // FIXME: should use strtouq and then issue an error if we can't
-        // fit into 64 bits...
-        yylval.intVal = strtoull(yytext, &endPtr, 0);
-#endif
-    }
-
-    bool kilo = false, mega = false, giga = false;
-    for (; *endPtr; endPtr++) {
-        if (*endPtr == 'k')
-            kilo = true;
-        else if (*endPtr == 'M')
-            mega = true;
-        else if (*endPtr == 'G')
-            giga = true;        
-        else if (*endPtr == 'l' || *endPtr == 'L')
-            ls++;
-        else if (*endPtr == 'u' || *endPtr == 'U')
-            us++;
-    }
-    if (kilo)
-        yylval.intVal *= 1024;
-    if (mega)
-        yylval.intVal *= 1024*1024;
-    if (giga)
-        yylval.intVal *= 1024*1024*1024;
-
-    if (ls >= 2)
-        return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
-    else if (ls == 1)
-        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
-
-    // See if we can fit this into a 32-bit integer...
-    if ((yylval.intVal & 0xffffffff) == yylval.intVal)
-        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
-    else
-        return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
+{INT_NUMBER_DOTDOTDOT} {
+    RT;
+    return lParseInteger(true);
 }
 
 
@@ -562,6 +525,72 @@ lParseBinary(const char *ptr, SourcePos pos, char **endPtr) {
 }
 
 
+/** Converts the integer literal in yytext to a value (stored in
+    yylval.intVal) and returns the token type for it.  If dotdotdot is
+    true, the literal is followed by "..." and the corresponding
+    *DOTDOTDOT_CONSTANT token is returned instead. */
+static int
+lParseInteger(bool dotdotdot) {
+    char *suffix = NULL;
+    const bool isBinary = (yytext[0] == '0' && yytext[1] == 'b');
+    if (isBinary)
+        yylval.intVal = lParseBinary(yytext+2, yylloc, &suffix);
+    else {
+#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
+        yylval.intVal = _strtoui64(yytext, &suffix, 0);
+#else
+        // FIXME: should use strtouq and then issue an error if we can't
+        // fit into 64 bits...
+        yylval.intVal = strtoull(yytext, &suffix, 0);
+#endif
+    }
+
+    // Walk the characters after the digits: count 'l'/'u' suffixes and
+    // note which scale suffix was given; the only other thing allowed
+    // there is the trailing "..." of the dotdotdot form.
+    int numL = 0, numU = 0;
+    bool kilo = false, mega = false, giga = false;
+    for (; *suffix; suffix++) {
+        switch (*suffix) {
+        case 'k': kilo = true; break;
+        case 'M': mega = true; break;
+        case 'G': giga = true; break;
+        case 'l': case 'L': ++numL; break;
+        case 'u': case 'U': ++numU; break;
+        default:
+            Assert(dotdotdot && *suffix == '.');
+            break;
+        }
+    }
+    if (kilo)
+        yylval.intVal *= 1024;
+    if (mega)
+        yylval.intVal *= 1024*1024;
+    if (giga)
+        yylval.intVal *= 1024*1024*1024;
+
+    // Two or more 'l's force a 64-bit constant and a single 'l' forces a
+    // 32-bit one; with no 'l' at all, use 32 bits if the value fits.
+    bool is64;
+    if (numL >= 2)
+        is64 = true;
+    else if (numL == 1)
+        is64 = false;
+    else
+        is64 = ((yylval.intVal & 0xffffffff) != yylval.intVal);
+    const bool isUnsigned = (numU != 0);
+
+    if (dotdotdot) {
+        if (is64)
+            return isUnsigned ? TOKEN_UINT64DOTDOTDOT_CONSTANT : TOKEN_INT64DOTDOTDOT_CONSTANT;
+        return isUnsigned ? TOKEN_UINT32DOTDOTDOT_CONSTANT : TOKEN_INT32DOTDOTDOT_CONSTANT;
+    }
+    if (is64)
+        return isUnsigned ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
+    return isUnsigned ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
+}
+
+
 /** Handle a C-style comment in the source. 
  */
 static void
@@ -675,7 +704,7 @@ lEscapeChar(char *str, char *pChar, SourcePos *pos)
             str = tail - 1;
             break;
         default:
-            Error(*pos, "Bad character escape sequence: '%s'\n.", str);
+            Error(*pos, "Bad character escape sequence: '%s'.", str);
             break;
         }
     }
diff --git a/llvmutil.cpp b/llvmutil.cpp
index 2ba60dbb..cc8ac5af 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -43,44 +43,44 @@
 #include <set>
 #include <map>
 
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
-LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::PointerIntType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;
+llvm::Type *LLVMTypes::VoidType = NULL;
+llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
+llvm::Type *LLVMTypes::PointerIntType = NULL;
+llvm::Type *LLVMTypes::BoolType = NULL;
 
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16Type = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32Type = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64Type = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleType = NULL;
+llvm::Type *LLVMTypes::Int8Type = NULL;
+llvm::Type *LLVMTypes::Int16Type = NULL;
+llvm::Type *LLVMTypes::Int32Type = NULL;
+llvm::Type *LLVMTypes::Int64Type = NULL;
+llvm::Type *LLVMTypes::FloatType = NULL;
+llvm::Type *LLVMTypes::DoubleType = NULL;
 
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8PointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16PointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32PointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64PointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoublePointerType = NULL;
+llvm::Type *LLVMTypes::Int8PointerType = NULL;
+llvm::Type *LLVMTypes::Int16PointerType = NULL;
+llvm::Type *LLVMTypes::Int32PointerType = NULL;
+llvm::Type *LLVMTypes::Int64PointerType = NULL;
+llvm::Type *LLVMTypes::FloatPointerType = NULL;
+llvm::Type *LLVMTypes::DoublePointerType = NULL;
 
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::MaskType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
+llvm::VectorType *LLVMTypes::MaskType = NULL;
+llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
 
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int8VectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int16VectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
+llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
+llvm::VectorType *LLVMTypes::Int8VectorType = NULL;
+llvm::VectorType *LLVMTypes::Int16VectorType = NULL;
+llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
+llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
+llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
+llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
 
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8VectorPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16VectorPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
+llvm::Type *LLVMTypes::Int8VectorPointerType = NULL;
+llvm::Type *LLVMTypes::Int16VectorPointerType = NULL;
+llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
+llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
+llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
+llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
 
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::VoidPointerVectorType = NULL;
+llvm::VectorType *LLVMTypes::VoidPointerVectorType = NULL;
 
 llvm::Constant *LLVMTrue = NULL;
 llvm::Constant *LLVMFalse = NULL;
@@ -473,9 +473,9 @@ LLVMBoolVector(const bool *bvec) {
 
 
 llvm::Constant *
-LLVMIntAsType(int64_t val, LLVM_TYPE_CONST llvm::Type *type) {
-    LLVM_TYPE_CONST llvm::VectorType *vecType =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(type);
+LLVMIntAsType(int64_t val, llvm::Type *type) {
+    llvm::VectorType *vecType =
+        llvm::dyn_cast<llvm::VectorType>(type);
 
     if (vecType != NULL) {
         llvm::Constant *v = llvm::ConstantInt::get(vecType->getElementType(),
@@ -491,9 +491,9 @@ LLVMIntAsType(int64_t val, LLVM_TYPE_CONST llvm::Type *type) {
 
 
 llvm::Constant *
-LLVMUIntAsType(uint64_t val, LLVM_TYPE_CONST llvm::Type *type) {
-    LLVM_TYPE_CONST llvm::VectorType *vecType =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(type);
+LLVMUIntAsType(uint64_t val, llvm::Type *type) {
+    llvm::VectorType *vecType =
+        llvm::dyn_cast<llvm::VectorType>(type);
 
     if (vecType != NULL) {
         llvm::Constant *v = llvm::ConstantInt::get(vecType->getElementType(),
@@ -642,8 +642,8 @@ LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
 bool
 LLVMExtractVectorInts(llvm::Value *v, int64_t ret[], int *nElts) {
     // Make sure we do in fact have a vector of integer values here
-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(v->getType());
+    llvm::VectorType *vt =
+        llvm::dyn_cast<llvm::VectorType>(v->getType());
     Assert(vt != NULL);
     Assert(llvm::isa<llvm::IntegerType>(vt->getElementType()));
 
@@ -657,7 +657,7 @@ LLVMExtractVectorInts(llvm::Value *v, int64_t ret[], int *nElts) {
 
     // Deal with the fact that LLVM3.1 and previous versions have different
     // representations for vectors of constant ints...
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
     llvm::ConstantDataVector *cv = llvm::dyn_cast<llvm::ConstantDataVector>(v);
     if (cv == NULL)
         return false;
@@ -678,7 +678,7 @@ LLVMExtractVectorInts(llvm::Value *v, int64_t ret[], int *nElts) {
          ret[i] = ci->getSExtValue();
      }
      return true;
-#endif // LLVM_3_1svn
+#endif // !LLVM_3_0
 }
 
 
@@ -696,7 +696,7 @@ lVectorValuesAllEqual(llvm::Value *v, int vectorLength,
 static bool
 lIsExactMultiple(llvm::Value *val, int baseValue, int vectorLength,
                  std::vector<llvm::PHINode *> &seenPhis) {
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(val->getType()) == false) {
+    if (llvm::isa<llvm::VectorType>(val->getType()) == false) {
         // If we've worked down to a constant int, then the moment of truth
         // has arrived...
         llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(val);
@@ -780,7 +780,7 @@ static bool
 lAllDivBaseEqual(llvm::Value *val, int64_t baseValue, int vectorLength,
                  std::vector<llvm::PHINode *> &seenPhis,
                  bool &canAdd) {
-    Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(val->getType()));
+    Assert(llvm::isa<llvm::VectorType>(val->getType()));
     // Make sure the base value is a positive power of 2
     Assert(baseValue > 0 && (baseValue & (baseValue-1)) == 0);
 
@@ -790,7 +790,7 @@ lAllDivBaseEqual(llvm::Value *val, int64_t baseValue, int vectorLength,
 
     int64_t vecVals[ISPC_MAX_NVEC];
     int nElts;
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(val->getType()) &&
+    if (llvm::isa<llvm::VectorType>(val->getType()) &&
         LLVMExtractVectorInts(val, vecVals, &nElts)) {
         // If we have a vector of compile-time constant integer values,
         // then go ahead and check them directly..
@@ -880,7 +880,7 @@ lAllDivBaseEqual(llvm::Value *val, int64_t baseValue, int vectorLength,
         // the addConstants[], mod baseValue.  If we round that up to the
         // next power of 2, we'll have a value that will be no greater than
         // baseValue and sometimes less.
-        int maxMod = addConstants[0] % baseValue;
+        int maxMod = int(addConstants[0] % baseValue);
         for (int i = 1; i < vectorLength; ++i)
             maxMod = std::max(maxMod, int(addConstants[i] % baseValue));
         int requiredAlignment = lRoundUpPow2(maxMod);
@@ -947,7 +947,7 @@ lVectorValuesAllEqual(llvm::Value *v, int vectorLength,
     if (cv != NULL)
         return (cv->getSplatValue() != NULL);
 
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
     llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v);
     if (cdv != NULL)
         return (cdv->getSplatValue() != NULL);
@@ -1074,8 +1074,8 @@ lVectorValuesAllEqual(llvm::Value *v, int vectorLength,
 */
 bool
 LLVMVectorValuesAllEqual(llvm::Value *v) {
-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(v->getType());
+    llvm::VectorType *vt =
+        llvm::dyn_cast<llvm::VectorType>(v->getType());
     Assert(vt != NULL);
     int vectorLength = vt->getNumElements();
 
@@ -1102,7 +1102,7 @@ lVectorIsLinear(llvm::Value *v, int vectorLength, int stride,
  */
 static bool
 lVectorIsLinearConstantInts(
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
                             llvm::ConstantDataVector *cv, 
 #else
                             llvm::ConstantVector *cv, 
@@ -1111,7 +1111,7 @@ lVectorIsLinearConstantInts(
                             int stride) {
     // Flatten the vector out into the elements array
     llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
     for (int i = 0; i < (int)cv->getNumElements(); ++i)
         elements.push_back(cv->getElementAsConstant(i));
 #else
@@ -1152,7 +1152,7 @@ lCheckMulForLinear(llvm::Value *op0, llvm::Value *op1, int vectorLength,
                    int stride, std::vector<llvm::PHINode *> &seenPhis) {
     // Is the first operand a constant integer value splatted across all of
     // the lanes?
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
     llvm::ConstantDataVector *cv = llvm::dyn_cast<llvm::ConstantDataVector>(op0);
 #else
     llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(op0);
@@ -1226,7 +1226,7 @@ lVectorIsLinear(llvm::Value *v, int vectorLength, int stride,
                 std::vector<llvm::PHINode *> &seenPhis) {
     // First try the easy case: if the values are all just constant
     // integers and have the expected stride between them, then we're done.
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
     llvm::ConstantDataVector *cv = llvm::dyn_cast<llvm::ConstantDataVector>(v);
 #else
     llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v);
@@ -1344,8 +1344,8 @@ lVectorIsLinear(llvm::Value *v, int vectorLength, int stride,
 */
 bool
 LLVMVectorIsLinear(llvm::Value *v, int stride) {
-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(v->getType());
+    llvm::VectorType *vt =
+        llvm::dyn_cast<llvm::VectorType>(v->getType());
     Assert(vt != NULL);
     int vectorLength = vt->getNumElements();
 
@@ -1390,19 +1390,38 @@ LLVMDumpValue(llvm::Value *v) {
 
 
 static llvm::Value *
-lExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore,
+lExtractFirstVectorElement(llvm::Value *v, 
                            std::map<llvm::PHINode *, llvm::PHINode *> &phiMap) {
-    // If it's not an instruction (i.e. is a constant), then we can just
-    // emit an extractelement instruction and let the regular optimizer do
-    // the rest.
-    if (llvm::isa<llvm::Instruction>(v) == false)
-        return llvm::ExtractElementInst::Create(v, LLVMInt32(0), "first_elt",
-                                                insertBefore);
-
-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(v->getType());
+    llvm::VectorType *vt =
+        llvm::dyn_cast<llvm::VectorType>(v->getType());
     Assert(vt != NULL);
 
+    // First, handle various constant types; do the extraction manually, as
+    // appropriate.
+    if (llvm::isa<llvm::ConstantAggregateZero>(v) == true) {
+        Assert(vt->getElementType()->isIntegerTy());
+        return llvm::ConstantInt::get(vt->getElementType(), 0);
+    }
+    if (llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v)) {
+#ifndef LLVM_3_0
+        return cv->getOperand(0);
+#else
+        llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
+        cv->getVectorElements(elements);
+        return elements[0];
+#endif // !LLVM_3_0
+    }
+#ifndef LLVM_3_0
+    if (llvm::ConstantDataVector *cdv = 
+        llvm::dyn_cast<llvm::ConstantDataVector>(v))
+        return cdv->getElementAsConstant(0);
+#endif  // !LLVM_3_0
+
+    // Otherwise, all that we should have at this point is an instruction
+    // of some sort
+    Assert(llvm::isa<llvm::Constant>(v) == false);
+    Assert(llvm::isa<llvm::Instruction>(v) == true);
+
     std::string newName = v->getName().str() + std::string(".elt0");
 
     // Rewrite regular binary operators and casts to the scalarized
@@ -1410,20 +1429,24 @@ lExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore,
     llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v);
     if (bop != NULL) {
         llvm::Value *v0 = lExtractFirstVectorElement(bop->getOperand(0),
-                                                     insertBefore, phiMap);
+                                                     phiMap);
         llvm::Value *v1 = lExtractFirstVectorElement(bop->getOperand(1),
-                                                     insertBefore, phiMap);
+                                                     phiMap);
+        // Note that the new binary operator is inserted immediately before
+        // the previous vector one
         return llvm::BinaryOperator::Create(bop->getOpcode(), v0, v1,
-                                            newName, insertBefore);
+                                            newName, bop);
     }
 
     llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(v);
     if (cast != NULL) {
         llvm::Value *v = lExtractFirstVectorElement(cast->getOperand(0),
-                                                    insertBefore, phiMap);
+                                                    phiMap);
+        // Similarly, the equivalent scalar cast instruction goes right
+        // before the vector cast
         return llvm::CastInst::Create(cast->getOpcode(), v,
                                       vt->getElementType(), newName,
-                                      insertBefore);
+                                      cast);
     }
 
     llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v);
@@ -1438,18 +1461,17 @@ lExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore,
         // return the pointer and not get stuck in an infinite loop.
         //
         // The insertion point for the new phi node also has to be the
-        // start of the bblock of the original phi node, which isn't
-        // necessarily the same bblock as insertBefore is in!
+        // start of the bblock of the original phi node.
         llvm::Instruction *phiInsertPos = phi->getParent()->begin();
         llvm::PHINode *scalarPhi = 
             llvm::PHINode::Create(vt->getElementType(), 
-                                  phi->getNumIncomingValues(), newName,
-                                  phiInsertPos);
+                                  phi->getNumIncomingValues(), 
+                                  newName, phiInsertPos);
         phiMap[phi] = scalarPhi;
 
         for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i) {
             llvm::Value *v = lExtractFirstVectorElement(phi->getIncomingValue(i),
-                                                        insertBefore, phiMap);
+                                                        phiMap);
             scalarPhi->addIncoming(v, phi->getIncomingBlock(i));
         }
 
@@ -1466,15 +1488,22 @@ lExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore,
     }
 
     // Worst case, for everything else, just do a regular extract element
-    return llvm::ExtractElementInst::Create(v, LLVMInt32(0), "first_elt",
-                                            insertBefore);
+    // instruction, which we insert immediately after the instruction we
+    // have here.
+    llvm::Instruction *insertAfter = llvm::dyn_cast<llvm::Instruction>(v);
+    Assert(insertAfter != NULL);
+    llvm::Instruction *ee = 
+        llvm::ExtractElementInst::Create(v, LLVMInt32(0), "first_elt",
+                                         (llvm::Instruction *)NULL);
+    ee->insertAfter(insertAfter);
+    return ee;
 }
 
 
 llvm::Value *
-LLVMExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore) {
+LLVMExtractFirstVectorElement(llvm::Value *v) {
     std::map<llvm::PHINode *, llvm::PHINode *> phiMap;
-    llvm::Value *ret = lExtractFirstVectorElement(v, insertBefore, phiMap);
+    llvm::Value *ret = lExtractFirstVectorElement(v, phiMap);
     return ret;
 }
 
@@ -1489,8 +1518,8 @@ LLVMConcatVectors(llvm::Value *v1, llvm::Value *v2,
                   llvm::Instruction *insertBefore) {
     Assert(v1->getType() == v2->getType());
 
-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(v1->getType());
+    llvm::VectorType *vt =
+        llvm::dyn_cast<llvm::VectorType>(v1->getType());
     Assert(vt != NULL);
 
     int32_t identity[ISPC_MAX_NVEC];
@@ -1518,12 +1547,29 @@ LLVMShuffleVectors(llvm::Value *v1, llvm::Value *v2, int32_t shuf[],
             shufVec.push_back(LLVMInt32(shuf[i]));
     }
 
-#ifndef LLVM_2_9
     llvm::ArrayRef<llvm::Constant *> aref(&shufVec[0], &shufVec[shufSize]);
     llvm::Value *vec = llvm::ConstantVector::get(aref);
-#else // LLVM_2_9
-    llvm::Value *vec = llvm::ConstantVector::get(shufVec);
-#endif
 
     return new llvm::ShuffleVectorInst(v1, v2, vec, "shuffle", insertBefore);
 }
+
+
+/** Returns a strdup()ed "<name of v><s>", or s itself when v is NULL. */
+const char *
+LLVMGetName(llvm::Value *v, const char *s) {
+    if (v == NULL) return s;
+    // Explicit .str(): don't rely on implicit StringRef -> std::string
+    return strdup((v->getName().str() + s).c_str());
+}
+
+
+/** Returns strdup()ed "<op>_<v1 name>_<v2 name>"; NULL values give "NULL". */
+const char *
+LLVMGetName(const char *op, llvm::Value *v1, llvm::Value *v2) {
+    std::string r = std::string(op) + "_";
+    r += (v1 != NULL) ? v1->getName().str() : std::string("NULL");
+    r += "_";
+    r += (v2 != NULL) ? v2->getName().str() : std::string("NULL");
+    return strdup(r.c_str());
+}
+
diff --git a/llvmutil.h b/llvmutil.h
index 96cdf079..ba8bc16d 100644
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -48,57 +48,50 @@ namespace llvm {
     class InsertElementInst;
 }
 
-// llvm::Type *s are no longer const in llvm 3.0
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-#define LLVM_TYPE_CONST
-#else
-#define LLVM_TYPE_CONST const
-#endif
-
 
 /** This structure holds pointers to a variety of LLVM types; code
     elsewhere can use them from here, ratherthan needing to make more
     verbose LLVM API calls.
  */ 
 struct LLVMTypes {
-    static LLVM_TYPE_CONST llvm::Type *VoidType;
-    static LLVM_TYPE_CONST llvm::PointerType *VoidPointerType;
-    static LLVM_TYPE_CONST llvm::Type *PointerIntType;
-    static LLVM_TYPE_CONST llvm::Type *BoolType;
+    static llvm::Type *VoidType;
+    static llvm::PointerType *VoidPointerType;
+    static llvm::Type *PointerIntType;
+    static llvm::Type *BoolType;
 
-    static LLVM_TYPE_CONST llvm::Type *Int8Type;
-    static LLVM_TYPE_CONST llvm::Type *Int16Type;
-    static LLVM_TYPE_CONST llvm::Type *Int32Type;
-    static LLVM_TYPE_CONST llvm::Type *Int64Type;
-    static LLVM_TYPE_CONST llvm::Type *FloatType;
-    static LLVM_TYPE_CONST llvm::Type *DoubleType;
+    static llvm::Type *Int8Type;
+    static llvm::Type *Int16Type;
+    static llvm::Type *Int32Type;
+    static llvm::Type *Int64Type;
+    static llvm::Type *FloatType;
+    static llvm::Type *DoubleType;
 
-    static LLVM_TYPE_CONST llvm::Type *Int8PointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int16PointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int32PointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int64PointerType;
-    static LLVM_TYPE_CONST llvm::Type *FloatPointerType;
-    static LLVM_TYPE_CONST llvm::Type *DoublePointerType;
+    static llvm::Type *Int8PointerType;
+    static llvm::Type *Int16PointerType;
+    static llvm::Type *Int32PointerType;
+    static llvm::Type *Int64PointerType;
+    static llvm::Type *FloatPointerType;
+    static llvm::Type *DoublePointerType;
 
-    static LLVM_TYPE_CONST llvm::VectorType *MaskType;
+    static llvm::VectorType *MaskType;
 
-    static LLVM_TYPE_CONST llvm::VectorType *BoolVectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *Int1VectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *Int8VectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *Int16VectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *Int32VectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *Int64VectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *FloatVectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *DoubleVectorType;
+    static llvm::VectorType *BoolVectorType;
+    static llvm::VectorType *Int1VectorType;
+    static llvm::VectorType *Int8VectorType;
+    static llvm::VectorType *Int16VectorType;
+    static llvm::VectorType *Int32VectorType;
+    static llvm::VectorType *Int64VectorType;
+    static llvm::VectorType *FloatVectorType;
+    static llvm::VectorType *DoubleVectorType;
 
-    static LLVM_TYPE_CONST llvm::Type *Int8VectorPointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int16VectorPointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int32VectorPointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int64VectorPointerType;
-    static LLVM_TYPE_CONST llvm::Type *FloatVectorPointerType;
-    static LLVM_TYPE_CONST llvm::Type *DoubleVectorPointerType;
+    static llvm::Type *Int8VectorPointerType;
+    static llvm::Type *Int16VectorPointerType;
+    static llvm::Type *Int32VectorPointerType;
+    static llvm::Type *Int64VectorPointerType;
+    static llvm::Type *FloatVectorPointerType;
+    static llvm::Type *DoubleVectorPointerType;
 
-    static LLVM_TYPE_CONST llvm::VectorType *VoidPointerVectorType;
+    static llvm::VectorType *VoidPointerVectorType;
 };
 
 /** These variables hold the corresponding LLVM constant values as a
@@ -175,11 +168,11 @@ extern llvm::Constant *LLVMDoubleVector(double f);
 
 /** Returns a constant integer or vector (according to the given type) of
     the given signed integer value. */
-extern llvm::Constant *LLVMIntAsType(int64_t, LLVM_TYPE_CONST llvm::Type *t);
+extern llvm::Constant *LLVMIntAsType(int64_t, llvm::Type *t);
 
 /** Returns a constant integer or vector (according to the given type) of
     the given unsigned integer value. */
-extern llvm::Constant *LLVMUIntAsType(uint64_t, LLVM_TYPE_CONST llvm::Type *t);
+extern llvm::Constant *LLVMUIntAsType(uint64_t, llvm::Type *t);
 
 /** Returns an LLVM boolean vector based on the given array of values.
     The array should have g->target.vectorWidth elements. */
@@ -281,8 +274,7 @@ extern void LLVMDumpValue(llvm::Value *v);
     worth of values just to extract the first element, in cases where only
     the first element's value is needed.
   */
-extern llvm::Value *LLVMExtractFirstVectorElement(llvm::Value *v, 
-                                              llvm::Instruction *insertBefore);
+extern llvm::Value *LLVMExtractFirstVectorElement(llvm::Value *v);
 
 /** This function takes two vectors, expected to be the same length, and
     returns a new vector of twice the length that represents concatenating
@@ -298,4 +290,10 @@ extern llvm::Value *LLVMShuffleVectors(llvm::Value *v1, llvm::Value *v2,
                                        int32_t shuf[], int shufSize,
                                        llvm::Instruction *insertBefore);
 
+/** Utility routines to concat strings with the names of existing values to
+    create meaningful new names for instruction values.
+*/
+extern const char *LLVMGetName(llvm::Value *v, const char *);
+extern const char *LLVMGetName(const char *op, llvm::Value *v1, llvm::Value *v2);
+
 #endif // ISPC_LLVMUTIL_H
diff --git a/main.cpp b/main.cpp
index b29a9f0f..417c1c3c 100644
--- a/main.cpp
+++ b/main.cpp
@@ -44,16 +44,9 @@
 #ifdef ISPC_IS_WINDOWS
   #include <time.h>
 #endif // ISPC_IS_WINDOWS
-#include <llvm/Support/PrettyStackTrace.h>
 #include <llvm/Support/Signals.h>
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-  #include <llvm/Support/TargetRegistry.h>
-  #include <llvm/Support/TargetSelect.h>
-#else
-  #include <llvm/Target/TargetRegistry.h>
-  #include <llvm/Target/TargetSelect.h>
-  #include <llvm/Target/SubtargetFeature.h>
-#endif
+#include <llvm/Support/TargetRegistry.h>
+#include <llvm/Support/TargetSelect.h>
 
 #ifdef ISPC_IS_WINDOWS
 #define strcasecmp stricmp
@@ -67,12 +60,12 @@ static void
 lPrintVersion() {
     printf("Intel(r) SPMD Program Compiler (ispc), %s (build %s @ %s, LLVM %s)\n", 
            ISPC_VERSION, BUILD_VERSION, BUILD_DATE, 
-#ifdef LLVM_2_9
-           "2.9"
-#elif defined(LLVM_3_0) || defined(LLVM_3_0svn)
+#if defined(LLVM_3_0)
            "3.0"
-#elif defined(LLVM_3_1) || defined(LLVM_3_1svn)
+#elif defined(LLVM_3_1)
            "3.1"
+#elif defined(LLVM_3_2)
+           "3.2"
 #else
 #error "Unhandled LLVM version"
 #endif 
@@ -91,12 +84,10 @@ usage(int ret) {
            Target::SupportedTargetArchs());
     printf("    [--c++-include-file=<name>]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n");
     printf("    [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
-    printf("         <cpu>={%s}\n", Target::SupportedTargetCPUs());
+    printf("         <cpu>={%s}\n", Target::SupportedTargetCPUs().c_str());
     printf("    [-D<foo>]\t\t\t\t#define given value when running preprocessor\n");
     printf("    [--emit-asm]\t\t\tGenerate assembly language file as output\n");
-#ifndef LLVM_2_9
     printf("    [--emit-c++]\t\t\tEmit a C++ source file as output\n");
-#endif // !LLVM_2_9
     printf("    [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
     printf("    [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
     printf("    [-g]\t\t\t\tGenerate debugging information\n");
@@ -202,17 +193,18 @@ static void lGetAllArgs(int Argc, char *Argv[], int &argc, char *argv[128]) {
 }
 
 
+static void
+lSignal(void *) {
+    FATAL("Unhandled signal sent to process; terminating.");
+}
+
+
 int main(int Argc, char *Argv[]) {
     int argc;
     char *argv[128];
     lGetAllArgs(Argc, Argv, argc, argv);
 
-#if 0
-    // Use LLVM's little utility function to print out nice stack traces if
-    // we crash
-    llvm::sys::PrintStackTraceOnErrorSignal();
-    llvm::PrettyStackTraceProgram X(argc, argv);
-#endif
+    llvm::sys::AddSignalHandler(lSignal, NULL);
 
     // initialize available LLVM targets
     LLVMInitializeX86TargetInfo();
@@ -220,9 +212,7 @@ int main(int Argc, char *Argv[]) {
     LLVMInitializeX86AsmPrinter();
     LLVMInitializeX86AsmParser();
     LLVMInitializeX86Disassembler();
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     LLVMInitializeX86TargetMC();
-#endif
 
     char *file = NULL;
     const char *headerFileName = NULL;
@@ -279,10 +269,8 @@ int main(int Argc, char *Argv[]) {
         }
         else if (!strcmp(argv[i], "--emit-asm"))
             ot = Module::Asm;
-#ifndef LLVM_2_9
         else if (!strcmp(argv[i], "--emit-c++"))
             ot = Module::CXX;
-#endif // !LLVM_2_9
         else if (!strcmp(argv[i], "--emit-llvm"))
             ot = Module::Bitcode;
         else if (!strcmp(argv[i], "--emit-obj"))
diff --git a/module.cpp b/module.cpp
index 99da37ab..f69fa0f7 100644
--- a/module.cpp
+++ b/module.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -88,6 +88,124 @@
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Bitcode/ReaderWriter.h>
 
+static void
+lDeclareSizeAndPtrIntTypes(SymbolTable *symbolTable) {
+    const Type *ptrIntType = (g->target.is32Bit) ? AtomicType::VaryingInt32 :
+        AtomicType::VaryingInt64;
+    ptrIntType = ptrIntType->GetAsUnboundVariabilityType();
+
+    symbolTable->AddType("intptr_t", ptrIntType, SourcePos());
+    symbolTable->AddType("uintptr_t", ptrIntType->GetAsUnsignedType(),
+                         SourcePos());
+    symbolTable->AddType("ptrdiff_t", ptrIntType, SourcePos());
+
+    const Type *sizeType = (g->target.is32Bit || g->opt.force32BitAddressing) ?
+        AtomicType::VaryingInt32 : AtomicType::VaryingInt64;
+    sizeType = sizeType->GetAsUnboundVariabilityType();
+    symbolTable->AddType("size_t", sizeType, SourcePos());
+}
+
+
+/** After compilation completes, there's often a lot of extra debugging
+    metadata left around that isn't needed any more--for example, for
+    static functions that weren't actually used, function information for
+    functions that were inlined, etc.  This function takes a llvm::Module
+    and tries to strip out all of this extra stuff.
+ */
+static void
+lStripUnusedDebugInfo(llvm::Module *module) {
+    if (g->generateDebuggingSymbols == false)
+        return;
+
+#ifndef LLVM_3_0
+    // loop over the compile units that contributed to the final module
+    if (llvm::NamedMDNode *cuNodes = module->getNamedMetadata("llvm.dbg.cu")) {
+        for (unsigned i = 0, ie = cuNodes->getNumOperands(); i != ie; ++i) {
+            llvm::MDNode *cuNode = cuNodes->getOperand(i);
+            llvm::DICompileUnit cu(cuNode);
+            llvm::DIArray subprograms = cu.getSubprograms();
+            std::vector<llvm::Value *> usedSubprograms;
+
+            if (subprograms.getNumElements() == 0)
+                continue;
+
+            // And now loop over the subprograms inside each compile unit.
+            for (unsigned j = 0, je = subprograms.getNumElements(); j != je; ++j) {
+                llvm::MDNode *spNode = 
+                    llvm::dyn_cast<llvm::MDNode>(subprograms->getOperand(j));
+                Assert(spNode != NULL);
+                llvm::DISubprogram sp(spNode);
+
+                // Get the name of the subprogram.  Start with the mangled
+                // name; if that's empty then we have an export'ed
+                // function, so grab the unmangled name in that case.
+                std::string name = sp.getLinkageName();
+                if (name == "")
+                    name = sp.getName();
+
+                // Does the llvm::Function for this function survive in the
+                // module?
+                if (module->getFunction(name) != NULL)
+                    usedSubprograms.push_back(sp);
+            }
+
+            Debug(SourcePos(), "%d / %d functions left in module with debug "
+                  "info.", (int)usedSubprograms.size(),
+                  (int)subprograms.getNumElements());
+
+            // We'd now like to replace the array of subprograms in the
+            // compile unit with only the ones that actually have function
+            // definitions present.  Unfortunately, llvm::DICompileUnit
+            // doesn't provide a method to set the subprograms.  Therefore,
+            // we end up needing to directly stuff a new array into the
+            // appropriate slot (number 12) in the MDNode for the compile
+            // unit.
+            //
+            // Because this is all so hard-coded and would break if the
+            // debugging metadata organization on the LLVM side changed,
+            // here is a bunch of asserting to make sure that element 12 of
+            // the compile unit's MDNode has the subprograms array....
+            llvm::MDNode *nodeSPMD = 
+                llvm::dyn_cast<llvm::MDNode>(cuNode->getOperand(12));
+            Assert(nodeSPMD != NULL);
+            llvm::MDNode *nodeSPMDArray =
+                llvm::dyn_cast<llvm::MDNode>(nodeSPMD->getOperand(0));
+            llvm::DIArray nodeSPs(nodeSPMDArray);
+            Assert(nodeSPs.getNumElements() == subprograms.getNumElements());
+            for (int i = 0; i < (int)nodeSPs.getNumElements(); ++i)
+                Assert(nodeSPs.getElement(i) == subprograms.getElement(i));
+
+            // And now we can go and stuff it into the node with some
+            // confidence...
+            llvm::Value *usedSubprogramsArray = 
+                m->diBuilder->getOrCreateArray(llvm::ArrayRef<llvm::Value *>(usedSubprograms));
+            llvm::MDNode *replNode = 
+                llvm::MDNode::get(*g->ctx, llvm::ArrayRef<llvm::Value *>(usedSubprogramsArray));
+            cuNode->replaceOperandWith(12, replNode);
+        }
+    }
+
+    // Also, erase a bunch of named metadata detritus; for each function
+    // there is sometimes named metadata llvm.dbg.lv.{funcname} that
+    // doesn't seem to be otherwise needed.
+    std::vector<llvm::NamedMDNode *> toErase;
+    llvm::Module::named_metadata_iterator iter = module->named_metadata_begin();
+    for (; iter != module->named_metadata_end(); ++iter) {
+        if (!strncmp(iter->getName().str().c_str(), "llvm.dbg.lv", 11))
+            toErase.push_back(iter);
+    }
+    for (int i = 0; i < (int)toErase.size(); ++i)
+        module->eraseNamedMetadata(toErase[i]);
+#endif // !LLVM_3_0
+
+    // Wrap up by running the LLVM pass to remove anything left that's
+    // unused.
+    llvm::PassManager pm;
+    pm.add(llvm::createStripDeadDebugInfoPass());
+    pm.run(*module);
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // Module
 
@@ -103,6 +221,8 @@ Module::Module(const char *fn) {
     symbolTable = new SymbolTable;
     ast = new AST;
 
+    lDeclareSizeAndPtrIntTypes(symbolTable);
+
     module = new llvm::Module(filename ? filename : "<stdin>", *g->ctx);
     module->setTargetTriple(g->target.GetTripleString());
 
@@ -124,10 +244,18 @@ Module::Module(const char *fn) {
             std::string directory, name;
             GetDirectoryAndFileName(g->currentDirectory, filename, &directory,
                                     &name);
+            char producerString[512];
+#if defined(BUILD_VERSION) && defined (BUILD_DATE)
+            sprintf(producerString, "ispc version %s (build %s on %s)",
+                    ISPC_VERSION, BUILD_VERSION, BUILD_DATE);
+#else
+            sprintf(producerString, "ispc version %s (built on %s)",
+                    ISPC_VERSION, __DATE__);
+#endif
             diBuilder->createCompileUnit(llvm::dwarf::DW_LANG_C99,  /* lang */
                                          name,  /* filename */
                                          directory, /* directory */
-                                         "ispc", /* producer */
+                                         producerString, /* producer */
                                          g->opt.level > 0 /* is optimized */,
                                          "-g", /* command line args */
                                          0 /* run time version */);
@@ -148,11 +276,6 @@ extern void yy_delete_buffer(YY_BUFFER_STATE);
 
 int
 Module::CompileFile() {
-#ifndef LLVM_3_1svn
-    if (g->opt.fastMath == true)
-        llvm::UnsafeFPMath = true;
-#endif // !LLVM_3_1svn
-
     extern void ParserInit();
     ParserInit();
 
@@ -211,119 +334,175 @@ Module::CompileFile() {
 
 
 void
-Module::AddTypeDef(Symbol *sym) {
+Module::AddTypeDef(const std::string &name, const Type *type,
+                   SourcePos pos) {
     // Typedefs are easy; just add the mapping between the given name and
     // the given type.
-    symbolTable->AddType(sym->name.c_str(), sym->type, sym->pos);
+    symbolTable->AddType(name.c_str(), type, pos);
 }
 
 
 void
-Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
+Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initExpr, 
+                          bool isConst, StorageClass storageClass, SourcePos pos) {
     // These may be NULL due to errors in parsing; just gracefully return
     // here if so.
-    if (sym == NULL || sym->type == NULL) {
-        // But if these are NULL and there haven't been any previous
-        // errors, something surprising is going on
+    if (name == "" || type == NULL) {
         Assert(errorCount > 0);
         return;
     }
 
-    if (symbolTable->LookupFunction(sym->name.c_str())) {
-        Error(sym->pos, "Global variable \"%s\" shadows previously-declared "
-              "function.", sym->name.c_str());
+    if (symbolTable->LookupFunction(name.c_str())) {
+        Error(pos, "Global variable \"%s\" shadows previously-declared "
+              "function.", name.c_str());
         return;
     }
 
-    if (sym->storageClass == SC_EXTERN_C) {
-        Error(sym->pos, "extern \"C\" qualifier can only be used for "
+    if (storageClass == SC_EXTERN_C) {
+        Error(pos, "extern \"C\" qualifier can only be used for "
               "functions.");
         return;
     }
 
-    if (Type::Equal(sym->type, AtomicType::Void)) {
-        Error(sym->pos, "\"void\" type global variable is illegal.");
+    if (Type::Equal(type, AtomicType::Void)) {
+        Error(pos, "\"void\" type global variable is illegal.");
         return;
     }
 
-    sym->type = ArrayType::SizeUnsizedArrays(sym->type, initExpr);
-    if (sym->type == NULL)
+    type = ArrayType::SizeUnsizedArrays(type, initExpr);
+    if (type == NULL)
         return;
 
-    const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
+    const ArrayType *at = CastType<ArrayType>(type);
     if (at != NULL && at->TotalElementCount() == 0) {
-        Error(sym->pos, "Illegal to declare a global variable with unsized "
+        Error(pos, "Illegal to declare a global variable with unsized "
               "array dimensions that aren't set with an initializer "
               "expression.");
         return;
     }
         
-    LLVM_TYPE_CONST llvm::Type *llvmType = sym->type->LLVMType(g->ctx);
+    llvm::Type *llvmType = type->LLVMType(g->ctx);
     if (llvmType == NULL)
         return;
 
     // See if we have an initializer expression for the global.  If so,
     // make sure it's a compile-time constant!
     llvm::Constant *llvmInitializer = NULL;
-    if (sym->storageClass == SC_EXTERN || sym->storageClass == SC_EXTERN_C) {
+    ConstExpr *constValue = NULL;
+    if (storageClass == SC_EXTERN || storageClass == SC_EXTERN_C) {
         if (initExpr != NULL)
-            Error(sym->pos, "Initializer can't be provided with \"extern\" "
-                  "global variable \"%s\".", sym->name.c_str());
+            Error(pos, "Initializer can't be provided with \"extern\" "
+                  "global variable \"%s\".", name.c_str());
     }
-    else if (initExpr != NULL) {
-        initExpr = TypeCheck(initExpr);
+    else {
         if (initExpr != NULL) {
-            // We need to make sure the initializer expression is
-            // the same type as the global.  (But not if it's an
-            // ExprList; they don't have types per se / can't type
-            // convert themselves anyway.)
-            if (dynamic_cast<ExprList *>(initExpr) == NULL)
-                initExpr = TypeConvertExpr(initExpr, sym->type, "initializer");
-            
+            initExpr = TypeCheck(initExpr);
             if (initExpr != NULL) {
-                initExpr = Optimize(initExpr);
-                // Fingers crossed, now let's see if we've got a
-                // constant value..
-                llvmInitializer = initExpr->GetConstant(sym->type);
+                // We need to make sure the initializer expression is
+                // the same type as the global.  (But not if it's an
+                // ExprList; they don't have types per se / can't type
+                // convert themselves anyway.)
+                if (dynamic_cast<ExprList *>(initExpr) == NULL)
+                    initExpr = TypeConvertExpr(initExpr, type, "initializer");
+            
+                if (initExpr != NULL) {
+                    initExpr = Optimize(initExpr);
+                    // Fingers crossed, now let's see if we've got a
+                    // constant value..
+                    llvmInitializer = initExpr->GetConstant(type);
 
-                if (llvmInitializer != NULL) {
-                    if (sym->type->IsConstType())
-                        // Try to get a ConstExpr associated with
-                        // the symbol.  This dynamic_cast can
-                        // validly fail, for example for types like
-                        // StructTypes where a ConstExpr can't
-                        // represent their values.
-                        sym->constValue = 
-                            dynamic_cast<ConstExpr *>(initExpr);
+                    if (llvmInitializer != NULL) {
+                        if (type->IsConstType())
+                            // Try to get a ConstExpr associated with
+                            // the symbol.  This dynamic_cast can
+                            // validly fail, for example for types like
+                            // StructTypes where a ConstExpr can't
+                            // represent their values.
+                            constValue = dynamic_cast<ConstExpr *>(initExpr);
+                    }
+                    else
+                        Error(initExpr->pos, "Initializer for global variable \"%s\" "
+                              "must be a constant.", name.c_str());
                 }
-                else
-                    Error(initExpr->pos, "Initializer for global variable \"%s\" "
-                          "must be a constant.", sym->name.c_str());
             }
         }
+
+        // If no initializer was provided or if we couldn't get a value
+        // above, initialize it with zeros..
+        if (llvmInitializer == NULL)
+            llvmInitializer = llvm::Constant::getNullValue(llvmType);
     }
 
-    // If no initializer was provided or if we couldn't get a value
-    // above, initialize it with zeros..
-    if (llvmInitializer == NULL)
-        llvmInitializer = llvm::Constant::getNullValue(llvmType);
+    Symbol *sym = symbolTable->LookupVariable(name.c_str());
+    llvm::GlobalVariable *oldGV = NULL;
+    if (sym != NULL) {
+        // We've already seen either a declaration or a definition of this
+        // global.
+
+        // If the type doesn't match with the previous one, issue an error.
+        if (!Type::Equal(sym->type, type) ||
+            (sym->storageClass != SC_EXTERN && 
+             sym->storageClass != SC_EXTERN_C &&
+             sym->storageClass != storageClass)) {
+            Error(pos, "Definition of variable \"%s\" conflicts with "
+                  "definition at %s:%d.", name.c_str(), 
+                  sym->pos.name, sym->pos.first_line);
+            return;
+        }
+
+        llvm::GlobalVariable *gv = 
+            llvm::dyn_cast<llvm::GlobalVariable>(sym->storagePtr);
+        Assert(gv != NULL);
+
+        // And issue an error if this is a redefinition of a variable
+        if (gv->hasInitializer() && 
+            sym->storageClass != SC_EXTERN && sym->storageClass != SC_EXTERN_C) {
+            Error(pos, "Redefinition of variable \"%s\" is illegal. "
+                  "(Previous definition at %s:%d.)", sym->name.c_str(),
+                  sym->pos.name, sym->pos.first_line);
+            return;
+        }
+
+        // Now, we either have a redeclaration of a global, or a definition
+        // of a previously-declared global.  First, save the pointer to the
+        // previous llvm::GlobalVariable
+        oldGV = gv;
+    }
+    else {
+        sym = new Symbol(name, pos, type, storageClass);
+        symbolTable->AddVariable(sym);
+    }
+    sym->constValue = constValue;
 
     llvm::GlobalValue::LinkageTypes linkage =
         (sym->storageClass == SC_STATIC) ? llvm::GlobalValue::InternalLinkage :
         llvm::GlobalValue::ExternalLinkage;
+
+    // Note that llvmInitializer is left NULL for "extern" globals;
+    // passing NULL here is what makes them come up as external
+    // declarations that don't define storage (a bit subtle)...
     sym->storagePtr = new llvm::GlobalVariable(*module, llvmType, isConst, 
                                                linkage, llvmInitializer, 
                                                sym->name.c_str());
-    symbolTable->AddVariable(sym);
 
-    if (diBuilder && (sym->storageClass != SC_EXTERN)) {
-        llvm::DIFile file = sym->pos.GetDIFile();
-        diBuilder->createGlobalVariable(sym->name, 
-                                        file,
-                                        sym->pos.first_line,
-                                        sym->type->GetDIType(file),
-                                        (sym->storageClass == SC_STATIC),
-                                        sym->storagePtr);
+    // Patch up any references to the previous GlobalVariable (e.g. from a
+    // declaration of a global that was later defined.)
+    if (oldGV != NULL) {
+        oldGV->replaceAllUsesWith(sym->storagePtr);
+        oldGV->removeFromParent();
+        sym->storagePtr->setName(sym->name.c_str());
+    }
+    
+    if (diBuilder) {
+        llvm::DIFile file = pos.GetDIFile();
+        llvm::DIGlobalVariable var =
+            diBuilder->createGlobalVariable(name, 
+                                            file,
+                                            pos.first_line,
+                                            sym->type->GetDIType(file),
+                                            (sym->storageClass == SC_STATIC),
+                                            sym->storagePtr);
+        Assert(var.Verify());
     }
 }
 
@@ -343,7 +522,7 @@ Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
 */
 static bool
 lRecursiveCheckValidParamType(const Type *t) {
-    const StructType *st = dynamic_cast<const StructType *>(t);
+    const StructType *st = CastType<StructType>(t);
     if (st != NULL) {
         for (int i = 0; i < st->GetElementCount(); ++i)
             if (lRecursiveCheckValidParamType(st->GetElementType(i)))
@@ -351,11 +530,11 @@ lRecursiveCheckValidParamType(const Type *t) {
         return false;
     }
 
-    const SequentialType *seqt = dynamic_cast<const SequentialType *>(t);
+    const SequentialType *seqt = CastType<SequentialType>(t);
     if (seqt != NULL)
         return lRecursiveCheckValidParamType(seqt->GetElementType());
 
-    const PointerType *pt = dynamic_cast<const PointerType *>(t);
+    const PointerType *pt = CastType<PointerType>(t);
     if (pt != NULL) {
         if (pt->IsSlice() || pt->IsVaryingType())
             return true;
@@ -375,8 +554,10 @@ static void
 lCheckForVaryingParameter(const Type *type, const std::string &name, 
                           SourcePos pos) {
     if (lRecursiveCheckValidParamType(type)) {
-        const Type *t = type->GetBaseType();
-        if (dynamic_cast<const StructType *>(t))
+        if (CastType<PointerType>(type))
+            Error(pos, "Varying pointer type parameter \"%s\" is illegal "
+                  "in an exported function.", name.c_str());
+        else if (CastType<StructType>(type->GetBaseType()))
             Error(pos, "Struct parameter \"%s\" with varying member(s) is illegal "
                   "in an exported function.", name.c_str());
         else
@@ -394,7 +575,7 @@ static void
 lCheckForStructParameters(const FunctionType *ftype, SourcePos pos) {
     for (int i = 0; i < ftype->GetNumParameters(); ++i) {
         const Type *type = ftype->GetParameterType(i);
-        if (dynamic_cast<const StructType *>(type) != NULL) {
+        if (CastType<StructType>(type) != NULL) {
             Error(pos, "Passing structs to/from application functions is "
                   "currently broken.  Use a pointer or const pointer to the "
                   "struct instead for now.");
@@ -411,26 +592,39 @@ lCheckForStructParameters(const FunctionType *ftype, SourcePos pos) {
     false if any errors were encountered.
  */
 void
-Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
-    const FunctionType *functionType = 
-        dynamic_cast<const FunctionType *>(funSym->type);
+Module::AddFunctionDeclaration(const std::string &name, 
+                               const FunctionType *functionType, 
+                               StorageClass storageClass, bool isInline,
+                               SourcePos pos) {
     Assert(functionType != NULL);
 
     // If a global variable with the same name has already been declared
     // issue an error.
-    if (symbolTable->LookupVariable(funSym->name.c_str()) != NULL) {
-        Error(funSym->pos, "Function \"%s\" shadows previously-declared global variable. "
+    if (symbolTable->LookupVariable(name.c_str()) != NULL) {
+        Error(pos, "Function \"%s\" shadows previously-declared global variable. "
               "Ignoring this definition.",
-              funSym->name.c_str());
+              name.c_str());
         return;
     }
 
     std::vector<Symbol *> overloadFuncs;
-    symbolTable->LookupFunction(funSym->name.c_str(), &overloadFuncs);
+    symbolTable->LookupFunction(name.c_str(), &overloadFuncs);
     if (overloadFuncs.size() > 0) {
         for (unsigned int i = 0; i < overloadFuncs.size(); ++i) {
             Symbol *overloadFunc = overloadFuncs[i];
 
+            const FunctionType *overloadType = 
+                CastType<FunctionType>(overloadFunc->type);
+            if (overloadType == NULL) {
+                Assert(m->errorCount == 0);
+                continue;
+            }
+
+            if (functionType->isExported || overloadType->isExported)
+                Error(pos, "Illegal to have \"export\" function with same name "
+                      "as previously declared function (%s:%d).",
+                      overloadFunc->pos.name, overloadFunc->pos.first_line);
+
             // Check for a redeclaration of a function with the same
             // name and type
             if (Type::Equal(overloadFunc->type, functionType))
@@ -440,7 +634,7 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
             // different, return an error--overloading by return type isn't
             // allowed.
             const FunctionType *ofType = 
-                dynamic_cast<const FunctionType *>(overloadFunc->type);
+                CastType<FunctionType>(overloadFunc->type);
             Assert(ofType != NULL);
             if (ofType->GetNumParameters() == functionType->GetNumParameters()) {
                 int i;
@@ -450,65 +644,67 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
                         break;
                 }
                 if (i == functionType->GetNumParameters()) {
-                    Error(funSym->pos, "Illegal to overload function by return "
-                          "type only (previous declaration was at line %d of "
-                          "file %s).", overloadFunc->pos.first_line,
-                          overloadFunc->pos.name);
+                    std::string thisRetType = functionType->GetReturnTypeString();
+                    std::string otherRetType = ofType->GetReturnTypeString();
+                    Error(pos, "Illegal to overload function by return "
+                          "type only.  This function returns \"%s\" while "
+                          "previous declaration at %s:%d returns \"%s\".",
+                          thisRetType.c_str(), overloadFunc->pos.name,
+                          overloadFunc->pos.first_line, otherRetType.c_str());
                     return;
                 }
             }
         }
     }
 
-    if (funSym->storageClass == SC_EXTERN_C) {
+    if (storageClass == SC_EXTERN_C) {
         // Make sure the user hasn't supplied both an 'extern "C"' and a
         // 'task' qualifier with the function
         if (functionType->isTask) {
-            Error(funSym->pos, "\"task\" qualifier is illegal with C-linkage extern "
-                  "function \"%s\".  Ignoring this function.", funSym->name.c_str());
+            Error(pos, "\"task\" qualifier is illegal with C-linkage extern "
+                  "function \"%s\".  Ignoring this function.", name.c_str());
             return;
         }
 
         std::vector<Symbol *> funcs;
-        symbolTable->LookupFunction(funSym->name.c_str(), &funcs);
+        symbolTable->LookupFunction(name.c_str(), &funcs);
         if (funcs.size() > 0) {
             if (funcs.size() > 1) {
                 // Multiple functions with this name have already been declared; 
                 // can't overload here
-                Error(funSym->pos, "Can't overload extern \"C\" function \"%s\"; "
+                Error(pos, "Can't overload extern \"C\" function \"%s\"; "
                       "%d functions with the same name have already been declared.",
-                      funSym->name.c_str(), (int)funcs.size());
+                      name.c_str(), (int)funcs.size());
                 return;
             }
 
             // One function with the same name has been declared; see if it
             // has the same type as this one, in which case it's ok.
-            if (Type::Equal(funcs[0]->type, funSym->type))
+            if (Type::Equal(funcs[0]->type, functionType))
                 return;
             else {
-                Error(funSym->pos, "Can't overload extern \"C\" function \"%s\".",
-                      funSym->name.c_str());
+                Error(pos, "Can't overload extern \"C\" function \"%s\".",
+                      name.c_str());
                 return;
             }
         }
     }
 
     // Get the LLVM FunctionType
-    bool includeMask = (funSym->storageClass != SC_EXTERN_C);
-    LLVM_TYPE_CONST llvm::FunctionType *llvmFunctionType = 
+    bool includeMask = (storageClass != SC_EXTERN_C);
+    llvm::FunctionType *llvmFunctionType = 
         functionType->LLVMFunctionType(g->ctx, includeMask);
     if (llvmFunctionType == NULL)
         return;
 
     // And create the llvm::Function
-    llvm::GlobalValue::LinkageTypes linkage = (funSym->storageClass == SC_STATIC ||
+    llvm::GlobalValue::LinkageTypes linkage = (storageClass == SC_STATIC ||
                                                isInline) ?
         llvm::GlobalValue::InternalLinkage : llvm::GlobalValue::ExternalLinkage;
-    std::string functionName;
-    if (funSym->storageClass == SC_EXTERN_C)
-        functionName = funSym->name;
-    else {
-        functionName = funSym->MangledName();
+
+    std::string functionName = name;
+    if (storageClass != SC_EXTERN_C) {
+        functionName += functionType->Mangle();
         if (g->mangleFunctionsWithTarget)
             functionName += g->target.GetISAString();
     }
@@ -518,7 +714,7 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
 
     // Set function attributes: we never throw exceptions
     function->setDoesNotThrow(true);
-    if (!(funSym->storageClass == SC_EXTERN_C) && 
+    if (storageClass != SC_EXTERN_C && 
         !g->generateDebuggingSymbols &&
         isInline)
         function->addFnAttr(llvm::Attribute::AlwaysInline);
@@ -528,17 +724,17 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
 
     // Make sure that the return type isn't 'varying' if the function is
     // 'export'ed.
-    if (funSym->storageClass == SC_EXPORT && 
+    if (functionType->isExported && 
         lRecursiveCheckValidParamType(functionType->GetReturnType()))
-        Error(funSym->pos, "Illegal to return a \"varying\" type from exported "
-              "function \"%s\"", funSym->name.c_str());
+        Error(pos, "Illegal to return a \"varying\" type from exported "
+              "function \"%s\"", name.c_str());
 
     if (functionType->isTask && 
         Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false)
-        Error(funSym->pos, "Task-qualified functions must have void return type.");
+        Error(pos, "Task-qualified functions must have void return type.");
 
     if (functionType->isExported || functionType->isExternC)
-        lCheckForStructParameters(functionType, funSym->pos);
+        lCheckForStructParameters(functionType, pos);
 
     // Loop over all of the arguments; process default values if present
     // and do other checks and parameter attribute setting.
@@ -547,12 +743,12 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
     for (int i = 0; i < nArgs; ++i) {
         const Type *argType = functionType->GetParameterType(i);
         const std::string &argName = functionType->GetParameterName(i);
-        ConstExpr *defaultValue = functionType->GetParameterDefault(i);
+        Expr *defaultValue = functionType->GetParameterDefault(i);
         const SourcePos &argPos = functionType->GetParameterSourcePos(i);
 
         // If the function is exported, make sure that the parameter
         // doesn't have any varying stuff going on in it.
-        if (funSym->storageClass == SC_EXPORT)
+        if (functionType->isExported)
             lCheckForVaryingParameter(argType, argName, argPos);
 
         // ISPC assumes that no pointers alias.  (It should be possible to
@@ -560,9 +756,9 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
         // default.)  Set parameter attributes accordingly.  (Only for
         // uniform pointers, since varying pointers are int vectors...)
         if (!functionType->isTask && 
-            ((dynamic_cast<const PointerType *>(argType) != NULL &&
+            ((CastType<PointerType>(argType) != NULL &&
               argType->IsUniformType()) ||
-             dynamic_cast<const ReferenceType *>(argType) != NULL)) {
+             CastType<ReferenceType>(argType) != NULL)) {
 
             // NOTE: LLVM indexes function parameters starting from 1.
             // This is unintuitive.
@@ -596,29 +792,46 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
         function->eraseFromParent();
         function = module->getFunction(functionName);
     }
-    funSym->function = function;
 
     // Finally, we know all is good and we can add the function to the
     // symbol table
+    Symbol *funSym = new Symbol(name, pos, functionType, storageClass);
+    funSym->function = function;
     bool ok = symbolTable->AddFunction(funSym);
     Assert(ok);
 }
 
 
 void
-Module::AddFunctionDefinition(Symbol *sym, const std::vector<Symbol *> &args,
+Module::AddFunctionDefinition(const std::string &name, const FunctionType *type,
                               Stmt *code) {
-    ast->AddFunction(sym, args, code);
+    Symbol *sym = symbolTable->LookupFunction(name.c_str(), type);
+    if (sym == NULL || code == NULL) {
+        Assert(m->errorCount > 0);
+        return;
+    }
+
+    sym->pos = code->pos;
+
+    // FIXME: because we encode the parameter names in the function type,
+    // we need to override the function type here in case the function had
+    // earlier been declared with anonymous parameter names but is now
+    // defined with actual names.  This is yet another reason we shouldn't
+    // include the names in FunctionType...  
+    sym->type = type;
+
+    ast->AddFunction(sym, code);
 }
 
 
 bool
 Module::writeOutput(OutputType outputType, const char *outFileName,
                     const char *includeFileName) {
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-    if (diBuilder != NULL && outputType != Header)
+    if (diBuilder != NULL && outputType != Header) {
         diBuilder->finalize();
-#endif // LLVM_3_0
+
+        lStripUnusedDebugInfo(module);
+    }
 
     // First, issue a warning if the output file suffix and the type of
     // file being created seem to mismatch.  This can help catch missing
@@ -640,14 +853,12 @@ Module::writeOutput(OutputType outputType, const char *outFileName,
             if (strcasecmp(suffix, "o") && strcasecmp(suffix, "obj"))
                 fileType = "object";
             break;
-#ifndef LLVM_2_9
         case CXX:
             if (strcasecmp(suffix, "c") && strcasecmp(suffix, "cc") &&
                 strcasecmp(suffix, "c++") && strcasecmp(suffix, "cxx") &&
                 strcasecmp(suffix, "cpp"))
                 fileType = "c++";
             break;
-#endif // !LLVM_2_9
         case Header:
             if (strcasecmp(suffix, "h") && strcasecmp(suffix, "hh") &&
                 strcasecmp(suffix, "hpp"))
@@ -663,14 +874,12 @@ Module::writeOutput(OutputType outputType, const char *outFileName,
         return writeHeader(outFileName);
     else if (outputType == Bitcode)
         return writeBitcode(module, outFileName);
-#ifndef LLVM_2_9
     else if (outputType == CXX) {
         extern bool WriteCXXFile(llvm::Module *module, const char *fn, 
                                  int vectorWidth, const char *includeName);
         return WriteCXXFile(module, outFileName, g->target.vectorWidth,
                             includeFileName);
     }
-#endif // !LLVM_2_9
     else
         return writeObjectFileOrAssembly(outputType, outFileName);
 }
@@ -755,109 +964,56 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine,
 }
 
 
-/** Small structure used in representing dependency graphs of structures
-    (i.e. given a StructType, which other structure types does it have as
-    elements).
- */ 
-struct StructDAGNode {
-    StructDAGNode()
-        : visited(false) { }
-
-    bool visited;
-    std::vector<const StructType *> dependents;
-};
-
-
-/** Visit a node for the topological sort.
+/** Emits a declaration for the given struct to the given file.  Before
+    doing so, it makes sure that declarations for any structs that are
+    (recursively) members of this struct have already been emitted.
  */
 static void
-lVisitNode(const StructType *structType, 
-           std::map<const StructType *, StructDAGNode *> &structToNode,
-           std::vector<const StructType *> &sortedTypes) {
-    Assert(structToNode.find(structType) != structToNode.end());
-    // Get the node that encodes the structs that this one is immediately
-    // dependent on.
-    StructDAGNode *node = structToNode[structType];
-    if (node->visited)
-        return;
+lEmitStructDecl(const StructType *st, std::vector<const StructType *> *emittedStructs,
+                FILE *file) {
+    // Has this struct type already been declared?  (This happens if it's a
+    // member of another struct for which we emitted a declaration
+    // previously.)
+    for (int i = 0; i < (int)emittedStructs->size(); ++i)
+        if (Type::EqualIgnoringConst(st, (*emittedStructs)[i]))
+            return;
 
-    node->visited = true;
-    // Depth-first traversal: visit all of the dependent nodes...
-    for (unsigned int i = 0; i < node->dependents.size(); ++i)
-        lVisitNode(node->dependents[i], structToNode, sortedTypes);
-    // ...and then add this one to the sorted list
-    sortedTypes.push_back(structType);
+    // Otherwise first make sure any contained structs have been declared.
+    for (int i = 0; i < st->GetElementCount(); ++i) {
+        const StructType *elementStructType = 
+            CastType<StructType>(st->GetElementType(i));
+        if (elementStructType != NULL)
+            lEmitStructDecl(elementStructType, emittedStructs, file);
+    }
+
+    // And now it's safe to declare this one
+    emittedStructs->push_back(st);
+
+    fprintf(file, "struct %s", st->GetStructName().c_str());
+    if (st->GetSOAWidth() > 0)
+        // This has to match the naming scheme in
+        // StructType::GetCDeclaration().
+        fprintf(file, "_SOA%d", st->GetSOAWidth());
+    fprintf(file, " {\n");
+
+    for (int i = 0; i < st->GetElementCount(); ++i) {
+        const Type *type = st->GetElementType(i)->GetAsNonConstType();
+        std::string d = type->GetCDeclaration(st->GetElementName(i));
+        fprintf(file, "    %s;\n", d.c_str());
+    }
+    fprintf(file, "};\n\n");
 }
-           
+
 
 /** Given a set of structures that we want to print C declarations of in a
-    header file, order them so that any struct that is used as a member
-    variable in another struct is printed before the struct that uses it
-    and then print them to the given file.
+    header file, emit their declarations.
  */
 static void
 lEmitStructDecls(std::vector<const StructType *> &structTypes, FILE *file) {
-    // First, build a DAG among the struct types where there is an edge
-    // from node A to node B if struct type A depends on struct type B
-
-    // Records the struct types that have incoming edges in the
-    // DAG--i.e. the ones that one or more other struct types depend on
-    std::set<const StructType *> hasIncomingEdges;
-    // Records the mapping between struct type pointers and the
-    // StructDagNode structures
-    std::map<const StructType *, StructDAGNode *> structToNode;
-    for (unsigned int i = 0; i < structTypes.size(); ++i) {
-        // For each struct type, create its DAG node and record the
-        // relationship between it and its node
-        const StructType *st = structTypes[i];
-        StructDAGNode *node = new StructDAGNode;
-        structToNode[st] = node;
-
-        for (int j = 0; j < st->GetElementCount(); ++j) {
-            const StructType *elementStructType = 
-                dynamic_cast<const StructType *>(st->GetElementType(j));
-            // If this element is a struct type and we haven't already
-            // processed it for the current struct type, then upate th
-            // dependencies and record that this element type has other
-            // struct types that depend on it.
-            if (elementStructType != NULL &&
-                (std::find(node->dependents.begin(), node->dependents.end(), 
-                           elementStructType) == node->dependents.end())) {
-                node->dependents.push_back(elementStructType);
-                hasIncomingEdges.insert(elementStructType);
-            }
-        }
-    }
-
-    // Perform a topological sort of the struct types.  Kick it off by
-    // visiting nodes with no incoming edges; i.e. the struct types that no
-    // other struct types depend on.
-    std::vector<const StructType *> sortedTypes;
-    for (unsigned int i = 0; i < structTypes.size(); ++i) {
-        const StructType *structType = structTypes[i];
-        if (hasIncomingEdges.find(structType) == hasIncomingEdges.end())
-            lVisitNode(structType, structToNode, sortedTypes);
-    }
-    Assert(sortedTypes.size() == structTypes.size());
-
-    // And finally we can emit the struct declarations by going through the
-    // sorted ones in order.
-    for (unsigned int i = 0; i < sortedTypes.size(); ++i) {
-        const StructType *st = sortedTypes[i];
-        fprintf(file, "struct %s", st->GetStructName().c_str());
-        if (st->GetSOAWidth() > 0)
-            // This has to match the naming scheme in
-            // StructType::GetCDeclaration().
-            fprintf(file, "_SOA%d", st->GetSOAWidth());
-        fprintf(file, " {\n");
-
-        for (int j = 0; j < st->GetElementCount(); ++j) {
-            const Type *type = st->GetElementType(j)->GetAsNonConstType();
-            std::string d = type->GetCDeclaration(st->GetElementName(j));
-            fprintf(file, "    %s;\n", d.c_str());
-        }
-        fprintf(file, "};\n\n");
-    }
+    std::vector<const StructType *> emittedStructs;
+    for (unsigned int i = 0; i < structTypes.size(); ++i)
+        lEmitStructDecl(structTypes[i], &emittedStructs, file);
+    Assert(emittedStructs.size() == structTypes.size());
 }
 
 
@@ -947,7 +1103,7 @@ lAddTypeIfNew(const Type *type, std::vector<const T *> *exportedTypes) {
         if (Type::Equal((*exportedTypes)[i], type))
             return;
 
-    const T *castType = dynamic_cast<const T *>(type);
+    const T *castType = CastType<T>(type);
     Assert(castType != NULL);
     exportedTypes->push_back(castType);
 }
@@ -962,13 +1118,13 @@ lGetExportedTypes(const Type *type,
                   std::vector<const StructType *> *exportedStructTypes,
                   std::vector<const EnumType *> *exportedEnumTypes,
                   std::vector<const VectorType *> *exportedVectorTypes) {
-    const ArrayType *arrayType = dynamic_cast<const ArrayType *>(type);
-    const StructType *structType = dynamic_cast<const StructType *>(type);
+    const ArrayType *arrayType = CastType<ArrayType>(type);
+    const StructType *structType = CastType<StructType>(type);
 
-    if (dynamic_cast<const ReferenceType *>(type) != NULL)
+    if (CastType<ReferenceType>(type) != NULL)
         lGetExportedTypes(type->GetReferenceTarget(), exportedStructTypes, 
                           exportedEnumTypes, exportedVectorTypes);
-    else if (dynamic_cast<const PointerType *>(type) != NULL)
+    else if (CastType<PointerType>(type) != NULL)
         lGetExportedTypes(type->GetBaseType(), exportedStructTypes,
                           exportedEnumTypes, exportedVectorTypes);
     else if (arrayType != NULL)
@@ -980,12 +1136,15 @@ lGetExportedTypes(const Type *type,
             lGetExportedTypes(structType->GetElementType(i), exportedStructTypes,
                               exportedEnumTypes, exportedVectorTypes);
     }
-    else if (dynamic_cast<const EnumType *>(type) != NULL)
+    else if (CastType<UndefinedStructType>(type) != NULL)
+        // do nothing
+        ;
+    else if (CastType<EnumType>(type) != NULL)
         lAddTypeIfNew(type, exportedEnumTypes);
-    else if (dynamic_cast<const VectorType *>(type) != NULL)
+    else if (CastType<VectorType>(type) != NULL)
         lAddTypeIfNew(type, exportedVectorTypes);
     else
-        Assert(dynamic_cast<const AtomicType *>(type) != NULL);
+        Assert(CastType<AtomicType>(type) != NULL);
 }
 
 
@@ -998,7 +1157,7 @@ lGetExportedParamTypes(const std::vector<Symbol *> &funcs,
                        std::vector<const EnumType *> *exportedEnumTypes,
                        std::vector<const VectorType *> *exportedVectorTypes) {
     for (unsigned int i = 0; i < funcs.size(); ++i) {
-        const FunctionType *ftype = dynamic_cast<const FunctionType *>(funcs[i]->type);
+        const FunctionType *ftype = CastType<FunctionType>(funcs[i]->type);
         // Handle the return type
         lGetExportedTypes(ftype->GetReturnType(), exportedStructTypes,
                           exportedEnumTypes, exportedVectorTypes);
@@ -1015,7 +1174,7 @@ static void
 lPrintFunctionDeclarations(FILE *file, const std::vector<Symbol *> &funcs) {
     fprintf(file, "#ifdef __cplusplus\nextern \"C\" {\n#endif // __cplusplus\n");
     for (unsigned int i = 0; i < funcs.size(); ++i) {
-        const FunctionType *ftype = dynamic_cast<const FunctionType *>(funcs[i]->type);
+        const FunctionType *ftype = CastType<FunctionType>(funcs[i]->type);
         Assert(ftype);
         std::string decl = ftype->GetCDeclaration(funcs[i]->name);
         fprintf(file, "    extern %s;\n", decl.c_str());
@@ -1024,24 +1183,9 @@ lPrintFunctionDeclarations(FILE *file, const std::vector<Symbol *> &funcs) {
 }
 
 
-static void
-lPrintExternGlobals(FILE *file, const std::vector<Symbol *> &externGlobals) {
-    for (unsigned int i = 0; i < externGlobals.size(); ++i) {
-        Symbol *sym = externGlobals[i];
-        if (lRecursiveCheckValidParamType(sym->type))
-            Warning(sym->pos, "Not emitting declaration for symbol \"%s\" into "
-                    "generated header file since it (or some of its members) "
-                    "has types that are illegal in exported symbols.",
-                    sym->name.c_str());
-        else
-            fprintf(file, "extern %s;\n", sym->type->GetCDeclaration(sym->name).c_str());
-    }
-}
-
-
 static bool
 lIsExported(const Symbol *sym) {
-    const FunctionType *ft = dynamic_cast<const FunctionType *>(sym->type);
+    const FunctionType *ft = CastType<FunctionType>(sym->type);
     Assert(ft);
     return ft->isExported;
 }
@@ -1049,18 +1193,12 @@ lIsExported(const Symbol *sym) {
 
 static bool
 lIsExternC(const Symbol *sym) {
-    const FunctionType *ft = dynamic_cast<const FunctionType *>(sym->type);
+    const FunctionType *ft = CastType<FunctionType>(sym->type);
     Assert(ft);
     return ft->isExternC;
 }
 
 
-static bool
-lIsExternGlobal(const Symbol *sym) {
-    return sym->storageClass == SC_EXTERN || sym->storageClass == SC_EXTERN_C;
-}
-
-
 bool
 Module::writeHeader(const char *fn) {
     FILE *f = fopen(fn, "w");
@@ -1093,7 +1231,7 @@ Module::writeHeader(const char *fn) {
     if (g->emitInstrumentation) {
         fprintf(f, "#define ISPC_INSTRUMENTATION 1\n");
         fprintf(f, "extern \"C\" {\n");
-        fprintf(f, "  void ISPCInstrument(const char *fn, const char *note, int line, int mask);\n");
+        fprintf(f, "  void ISPCInstrument(const char *fn, const char *note, int line, uint64_t mask);\n");
         fprintf(f, "}\n");
     }
 
@@ -1113,13 +1251,6 @@ Module::writeHeader(const char *fn) {
     lGetExportedParamTypes(externCFuncs, &exportedStructTypes,
                            &exportedEnumTypes, &exportedVectorTypes);
 
-    // And do the same for the 'extern' globals
-    std::vector<Symbol *> externGlobals;
-    symbolTable->GetMatchingVariables(lIsExternGlobal, &externGlobals);
-    for (unsigned int i = 0; i < externGlobals.size(); ++i)
-        lGetExportedTypes(externGlobals[i]->type, &exportedStructTypes,
-                          &exportedEnumTypes, &exportedVectorTypes);
-
     // And print them
     lEmitVectorTypedefs(exportedVectorTypes, f);
     lEmitEnumDecls(exportedEnumTypes, f);
@@ -1146,15 +1277,6 @@ Module::writeHeader(const char *fn) {
     // end namespace
     fprintf(f, "\n#ifdef __cplusplus\n}\n#endif // __cplusplus\n");
 
-    // and only now emit externs for globals, outside of the ispc namespace
-    if (externGlobals.size() > 0) {
-        fprintf(f, "\n");
-        fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
-        fprintf(f, "// Globals declared \"extern\" from ispc code\n");
-        fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
-        lPrintExternGlobals(f, externGlobals);
-    }
-
     // end guard
     fprintf(f, "\n#endif // %s\n", guard.c_str());
 
@@ -1171,26 +1293,20 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
 
     llvm::raw_fd_ostream stderrRaw(2, false);
 
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     clang::TextDiagnosticPrinter *diagPrinter =
         new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions());
     llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagIDs(new clang::DiagnosticIDs);
     clang::DiagnosticsEngine *diagEngine = 
         new clang::DiagnosticsEngine(diagIDs, diagPrinter);
     inst.setDiagnostics(diagEngine);
-#else
-    clang::TextDiagnosticPrinter *diagPrinter = 
-        new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions());
-    inst.createDiagnostics(0, NULL, diagPrinter);
-#endif
 
     clang::TargetOptions &options = inst.getTargetOpts();
     llvm::Triple triple(module->getTargetTriple());
     if (triple.getTriple().empty()) {
-#if defined(LLVM_3_1) || defined(LLVM_3_1svn)
-        triple.setTriple(llvm::sys::getDefaultTargetTriple());
-#else
+#ifdef LLVM_3_0
         triple.setTriple(llvm::sys::getHostTriple());
+#else
+        triple.setTriple(llvm::sys::getDefaultTargetTriple());
 #endif
     }
     options.Triple = triple.getTriple();
@@ -1208,9 +1324,7 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
 
     clang::HeaderSearchOptions &headerOpts = inst.getHeaderSearchOpts();
     headerOpts.UseBuiltinIncludes = 0;
-#ifndef LLVM_2_9
     headerOpts.UseStandardSystemIncludes = 0;
-#endif // !LLVM_2_9
     headerOpts.UseStandardCXXIncludes = 0;
     if (g->debugPrint)
         headerOpts.Verbose = 1;
@@ -1241,8 +1355,13 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
     else
         opts.addMacroDef("ISPC_POINTER_SIZE=64");
 
+    if (g->target.hasHalf)
+        opts.addMacroDef("ISPC_TARGET_HAS_HALF");
+    if (g->target.hasTranscendentals)
+        opts.addMacroDef("ISPC_TARGET_HAS_TRANSCENDENTALS");
+
     opts.addMacroDef("ISPC_MAJOR_VERSION=1");
-    opts.addMacroDef("ISPC_MINOR_VERSION=1");
+    opts.addMacroDef("ISPC_MINOR_VERSION=2");
 
     if (g->includeStdlib) {
         if (g->opt.disableAsserts) 
@@ -1418,7 +1537,7 @@ lAddExtractedGlobals(llvm::Module *module,
     for (unsigned int i = 0; i < globals[firstActive].size(); ++i) {
         RewriteGlobalInfo &rgi = globals[firstActive][i];
         llvm::GlobalVariable *gv = rgi.gv;
-        LLVM_TYPE_CONST llvm::Type *type = gv->getType()->getElementType();
+        llvm::Type *type = gv->getType()->getElementType();
         llvm::Constant *initializer = rgi.init;
 
         // Create a new global in the given model that matches the original
@@ -1482,7 +1601,7 @@ lCreateDispatchFunction(llvm::Module *module, llvm::Function *setISAFunc,
     // we'll start by generating an 'extern' declaration of each one that
     // we have in the current module so that we can then call out to that.
     llvm::Function *targetFuncs[Target::NUM_ISAS];
-    LLVM_TYPE_CONST llvm::FunctionType *ftype = NULL;
+    llvm::FunctionType *ftype = NULL;
 
     for (int i = 0; i < Target::NUM_ISAS; ++i) {
         if (funcs.func[i] == NULL) {
@@ -1490,10 +1609,14 @@ lCreateDispatchFunction(llvm::Module *module, llvm::Function *setISAFunc,
             continue;
         }
 
-        // Grab the type of the function as well.
-        if (ftype != NULL)
-            Assert(ftype == funcs.func[i]->getFunctionType());
-        else
+        // Grab the type of the function as well.  Note that the various
+        // functions will have different types if they have arguments that
+        // are pointers to structs, due to the fact that we mangle LLVM
+        // struct type names with the target vector width.  However,
+        // because we only allow uniform stuff to pass through the
+        // export'ed function layer, they should all have the same memory
+        // layout, so this is benign.
+        if (ftype == NULL)
             ftype = funcs.func[i]->getFunctionType();
 
         targetFuncs[i] = 
@@ -1548,24 +1671,13 @@ lCreateDispatchFunction(llvm::Module *module, llvm::Function *setISAFunc,
         for (; argIter != dispatchFunc->arg_end(); ++argIter)
             args.push_back(argIter);
         if (voidReturn) {
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
             llvm::CallInst::Create(targetFuncs[i], args, "", callBBlock);
-#else
-            llvm::CallInst::Create(targetFuncs[i], args.begin(), args.end(),
-                                   "", callBBlock);
-#endif
             llvm::ReturnInst::Create(*g->ctx, callBBlock);
         }
         else {
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
             llvm::Value *retValue = 
                 llvm::CallInst::Create(targetFuncs[i], args, "ret_value", 
                                        callBBlock);
-#else
-            llvm::Value *retValue = 
-                llvm::CallInst::Create(targetFuncs[i], args.begin(), args.end(),
-                                       "ret_value", callBBlock);
-#endif
             llvm::ReturnInst::Create(*g->ctx, retValue, callBBlock);
         }
 
@@ -1663,13 +1775,11 @@ Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu,
         return errorCount > 0;
     }
     else {
-#ifndef LLVM_2_9
         if (outputType == CXX) {
             Error(SourcePos(), "Illegal to specify more then one target when "
                   "compiling C++ output.");
             return 1;
         }
-#endif // !LLVM_2_9
 
         // The user supplied multiple targets
         std::vector<std::string> targets = lExtractTargets(target);
diff --git a/module.h b/module.h
index 9032548f..d62728c8 100644
--- a/module.h
+++ b/module.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -59,30 +59,33 @@ public:
     int CompileFile();
 
     /** Add a named type definition to the module. */
-    void AddTypeDef(Symbol *sym);
+    void AddTypeDef(const std::string &name, const Type *type,
+                    SourcePos pos);
 
     /** Add a new global variable corresponding to the given Symbol to the
         module.  If non-NULL, initExpr gives the initiailizer expression
         for the global's inital value. */ 
-    void AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst);
+    void AddGlobalVariable(const std::string &name, const Type *type,
+                           Expr *initExpr, bool isConst,
+                           StorageClass storageClass, SourcePos pos);
 
     /** Add a declaration of the function defined by the given function
         symbol to the module. */
-    void AddFunctionDeclaration(Symbol *funSym, bool isInline);
+    void AddFunctionDeclaration(const std::string &name,
+                                const FunctionType *ftype, 
+                                StorageClass sc, bool isInline, SourcePos pos);
 
     /** Adds the function described by the declaration information and the
         provided statements to the module. */
-    void AddFunctionDefinition(Symbol *sym, const std::vector<Symbol *> &args,
-                               Stmt *code);
+    void AddFunctionDefinition(const std::string &name,
+                               const FunctionType *ftype, Stmt *code);
 
     /** After a source file has been compiled, output can be generated in a
         number of different formats. */
     enum OutputType { Asm,      /** Generate text assembly language output */
                       Bitcode,  /** Generate LLVM IR bitcode output */
                       Object,   /** Generate a native object file */
-#ifndef LLVM_2_9
                       CXX,      /** Generate a C++ file */
-#endif // !LLVM_2_9
                       Header    /** Generate a C/C++ header file with 
                                     declarations of 'export'ed functions, global
                                     variables, and the types used by them. */
diff --git a/opt.cpp b/opt.cpp
index 1ebfd4a4..ce455d6f 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -59,9 +59,6 @@
 #include <llvm/Constants.h>
 #include <llvm/Analysis/ConstantFolding.h>
 #include <llvm/Target/TargetLibraryInfo.h>
-#ifdef LLVM_2_9
-    #include <llvm/Support/StandardPasses.h>
-#endif // LLVM_2_9
 #include <llvm/ADT/Triple.h>
 #include <llvm/Transforms/Scalar.h>
 #include <llvm/Transforms/IPO.h>
@@ -72,7 +69,6 @@
 #include <llvm/Analysis/Verifier.h>
 #include <llvm/Analysis/Passes.h>
 #include <llvm/Support/raw_ostream.h>
-#include <llvm/Analysis/DIBuilder.h>
 #include <llvm/Analysis/DebugInfo.h>
 #include <llvm/Support/Dwarf.h>
 #ifdef ISPC_IS_LINUX
@@ -103,6 +99,30 @@ static llvm::Pass *CreateMaskedLoadOptPass();
 static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
 static llvm::Pass *CreateMakeInternalFuncsStaticPass();
 
+#define DEBUG_START_PASS(NAME)                                 \
+    if (g->debugPrint &&                                       \
+        (getenv("FUNC") == NULL ||                             \
+         !strncmp(bb.getParent()->getName().str().c_str(), getenv("FUNC"), \
+                  strlen(getenv("FUNC"))))) {                           \
+        fprintf(stderr, "Start of " NAME "\n");                \
+        fprintf(stderr, "---------------\n");                  \
+        bb.dump();                                             \
+        fprintf(stderr, "---------------\n\n");                \
+    } else /* eat semicolon */
+
+#define DEBUG_END_PASS(NAME)                                   \
+    if (g->debugPrint &&                                       \
+        (getenv("FUNC") == NULL ||                             \
+         !strncmp(bb.getParent()->getName().str().c_str(), getenv("FUNC"), \
+                  strlen(getenv("FUNC"))))) {                           \
+        fprintf(stderr, "End of " NAME " %s\n", modifiedAny ? "** CHANGES **" : ""); \
+        fprintf(stderr, "---------------\n");                  \
+        bb.dump();                                             \
+        fprintf(stderr, "---------------\n\n");                \
+    } else /* eat semicolon */
+
+
+
 ///////////////////////////////////////////////////////////////////////////
 
 
@@ -188,13 +208,8 @@ static llvm::Instruction *
 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, 
           const char *name, llvm::Instruction *insertBefore = NULL) {
     llvm::Value *args[2] = { arg0, arg1 };
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
     return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
-#else
-    return llvm::CallInst::Create(func, &args[0], &args[2],
-                                  name, insertBefore);
-#endif
 }
 
 
@@ -203,13 +218,8 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
           llvm::Value *arg2, const char *name,
           llvm::Instruction *insertBefore = NULL) {
     llvm::Value *args[3] = { arg0, arg1, arg2 };
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
     return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
-#else
-    return llvm::CallInst::Create(func, &args[0], &args[3],
-                                  name, insertBefore);
-#endif
 }
 
 
@@ -219,13 +229,8 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
           llvm::Value *arg2, llvm::Value *arg3, const char *name,
           llvm::Instruction *insertBefore = NULL) {
     llvm::Value *args[4] = { arg0, arg1, arg2, arg3 };
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[4]);
     return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
-#else
-    return llvm::CallInst::Create(func, &args[0], &args[4],
-                                  name, insertBefore);
-#endif
 }
 #endif
 
@@ -234,28 +239,19 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
           llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4,
           const char *name, llvm::Instruction *insertBefore = NULL) {
     llvm::Value *args[5] = { arg0, arg1, arg2, arg3, arg4 };
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[5]);
     return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
-#else
-    return llvm::CallInst::Create(func, &args[0], &args[5],
-                                  name, insertBefore);
-#endif
 }
 
+
 static llvm::Instruction *
 lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, 
           llvm::Value *arg2, llvm::Value *arg3, llvm::Value *arg4,
           llvm::Value *arg5, const char *name, 
           llvm::Instruction *insertBefore = NULL) {
     llvm::Value *args[6] = { arg0, arg1, arg2, arg3, arg4, arg5 };
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[6]);
     return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
-#else
-    return llvm::CallInst::Create(func, &args[0], &args[6],
-                                  name, insertBefore);
-#endif
 }
 
 
@@ -263,14 +259,127 @@ static llvm::Instruction *
 lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name,
          llvm::Instruction *insertBefore) {
     llvm::Value *index[1] = { offset };
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
     return llvm::GetElementPtrInst::Create(ptr, arrayRef, name,
                                            insertBefore);
-#else
-    return llvm::GetElementPtrInst::Create(ptr, &index[0], &index[1],
-                                           name, insertBefore);
+}
+
+
+/** Given a vector of constant values (int, float, or bool) representing an
+    execution mask, convert it to a bitvector where the 0th bit corresponds
+    to the first vector value and so forth.  A lane's bit is set when the
+    high bit of its element is set; at most 64 elements are supported. */
+static uint64_t
+lConstElementsToMask(const llvm::SmallVector<llvm::Constant *, 
+                                             ISPC_MAX_NVEC> &elements) {
+    Assert(elements.size() <= 64);
+
+    uint64_t mask = 0;
+    for (unsigned int i = 0; i < elements.size(); ++i) {
+        llvm::APInt intMaskValue;
+        // SSE has the "interesting" approach of encoding blending
+        // masks as <n x float>, so reinterpret a float's bits as an int.
+        llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i]);
+        if (cf != NULL) {
+            llvm::APFloat apf = cf->getValueAPF();
+            intMaskValue = apf.bitcastToAPInt();
+        }
+        else {
+            // Otherwise the element has to be an integer constant
+            llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
+            Assert(ci != NULL);  // neither ConstantFP nor ConstantInt
+            intMaskValue = ci->getValue();
+        }
+        // Is the high-bit set (i.e. at least one leading one bit)?  If
+        // so, OR in bit i of the result mask
+        if (intMaskValue.countLeadingOnes() > 0)
+            mask |= (1ull << i);
+    }
+    return mask;
+}
+
+
+/** Given an llvm::Value representing a vector mask, see if the value is a
+    constant.  If so, return true and set *mask to the integer found by
+    taking the high bit of each mask element in turn and concatenating
+    them (element 0 in bit 0); e.g. the 4-wide mask
+    < 0xffffffff, 0, 0, 0xffffffff > gives 0b1001 = 9.  Otherwise return
+    false and leave *mask unwritten. */
+static bool
+lGetMask(llvm::Value *factor, uint64_t *mask) {
+#ifndef LLVM_3_0
+    llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
+    if (cdv != NULL) {
+        llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
+        for (int i = 0; i < (int)cdv->getNumElements(); ++i)
+            elements.push_back(cdv->getElementAsConstant(i));
+        *mask = lConstElementsToMask(elements);
+        return true;
+    }
 #endif
+
+    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
+    if (cv != NULL) {
+        llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
+#ifndef LLVM_3_0
+        for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
+            llvm::Constant *c = 
+                llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
+            if (c == NULL)
+                return false;  // operand isn't a constant: mask unknown
+            elements.push_back(c);
+        }
+#else
+        cv->getVectorElements(elements);
+#endif
+        *mask = lConstElementsToMask(elements);
+        return true;
+    }
+    else if (llvm::isa<llvm::ConstantAggregateZero>(factor)) {
+        *mask = 0;
+        return true;
+    }
+    else {
+#if 0
+        llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
+        if (ce != NULL) {
+            llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
+            const llvm::TargetData *td = targetMachine->getTargetData();
+            llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
+            c->dump();
+            factor = c;
+        }
+        // else we should be able to handle it above...
+        Assert(!llvm::isa<llvm::Constant>(factor));
+#endif
+        return false;
+    }
+}
+
+
+enum MaskStatus { ALL_ON, ALL_OFF, MIXED, UNKNOWN };
+
+/** Classifies a mask: UNKNOWN if lGetMask() can't read it as a constant,
+    ALL_OFF if no mask bit is set, ALL_ON if each of the low vecWidth bits
+    is set, and MIXED otherwise. */
+static MaskStatus
+lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
+    uint64_t bits;
+    if (lGetMask(mask, &bits) == false)
+        return UNKNOWN;
+
+    if (bits == 0)
+        return ALL_OFF;
+
+    if (vecWidth == -1)  // default: fall back to the target's vector width
+        vecWidth = g->target.vectorWidth;
+    Assert(vecWidth <= 64);
+
+    for (int i = 0; i < vecWidth; ++i) {
+        if ((bits & (1ull << i)) == 0)
+            return MIXED;
+    }
+    return ALL_ON;
 }
 
 
@@ -284,31 +393,32 @@ Optimize(llvm::Module *module, int optLevel) {
     }
 
     llvm::PassManager optPM;
-    llvm::FunctionPassManager funcPM(module);
+    optPM.add(llvm::createVerifierPass());
 
-    if (g->target.isa != Target::GENERIC) {
-        llvm::TargetLibraryInfo *targetLibraryInfo =
-            new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
-        optPM.add(targetLibraryInfo);
-        optPM.add(new llvm::TargetData(module));
-    }
-
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-    optPM.add(llvm::createIndVarSimplifyPass());
+#if 0
+    std::string err;
+    optPM.add(llvm::createPrintModulePass(new llvm::raw_fd_ostream("-", err)));
 #endif
 
+    llvm::TargetLibraryInfo *targetLibraryInfo =
+        new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
+    optPM.add(targetLibraryInfo);
+    optPM.add(new llvm::TargetData(module));
+
+    optPM.add(llvm::createIndVarSimplifyPass());
+
     if (optLevel == 0) {
         // This is more or less the minimum set of optimizations that we
         // need to do to generate code that will actually run.  (We can't
         // run absolutely no optimizations, since the front-end needs us to
         // take the various __pseudo_* functions it has emitted and turn
         // them into something that can actually execute.
-        optPM.add(llvm::createPromoteMemoryToRegisterPass());
         optPM.add(CreateDetectGSBaseOffsetsPass());
         if (g->opt.disableHandlePseudoMemoryOps == false) {
             optPM.add(CreatePseudoGSToGSPass());
             optPM.add(CreatePseudoMaskedStorePass());
         }
+        optPM.add(CreateIntrinsicsOptPass());
         optPM.add(CreateIsCompileTimeConstantPass(true));
         optPM.add(llvm::createFunctionInliningPass());
         optPM.add(CreateMakeInternalFuncsStaticPass());
@@ -316,17 +426,6 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(llvm::createGlobalDCEPass());
     }
     else {
-        // Otherwise throw the kitchen sink of optimizations at the code.
-        // This is almost certainly overkill and likely could be reduced,
-        // but on the other hand trying to remove some of these has
-        // historically caused performance slowdowns.  Benchmark carefully
-        // if changing these around.
-        //
-        // Note in particular that a number of the ispc optimization
-        // passes are run repeatedly along the way; they often can kick in
-        // only later in the optimization process as things like constant
-        // propagation have done their thing, and then when they do kick
-        // in, they can often open up new opportunities for optimization...
         llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
         llvm::initializeCore(*registry);
         llvm::initializeScalarOpts(*registry);
@@ -338,13 +437,12 @@ Optimize(llvm::Module *module, int optLevel) {
         llvm::initializeInstrumentation(*registry);
         llvm::initializeTarget(*registry);
 
-        bool runSROA = true;
+        optPM.add(llvm::createGlobalDCEPass());
 
         // Early optimizations to try to reduce the total amount of code to
         // work with if we can
         optPM.add(llvm::createReassociatePass());
         optPM.add(llvm::createConstantPropagationPass());
-        optPM.add(llvm::createConstantPropagationPass());
         optPM.add(llvm::createDeadInstEliminationPass());
         optPM.add(llvm::createCFGSimplificationPass());
 
@@ -358,8 +456,7 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(llvm::createDeadInstEliminationPass());
 
         // On to more serious optimizations
-        if (runSROA)
-            optPM.add(llvm::createScalarReplAggregatesPass());
+        optPM.add(llvm::createScalarReplAggregatesPass());
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createCFGSimplificationPass());
         optPM.add(llvm::createPromoteMemoryToRegisterPass());
@@ -381,8 +478,7 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createJumpThreadingPass());
         optPM.add(llvm::createCFGSimplificationPass());
-        if (runSROA)
-            optPM.add(llvm::createScalarReplAggregatesPass());
+        optPM.add(llvm::createScalarReplAggregatesPass());
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createTailCallEliminationPass());
 
@@ -419,51 +515,13 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(CreateIntrinsicsOptPass());
         optPM.add(CreateVSelMovmskOptPass());
 
-#if defined(LLVM_2_9)
-        llvm::createStandardModulePasses(&optPM, 3, 
-                                         false /* opt size */,
-                                         true /* unit at a time */, 
-                                         g->opt.unrollLoops,
-                                         true /* simplify lib calls */,
-                                         false /* may have exceptions */,
-                                         llvm::createFunctionInliningPass());
-        llvm::createStandardLTOPasses(&optPM, true /* internalize pass */,
-                                      true /* inline once again */,
-                                      false /* verify after each pass */);
-        llvm::createStandardFunctionPasses(&optPM, 3);
-
-        optPM.add(CreateIsCompileTimeConstantPass(true));
-        optPM.add(CreateIntrinsicsOptPass());
-        optPM.add(CreateVSelMovmskOptPass());
-
-        llvm::createStandardModulePasses(&optPM, 3, 
-                                         false /* opt size */,
-                                         true /* unit at a time */, 
-                                         g->opt.unrollLoops,
-                                         true /* simplify lib calls */,
-                                         false /* may have exceptions */,
-                                         llvm::createFunctionInliningPass());
-
-#else
-        funcPM.add(llvm::createTypeBasedAliasAnalysisPass());
-        funcPM.add(llvm::createBasicAliasAnalysisPass());
-        funcPM.add(llvm::createCFGSimplificationPass());
-        if (runSROA)
-            funcPM.add(llvm::createScalarReplAggregatesPass());
-        funcPM.add(llvm::createEarlyCSEPass());
-        funcPM.add(llvm::createLowerExpectIntrinsicPass());
-
-        optPM.add(llvm::createTypeBasedAliasAnalysisPass());
-        optPM.add(llvm::createBasicAliasAnalysisPass());
-        optPM.add(llvm::createGlobalOptimizerPass());     
         optPM.add(llvm::createIPSCCPPass());              
         optPM.add(llvm::createDeadArgEliminationPass());  
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createCFGSimplificationPass());   
         optPM.add(llvm::createFunctionInliningPass());
         optPM.add(llvm::createArgumentPromotionPass());   
-        if (runSROA)
-            optPM.add(llvm::createScalarReplAggregatesPass(-1, false));
+        optPM.add(llvm::createScalarReplAggregatesPass(-1, false));
         optPM.add(llvm::createInstructionCombiningPass());  
         optPM.add(llvm::createCFGSimplificationPass());     
         optPM.add(llvm::createReassociatePass());           
@@ -477,57 +535,11 @@ Optimize(llvm::Module *module, int optLevel) {
         if (g->opt.unrollLoops)
             optPM.add(llvm::createLoopUnrollPass());          
         optPM.add(llvm::createGVNPass());                 
-        optPM.add(llvm::createMemCpyOptPass());             
-        optPM.add(llvm::createSCCPPass());                  
-        optPM.add(llvm::createInstructionCombiningPass());
-        optPM.add(llvm::createJumpThreadingPass());         
-        optPM.add(llvm::createCorrelatedValuePropagationPass());
-        optPM.add(llvm::createDeadStoreEliminationPass());  
-        optPM.add(llvm::createAggressiveDCEPass());         
-        optPM.add(llvm::createCFGSimplificationPass());     
-        optPM.add(llvm::createInstructionCombiningPass());  
-        optPM.add(llvm::createStripDeadPrototypesPass()); 
-        optPM.add(llvm::createGlobalDCEPass());         
-        optPM.add(llvm::createConstantMergePass());     
-
-        optPM.add(CreateIsCompileTimeConstantPass(false));
-        optPM.add(CreateIntrinsicsOptPass());
-        optPM.add(CreateVSelMovmskOptPass());
-
-        optPM.add(llvm::createGlobalOptimizerPass());
-        optPM.add(llvm::createGlobalDCEPass()); 
-        optPM.add(llvm::createArgumentPromotionPass());
-        optPM.add(llvm::createInstructionCombiningPass());
-        optPM.add(llvm::createJumpThreadingPass());
-        if (runSROA)
-            optPM.add(llvm::createScalarReplAggregatesPass());
-        optPM.add(llvm::createFunctionAttrsPass()); 
-        optPM.add(llvm::createGlobalsModRefPass()); 
-        optPM.add(llvm::createLICMPass());      
-        optPM.add(llvm::createGVNPass());       
-        optPM.add(llvm::createMemCpyOptPass()); 
-        optPM.add(llvm::createDeadStoreEliminationPass());
-        optPM.add(llvm::createInstructionCombiningPass());
-        optPM.add(llvm::createJumpThreadingPass());
-        optPM.add(llvm::createCFGSimplificationPass());
-        optPM.add(llvm::createGlobalDCEPass());
 
         optPM.add(CreateIsCompileTimeConstantPass(true));
         optPM.add(CreateIntrinsicsOptPass());
         optPM.add(CreateVSelMovmskOptPass());
             
-        optPM.add(llvm::createArgumentPromotionPass());   
-        if (runSROA)
-            optPM.add(llvm::createScalarReplAggregatesPass(-1, false));
-        optPM.add(llvm::createEarlyCSEPass());              
-        optPM.add(llvm::createSimplifyLibCallsPass());    
-        optPM.add(llvm::createJumpThreadingPass());         
-        optPM.add(llvm::createCorrelatedValuePropagationPass()); 
-        optPM.add(llvm::createCFGSimplificationPass());     
-        optPM.add(llvm::createInstructionCombiningPass());  
-        optPM.add(llvm::createCFGSimplificationPass());     
-        optPM.add(llvm::createReassociatePass());           
-        optPM.add(llvm::createGVNPass());                 
         optPM.add(llvm::createMemCpyOptPass());             
         optPM.add(llvm::createSCCPPass());                  
         optPM.add(llvm::createInstructionCombiningPass());
@@ -538,21 +550,14 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(llvm::createCFGSimplificationPass());     
         optPM.add(llvm::createInstructionCombiningPass());  
         optPM.add(llvm::createStripDeadPrototypesPass()); 
+        optPM.add(CreateMakeInternalFuncsStaticPass());
         optPM.add(llvm::createGlobalDCEPass());         
         optPM.add(llvm::createConstantMergePass());     
-#endif
-        optPM.add(CreateMakeInternalFuncsStaticPass());
-        optPM.add(llvm::createGlobalDCEPass());
     }
 
     // Finish up by making sure we didn't mess anything up in the IR along
     // the way.
     optPM.add(llvm::createVerifierPass());
-    
-    for (llvm::Module::iterator fiter = module->begin(); fiter != module->end();
-         ++fiter)
-        funcPM.run(*fiter);
-
     optPM.run(*module);
 
     if (g->debugPrint) {
@@ -596,12 +601,12 @@ private:
         instruction for this optimization pass.
      */
     struct BlendInstruction {
-        BlendInstruction(llvm::Function *f, int ao, int o0, int o1, int of)
+        BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of)
             : function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) { }
         /** Function pointer for the blend instruction */ 
         llvm::Function *function;
         /** Mask value for an "all on" mask for this instruction */
-        int allOnMask;
+        uint64_t allOnMask;
         /** The operand number in the llvm CallInst corresponds to the
             first operand to blend with. */
         int op0;
@@ -631,92 +636,18 @@ IntrinsicsOpt::IntrinsicsOpt()
         llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps);
     maskInstructions.push_back(sseMovmsk);
     maskInstructions.push_back(m->module->getFunction("__movmsk"));
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     llvm::Function *avxMovmsk = 
         llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256);
     Assert(avxMovmsk != NULL);
     maskInstructions.push_back(avxMovmsk);
-#endif
 
     // And all of the blend instructions
     blendInstructions.push_back(BlendInstruction(
         llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse41_blendvps),
         0xf, 0, 1, 2));
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     blendInstructions.push_back(BlendInstruction(
         llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_blendv_ps_256),
         0xff, 0, 1, 2));
-#endif
-}
-
-
-/** Given an llvm::Value represinting a vector mask, see if the value is a
-    constant.  If so, return the integer mask found by taking the high bits
-    of the mask values in turn and concatenating them into a single integer.
-    In other words, given the 4-wide mask: < 0xffffffff, 0, 0, 0xffffffff >, 
-    we have 0b1001 = 9.
- */
-static int
-lGetMask(llvm::Value *factor) {
-    /* FIXME: This will break if we ever do 32-wide compilation, in which case
-       it don't be possible to distinguish between -1 for "don't know" and
-       "known and all bits on". */
-    Assert(g->target.vectorWidth < 32);
-
-#ifdef LLVM_3_1svn
-    llvm::ConstantDataVector *cv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
-#else
-    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
-#endif
-    if (cv) {
-        int mask = 0;
-        llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
-#ifdef LLVM_3_1svn
-        for (int i = 0; i < (int)cv->getNumElements(); ++i)
-            elements.push_back(cv->getElementAsConstant(i));
-#else
-        cv->getVectorElements(elements);
-#endif
-
-        for (unsigned int i = 0; i < elements.size(); ++i) {
-            llvm::APInt intMaskValue;
-            // SSE has the "interesting" approach of encoding blending
-            // masks as <n x float>.
-            llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i]);
-            if (cf) {
-                llvm::APFloat apf = cf->getValueAPF();
-                intMaskValue = apf.bitcastToAPInt();
-            }
-            else {
-                // Otherwise get it as an int
-                llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
-                Assert(ci != NULL);  // vs return -1 if NULL?
-                intMaskValue = ci->getValue();
-            }
-            // Is the high-bit set?  If so, OR in the appropriate bit in
-            // the result mask
-            if (intMaskValue.countLeadingOnes() > 0)
-                mask |= (1 << i);
-        }
-        return mask;
-    }
-    else if (llvm::isa<llvm::ConstantAggregateZero>(factor))
-        return 0;
-    else {
-#if 0
-        llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
-        if (ce != NULL) {
-            llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
-            const llvm::TargetData *td = targetMachine->getTargetData();
-            llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
-            c->dump();
-            factor = c;
-        }
-        // else we should be able to handle it above...
-        Assert(!llvm::isa<llvm::Constant>(factor));
-#endif
-        return -1;
-    }
 }
 
 
@@ -744,7 +675,8 @@ lIsUndef(llvm::Value *value) {
 
 bool
 IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+    DEBUG_START_PASS("IntrinsicsOpt");
+
     llvm::Function *avxMaskedLoad32 = 
         llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskload_ps_256);
     llvm::Function *avxMaskedLoad64 = 
@@ -755,7 +687,6 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
         llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskstore_pd_256);
     Assert(avxMaskedLoad32 != NULL && avxMaskedStore32 != NULL);
     Assert(avxMaskedLoad64 != NULL && avxMaskedStore64 != NULL);
-#endif
 
     bool modifiedAny = false;
  restart:
@@ -797,105 +728,117 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                 goto restart;
             }
 
-            int mask = lGetMask(factor);
-            llvm::Value *value = NULL;
-            if (mask == 0)
-                // Mask all off -> replace with the first blend value
-                value = v[0];
-            else if (mask == blend->allOnMask)
-                // Mask all on -> replace with the second blend value
-                value = v[1];
+            uint64_t mask;
+            if (lGetMask(factor, &mask) == true) {
+                llvm::Value *value = NULL;
+                if (mask == 0)
+                    // Mask all off -> replace with the first blend value
+                    value = v[0];
+                else if (mask == blend->allOnMask)
+                    // Mask all on -> replace with the second blend value
+                    value = v[1];
 
-            if (value != NULL) {
-                llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), 
-                                           iter, value);
-                modifiedAny = true;
-                goto restart;
+                if (value != NULL) {
+                    llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), 
+                                               iter, value);
+                    modifiedAny = true;
+                    goto restart;
+                }
             }
         }
         else if (matchesMaskInstruction(callInst->getCalledFunction())) {
             llvm::Value *factor = callInst->getArgOperand(0);
-            int mask = lGetMask(factor);
-            if (mask != -1) {
+            uint64_t mask;
+            if (lGetMask(factor, &mask) == true) {
                 // If the vector-valued mask has a known value, replace it
                 // with the corresponding integer mask from its elements
                 // high bits.
-                llvm::Value *value = LLVMInt32(mask);
+                llvm::Value *value = (callInst->getType() == LLVMTypes::Int32Type) ?
+                    LLVMInt32(mask) : LLVMInt64(mask);
                 llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
                                            iter, value);
                 modifiedAny = true;
                 goto restart;
             }
         }
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
         else if (callInst->getCalledFunction() == avxMaskedLoad32 ||
                  callInst->getCalledFunction() == avxMaskedLoad64) {
             llvm::Value *factor = callInst->getArgOperand(1);
-            int mask = lGetMask(factor);
-            if (mask == 0) {
-                // nothing being loaded, replace with undef value
-                llvm::Type *returnType = callInst->getType();
-                Assert(llvm::isa<llvm::VectorType>(returnType));
-                llvm::Value *undefValue = llvm::UndefValue::get(returnType);
-                llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
-                                           iter, undefValue);
-                modifiedAny = true;
-                goto restart;
-            }
-            else if (mask == 0xff) {
-                // all lanes active; replace with a regular load
-                llvm::Type *returnType = callInst->getType();
-                Assert(llvm::isa<llvm::VectorType>(returnType));
-                // cast the i8 * to the appropriate type
-                llvm::Value *castPtr = 
-                    new llvm::BitCastInst(callInst->getArgOperand(0),
-                                          llvm::PointerType::get(returnType, 0), 
-                                          "ptr2vec", callInst);
-                lCopyMetadata(castPtr, callInst);
-                int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
-                llvm::Instruction *loadInst = 
-                    new llvm::LoadInst(castPtr, "load", false /* not volatile */,
-                                       align, (llvm::Instruction *)NULL);
-                lCopyMetadata(loadInst, callInst);
-                llvm::ReplaceInstWithInst(callInst, loadInst);
-                modifiedAny = true;
-                goto restart;
+            uint64_t mask;
+            if (lGetMask(factor, &mask) == true) {
+                if (mask == 0) {
+                    // nothing being loaded, replace with undef value
+                    llvm::Type *returnType = callInst->getType();
+                    Assert(llvm::isa<llvm::VectorType>(returnType));
+                    llvm::Value *undefValue = llvm::UndefValue::get(returnType);
+                    llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
+                                               iter, undefValue);
+                    modifiedAny = true;
+                    goto restart;
+                }
+                else if (mask == 0xff) {
+                    // all lanes active; replace with a regular load
+                    llvm::Type *returnType = callInst->getType();
+                    Assert(llvm::isa<llvm::VectorType>(returnType));
+                    // cast the i8 * to the appropriate type
+                    const char *name = LLVMGetName(callInst->getArgOperand(0), "_cast");
+                    llvm::Value *castPtr = 
+                        new llvm::BitCastInst(callInst->getArgOperand(0),
+                                              llvm::PointerType::get(returnType, 0), 
+                                              name, callInst);
+                    lCopyMetadata(castPtr, callInst);
+                    int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
+                    name = LLVMGetName(callInst->getArgOperand(0), "_load");
+                    llvm::Instruction *loadInst = 
+                        new llvm::LoadInst(castPtr, name, false /* not volatile */,
+                                           align, (llvm::Instruction *)NULL);
+                    lCopyMetadata(loadInst, callInst);
+                    llvm::ReplaceInstWithInst(callInst, loadInst);
+                    modifiedAny = true;
+                    goto restart;
+                }
             }
         }
         else if (callInst->getCalledFunction() == avxMaskedStore32 ||
                  callInst->getCalledFunction() == avxMaskedStore64) {
             // NOTE: mask is the 2nd parameter, not the 3rd one!!
             llvm::Value *factor = callInst->getArgOperand(1);
-            int mask = lGetMask(factor);
-            if (mask == 0) {
-                // nothing actually being stored, just remove the inst
-                callInst->eraseFromParent();
-                modifiedAny = true;
-                goto restart;
-            }
-            else if (mask == 0xff) {
-                // all lanes storing, so replace with a regular store
-                llvm::Value *rvalue = callInst->getArgOperand(2);
-                llvm::Type *storeType = rvalue->getType();
-                llvm::Value *castPtr = 
-                    new llvm::BitCastInst(callInst->getArgOperand(0),
-                                          llvm::PointerType::get(storeType, 0), 
-                                          "ptr2vec", callInst);
-                lCopyMetadata(castPtr, callInst);
+            uint64_t mask;
+            if (lGetMask(factor, &mask) == true) {
+                if (mask == 0) {
+                    // nothing actually being stored, just remove the inst
+                    callInst->eraseFromParent();
+                    modifiedAny = true;
+                    goto restart;
+                }
+                else if (mask == 0xff) {
+                    // all lanes storing, so replace with a regular store
+                    llvm::Value *rvalue = callInst->getArgOperand(2);
+                    llvm::Type *storeType = rvalue->getType();
+                    const char *name = LLVMGetName(callInst->getArgOperand(0),
+                                                   "_ptrcast");
+                    llvm::Value *castPtr = 
+                        new llvm::BitCastInst(callInst->getArgOperand(0),
+                                              llvm::PointerType::get(storeType, 0), 
+                                              name, callInst);
+                    lCopyMetadata(castPtr, callInst);
 
-                llvm::StoreInst *storeInst = 
-                    new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
-                int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
-                storeInst->setAlignment(align);
-                lCopyMetadata(storeInst, callInst);
-                llvm::ReplaceInstWithInst(callInst, storeInst);
+                    llvm::StoreInst *storeInst = 
+                        new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
+                    int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
+                    storeInst->setAlignment(align);
+                    lCopyMetadata(storeInst, callInst);
+                    llvm::ReplaceInstWithInst(callInst, storeInst);
 
-                modifiedAny = true;
-                goto restart;
+                    modifiedAny = true;
+                    goto restart;
+                }
             }
         }
-#endif
     }
+
+    DEBUG_END_PASS("IntrinsicsOpt");
+
     return modifiedAny;
 }
 
@@ -951,24 +894,26 @@ char VSelMovmskOpt::ID = 0;
 
 bool
 VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("VSelMovmskOpt");
+
     bool modifiedAny = false;
 
  restart:
     for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
     // vector select wasn't available before 3.1...
-#if defined(LLVM_3_1svn)
+#ifndef LLVM_3_0
         llvm::SelectInst *selectInst = llvm::dyn_cast<llvm::SelectInst>(&*iter);
         if (selectInst != NULL && selectInst->getType()->isVectorTy()) {
             llvm::Value *factor = selectInst->getOperand(0);
-            int mask = lGetMask(factor);
-            int allOnMask = (1 << g->target.vectorWidth) - 1;
+
+            MaskStatus maskStatus = lGetMaskStatus(factor);
             llvm::Value *value = NULL;
-            if (mask == allOnMask)
+            if (maskStatus == ALL_ON)
                 // Mask all on -> replace with the first select value
                 value = selectInst->getOperand(1);
-            else if (mask == 0)
-                // Mask all off -> replace with the second select blend value
-                value = selectInst->getOperand(1);
+            else if (maskStatus == ALL_OFF)
+                // Mask all off -> replace with the second select value
+                value = selectInst->getOperand(2);
 
             if (value != NULL) {
                 llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), 
@@ -977,7 +922,7 @@ VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                 goto restart;
             }
         }
-#endif // LLVM_3_1svn
+#endif // !LLVM_3_0
 
         llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
         if (callInst == NULL)
@@ -987,20 +932,22 @@ VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
         if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk"))
             continue;
 
-        int mask = lGetMask(callInst->getArgOperand(0));
-        if (mask != -1) {
+        uint64_t mask;
+        if (lGetMask(callInst->getArgOperand(0), &mask) == true) {
 #if 0
             fprintf(stderr, "mask %d\n", mask);
             callInst->getArgOperand(0)->dump();
             fprintf(stderr, "-----------\n");
 #endif
             llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), 
-                                       iter, LLVMInt32(mask));
+                                       iter, LLVMInt64(mask));
             modifiedAny = true;
             goto restart;
         }
     }
 
+    DEBUG_END_PASS("VSelMovmskOpt");
+
     return modifiedAny;
 }
 
@@ -1050,7 +997,7 @@ static llvm::Value *
 lCheckForActualPointer(llvm::Value *v) {
     if (v == NULL)
         return NULL;
-    else if (llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(v->getType()))
+    else if (llvm::isa<llvm::PointerType>(v->getType()))
         return v;
     else if (llvm::isa<llvm::PtrToIntInst>(v))
         return v;
@@ -1188,7 +1135,7 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets,
         // Indexing into global arrays can lead to this form, with
         // ConstantVectors..
         llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
         for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
             llvm::Constant *c = 
                 llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
@@ -1247,13 +1194,9 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets,
         }
 
         Assert(base != NULL);
-#ifdef LLVM_2_9
-        *offsets = llvm::ConstantVector::get(delta);
-#else
         llvm::ArrayRef<llvm::Constant *> deltas(&delta[0], 
                                                 &delta[elements.size()]);
         *offsets = llvm::ConstantVector::get(deltas);
-#endif
         return base;
     }
 
@@ -1287,7 +1230,7 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
                        llvm::Value **variableOffset, 
                        llvm::Instruction *insertBefore) {
     if (llvm::isa<llvm::ConstantVector>(vec) ||
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
         llvm::isa<llvm::ConstantDataVector>(vec) ||
 #endif
         llvm::isa<llvm::ConstantAggregateZero>(vec)) {
@@ -1307,12 +1250,13 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
             *constOffset = NULL;
         else
             *constOffset = new llvm::SExtInst(co, sext->getType(), 
-                                              "const_offset_sext", insertBefore);
+                                              LLVMGetName(co, "_sext"),
+                                              insertBefore);
         if (vo == NULL)
             *variableOffset = NULL;
         else
             *variableOffset = new llvm::SExtInst(vo, sext->getType(), 
-                                                 "variable_offset_sext", 
+                                                 LLVMGetName(vo, "_sext"),
                                                  insertBefore);
         return;
     }
@@ -1336,7 +1280,8 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
             else
                 *constOffset = 
                     llvm::BinaryOperator::Create(llvm::Instruction::Add, c0, c1,
-                                                 "const_op", insertBefore);
+                                                 LLVMGetName("add", c0, c1),
+                                                 insertBefore);
 
             if (v0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v0))
                 *variableOffset = v1;
@@ -1345,7 +1290,8 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
             else
                 *variableOffset = 
                     llvm::BinaryOperator::Create(llvm::Instruction::Add, v0, v1,
-                                                 "variable_op", insertBefore);
+                                                 LLVMGetName("add", v0, v1),
+                                                 insertBefore);
             return;
         }
         else if (bop->getOpcode() == llvm::Instruction::Mul) {
@@ -1359,26 +1305,27 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
             if (c0 != NULL && c1 != NULL)
                 *constOffset =
                     llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, c1,
-                                                 "const_mul", insertBefore);
+                                                 LLVMGetName("mul", c0, c1),
+                                                 insertBefore);
             else
                 *constOffset = NULL;
 
             llvm::Value *va = NULL, *vb = NULL, *vc = NULL;
             if (v0 != NULL && c1 != NULL)
                 va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1,
-                                                  "va_mul", insertBefore);
+                                                  LLVMGetName("mul", v0, c1), insertBefore);
             if (c0 != NULL && v1 != NULL)
                 vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1,
-                                                  "vb_mul", insertBefore);
+                                                  LLVMGetName("mul", c0, v1), insertBefore);
             if (v0 != NULL && v1 != NULL)
                 vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1,
-                                                  "vc_mul", insertBefore);
+                                                  LLVMGetName("mul", v0, v1), insertBefore);
 
             
             llvm::Value *vab = NULL;
             if (va != NULL && vb != NULL)
                 vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb,
-                                                   "vab_add", insertBefore);
+                                                   LLVMGetName("add", va, vb), insertBefore);
             else if (va != NULL)
                 vab = va;
             else
@@ -1387,7 +1334,7 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
             if (vab != NULL && vc != NULL)
                 *variableOffset = 
                     llvm::BinaryOperator::Create(llvm::Instruction::Add, vab, vc,
-                                                 "vabc_add", insertBefore);
+                                                 LLVMGetName("add", vab, vc), insertBefore);
             else if (vab != NULL)
                 *variableOffset = vab;
             else
@@ -1408,11 +1355,11 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
    *splat, if so). */
 static bool
 lIsIntegerSplat(llvm::Value *v, int *splat) {
-#ifdef LLVM_3_1svn
+#ifdef LLVM_3_0
+    llvm::ConstantVector *cvec = llvm::dyn_cast<llvm::ConstantVector>(v);
+#else
     llvm::ConstantDataVector *cvec = 
         llvm::dyn_cast<llvm::ConstantDataVector>(v);
-#else
-    llvm::ConstantVector *cvec = llvm::dyn_cast<llvm::ConstantVector>(v);
 #endif
     if (cvec == NULL)
         return false;
@@ -1459,7 +1406,7 @@ lExtract248Scale(llvm::Value *splatOperand, int splatValue,
             *result = 
                 llvm::BinaryOperator::Create(llvm::Instruction::Mul,
                                              splatDiv, otherOperand,
-                                             "add", insertBefore);
+                                             "mul", insertBefore);
             return LLVMInt32(scale);
         }
     }
@@ -1548,7 +1495,7 @@ lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) {
     fprintf(stderr, "\n");
 
     if (llvm::isa<llvm::ConstantVector>(*vec) ||
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
         llvm::isa<llvm::ConstantDataVector>(*vec) ||
 #endif
         llvm::isa<llvm::ConstantAggregateZero>(*vec))
@@ -1689,7 +1636,8 @@ lOffsets32BitSafe(llvm::Value **variableOffsetPtr,
             // do the more general check with lVectorIs32BitInts().
             variableOffset = 
                 new llvm::TruncInst(variableOffset, LLVMTypes::Int32VectorType,
-                                    "trunc_variable_offset", insertBefore);
+                                    LLVMGetName(variableOffset, "_trunc"),
+                                    insertBefore);
         else
             return false;
     }
@@ -1699,7 +1647,7 @@ lOffsets32BitSafe(llvm::Value **variableOffsetPtr,
             // Truncate them so we have a 32-bit vector type for them.
             constOffset = 
                 new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
-                                    "trunc_const_offset", insertBefore);
+                                    LLVMGetName(constOffset, "_trunc"), insertBefore);
         }
         else {
             // FIXME: otherwise we just assume that all constant offsets
@@ -1712,7 +1660,7 @@ lOffsets32BitSafe(llvm::Value **variableOffsetPtr,
             // enough for us in some cases if we call it from here.
             constOffset = 
                 new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
-                                    "trunc_const_offset", insertBefore);
+                                    LLVMGetName(constOffset, "_trunc"), insertBefore);
         }
     }
 
@@ -1738,6 +1686,8 @@ struct GSInfo {
 
 bool
 DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("DetectGSBaseOffsets");
+
     GSInfo gsFuncs[] = {
         GSInfo("__pseudo_gather32_8",  "__pseudo_gather_base_offsets32_8",
                "__pseudo_gather_base_offsets32_8", true),
@@ -1833,7 +1783,7 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         // Cast the base pointer to a void *, since that's what the
         // __pseudo_*_base_offsets_* functions want.
         basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType,
-                                         "base2void", callInst);
+                                         LLVMGetName(basePtr, "_2void"), callInst);
         lCopyMetadata(basePtr, callInst);
 
         llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
@@ -1856,7 +1806,8 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
             // way we can then call ReplaceInstWithInst().
             llvm::Instruction *newCall = 
                 lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale,
-                          constOffset, mask, "newgather", NULL);
+                          constOffset, mask, callInst->getName().str().c_str(),
+                          NULL);
             lCopyMetadata(newCall, callInst);
             llvm::ReplaceInstWithInst(callInst, newCall);
         }
@@ -1877,6 +1828,8 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         goto restart;
     }
 
+    DEBUG_END_PASS("DetectGSBaseOffsets");
+
     return modifiedAny;
 }
 
@@ -1921,6 +1874,8 @@ struct MSInfo {
 
 bool
 MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("MaskedStoreOpt");
+
     MSInfo msInfo[] = {
         MSInfo("__pseudo_masked_store_8",  1),
         MSInfo("__pseudo_masked_store_16", 2),
@@ -1965,10 +1920,8 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         llvm::Value *rvalue  = callInst->getArgOperand(1);
         llvm::Value *mask = callInst->getArgOperand(2);
 
-        int allOnMask = (1 << g->target.vectorWidth) - 1;
-
-        int maskAsInt = lGetMask(mask);
-        if (maskAsInt == 0) {
+        MaskStatus maskStatus = lGetMaskStatus(mask);
+        if (maskStatus == ALL_OFF) {
             // Zero mask - no-op, so remove the store completely.  (This
             // may in turn lead to being able to optimize out instructions
             // that compute the rvalue...)
@@ -1976,11 +1929,10 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
             modifiedAny = true;
             goto restart;
         }
-        else if (maskAsInt == allOnMask) {
+        else if (maskStatus == ALL_ON) {
             // The mask is all on, so turn this into a regular store
-            LLVM_TYPE_CONST llvm::Type *rvalueType = rvalue->getType();
-            LLVM_TYPE_CONST llvm::Type *ptrType = 
-                llvm::PointerType::get(rvalueType, 0);
+            llvm::Type *rvalueType = rvalue->getType();
+            llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
 
             lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
             lCopyMetadata(lvalue, callInst);
@@ -1995,6 +1947,8 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         }
     }
 
+    DEBUG_END_PASS("MaskedStoreOpt");
+
     return modifiedAny;
 }
 
@@ -2035,6 +1989,8 @@ struct MLInfo {
 
 bool
 MaskedLoadOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("MaskedLoadOpt");
+
     MLInfo mlInfo[] = {
         MLInfo("__masked_load_8",  1),
         MLInfo("__masked_load_16", 2),
@@ -2069,20 +2025,18 @@ MaskedLoadOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         // Got one; grab the operands
         llvm::Value *ptr = callInst->getArgOperand(0);
         llvm::Value *mask  = callInst->getArgOperand(1);
-        int allOnMask = (1 << g->target.vectorWidth) - 1;
 
-        int maskAsInt = lGetMask(mask);
-        if (maskAsInt == 0) {
+        MaskStatus maskStatus = lGetMaskStatus(mask);
+        if (maskStatus == ALL_OFF) {
             // Zero mask - no-op, so replace the load with an undef value
             llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
                                        iter, llvm::UndefValue::get(callInst->getType()));
             modifiedAny = true;
             goto restart;
         }
-        else if (maskAsInt == allOnMask) {
+        else if (maskStatus == ALL_ON) {
             // The mask is all on, so turn this into a regular load
-            LLVM_TYPE_CONST llvm::Type *ptrType = 
-                llvm::PointerType::get(callInst->getType(), 0);
+            llvm::Type *ptrType = llvm::PointerType::get(callInst->getType(), 0);
             ptr = new llvm::BitCastInst(ptr, ptrType, "ptr_cast_for_load", 
                                         callInst);
             llvm::Instruction *load = 
@@ -2094,6 +2048,9 @@ MaskedLoadOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
             goto restart;
         }
     }
+
+    DEBUG_END_PASS("MaskedLoadOpt");
+
     return modifiedAny;
 }
 
@@ -2139,17 +2096,17 @@ lIsSafeToBlend(llvm::Value *lvalue) {
     else {
         llvm::AllocaInst *ai = llvm::dyn_cast<llvm::AllocaInst>(lvalue);
         if (ai) {
-            LLVM_TYPE_CONST llvm::Type *type = ai->getType();
-            LLVM_TYPE_CONST llvm::PointerType *pt = 
-                llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
+            llvm::Type *type = ai->getType();
+            llvm::PointerType *pt = 
+                llvm::dyn_cast<llvm::PointerType>(type);
             assert(pt != NULL);
             type = pt->getElementType();
-            LLVM_TYPE_CONST llvm::ArrayType *at;
-            while ((at = llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type))) {
+            llvm::ArrayType *at;
+            while ((at = llvm::dyn_cast<llvm::ArrayType>(type))) {
                 type = at->getElementType();
             }
-            LLVM_TYPE_CONST llvm::VectorType *vt = 
-                llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(type);
+            llvm::VectorType *vt = 
+                llvm::dyn_cast<llvm::VectorType>(type);
             return (vt != NULL && 
                     (int)vt->getNumElements() == g->target.vectorWidth);
         }
@@ -2181,6 +2138,8 @@ struct LMSInfo {
 
 bool
 PseudoMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("PseudoMaskedStorePass");
+
     LMSInfo msInfo[] = {
         LMSInfo("__pseudo_masked_store_8", "__masked_store_blend_8", 
                 "__masked_store_8"),
@@ -2233,6 +2192,8 @@ PseudoMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
         goto restart;
     }
 
+    DEBUG_END_PASS("PseudoMaskedStorePass");
+
     return modifiedAny;
 }
 
@@ -2294,15 +2255,14 @@ struct GatherImpInfo {
 static llvm::Value *
 lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets,
                       llvm::Instruction *insertBefore) {
-    llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets,
-                                                             insertBefore);
+    llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets);
     return lGEPInst(base, firstOffset, "ptr", insertBefore);
 }
 
 
 struct ScatterImpInfo {
     ScatterImpInfo(const char *pName, const char *msName, 
-                   LLVM_TYPE_CONST llvm::Type *vpt, int a)
+                   llvm::Type *vpt, int a)
         : align(a) {
         pseudoFunc = m->module->getFunction(pName);
         maskedStoreFunc = m->module->getFunction(msName);
@@ -2311,13 +2271,15 @@ struct ScatterImpInfo {
     }
     llvm::Function *pseudoFunc;
     llvm::Function *maskedStoreFunc;
-    LLVM_TYPE_CONST llvm::Type *vecPtrType;
+    llvm::Type *vecPtrType;
     const int align;
 };
     
 
 bool
 GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("GSToLoadStorePass");
+
     GatherImpInfo gInfo[] = {
         GatherImpInfo("__pseudo_gather_base_offsets32_8", "__load_and_broadcast_8",
                       "__masked_load_8", 1),
@@ -2441,7 +2403,7 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
                 Debug(pos, "Transformed gather to scalar load and broadcast!");
                 llvm::Instruction *newCall = 
                     lCallInst(gatherInfo->loadBroadcastFunc, ptr, mask, 
-                              "load_braodcast");
+                              LLVMGetName(callInst, "_broadcast"));
                 lCopyMetadata(newCall, callInst);
                 llvm::ReplaceInstWithInst(callInst, newCall);
 
@@ -2479,7 +2441,8 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
                 if (gatherInfo != NULL) {
                     Debug(pos, "Transformed gather to unaligned vector load!");
                     llvm::Instruction *newCall = 
-                        lCallInst(gatherInfo->loadMaskedFunc, ptr, mask, "masked_load");
+                        lCallInst(gatherInfo->loadMaskedFunc, ptr, mask, 
+                                  LLVMGetName(ptr, "_masked_load"));
                     lCopyMetadata(newCall, callInst);
                     llvm::ReplaceInstWithInst(callInst, newCall);
                 }
@@ -2500,6 +2463,8 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
         }
     }
 
+    DEBUG_END_PASS("GSToLoadStorePass");
+
     return modifiedAny;
 }
 
@@ -2544,18 +2509,6 @@ public:
 char GatherCoalescePass::ID = 0;
 
 
-/* Returns true if the mask is known at compile time to be "all on". */ 
-static bool
-lIsMaskAllOn(llvm::Value *mask) {
-    int m = lGetMask(mask);
-    if (m == -1)
-        return false;
-
-    int allOnMask = (1 << g->target.vectorWidth) - 1;
-    return (m == allOnMask);
-}
-
-
 /** Representation of a memory load that the gather coalescing code has
     decided to generate.
  */
@@ -2812,7 +2765,7 @@ lCoalescePerfInfo(const std::vector<llvm::CallInst *> &coalesceGroup,
  */
 llvm::Value *
 lGEPAndLoad(llvm::Value *basePtr, int64_t offset, int align,
-            llvm::Instruction *insertBefore, LLVM_TYPE_CONST llvm::Type *type) {
+            llvm::Instruction *insertBefore, llvm::Type *type) {
     llvm::Value *ptr = lGEPInst(basePtr, LLVMInt64(offset), "new_base",
                                 insertBefore);
     ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(type, 0),
@@ -2866,7 +2819,7 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
         }
         case 4: {
             // 4-wide vector load
-            LLVM_TYPE_CONST llvm::VectorType *vt =
+            llvm::VectorType *vt =
                 llvm::VectorType::get(LLVMTypes::Int32Type, 4);
             loadOps[i].load = lGEPAndLoad(basePtr, start, align,
                                           insertBefore, vt);
@@ -2874,7 +2827,7 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
         }
         case 8: {
             // 8-wide vector load
-            LLVM_TYPE_CONST llvm::VectorType *vt =
+            llvm::VectorType *vt =
                 llvm::VectorType::get(LLVMTypes::Int32Type, 8);
             loadOps[i].load = lGEPAndLoad(basePtr, start, align, 
                                           insertBefore, vt);
@@ -2966,7 +2919,7 @@ lApplyLoad2(llvm::Value *result, const CoalescedLoadOp &load,
             Assert(set[elt] == false && set[elt+1] == false);
 
             // In this case, we bitcast from a 4xi32 to a 2xi64 vector
-            LLVM_TYPE_CONST llvm::Type *vec2x64Type = 
+            llvm::Type *vec2x64Type = 
                 llvm::VectorType::get(LLVMTypes::Int64Type, 2);
             result = new llvm::BitCastInst(result, vec2x64Type, "to2x64",
                                            insertBefore);
@@ -2978,7 +2931,7 @@ lApplyLoad2(llvm::Value *result, const CoalescedLoadOp &load,
                                                      "insert64", insertBefore);
             
             // And back to 4xi32.
-            LLVM_TYPE_CONST llvm::Type *vec4x32Type = 
+            llvm::Type *vec4x32Type = 
                 llvm::VectorType::get(LLVMTypes::Int32Type, 4);
             result = new llvm::BitCastInst(result, vec4x32Type, "to4x32",
                                            insertBefore);
@@ -3058,7 +3011,7 @@ lApplyLoad4(llvm::Value *result, const CoalescedLoadOp &load,
 static llvm::Value *
 lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps, 
                  const int64_t offsets[4], llvm::Instruction *insertBefore) {
-    LLVM_TYPE_CONST llvm::Type *returnType = 
+    llvm::Type *returnType = 
         llvm::VectorType::get(LLVMTypes::Int32Type, 4);
     llvm::Value *result = llvm::UndefValue::get(returnType);
 
@@ -3198,7 +3151,7 @@ lApplyLoad12s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
 static llvm::Value *
 lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps, 
                  const int64_t offsets[4], llvm::Instruction *insertBefore) {
-    LLVM_TYPE_CONST llvm::Type *returnType = 
+    llvm::Type *returnType = 
         llvm::VectorType::get(LLVMTypes::Int32Type, 4);
     llvm::Value *result = llvm::UndefValue::get(returnType);
 
@@ -3285,8 +3238,7 @@ lComputeBasePtr(llvm::CallInst *gatherInst, llvm::Instruction *insertBefore) {
     // All of the variable offsets values should be the same, due to
     // checking for this in GatherCoalescePass::runOnBasicBlock().  Thus,
     // extract the first value and use that as a scalar.
-    llvm::Value *variable = LLVMExtractFirstVectorElement(variableOffsets,
-                                                          insertBefore);
+    llvm::Value *variable = LLVMExtractFirstVectorElement(variableOffsets);
     if (variable->getType() == LLVMTypes::Int64Type)
         offsetScale = new llvm::ZExtInst(offsetScale, LLVMTypes::Int64Type,
                                          "scale_to64", insertBefore);
@@ -3341,7 +3293,7 @@ lCoalesceGathers(const std::vector<llvm::CallInst *> &coalesceGroup) {
     // First, compute the shared base pointer for all of the gathers
     llvm::Value *basePtr = lComputeBasePtr(coalesceGroup[0], insertBefore);
 
-    int elementSize;
+    int elementSize = 0;
     if (coalesceGroup[0]->getType() == LLVMTypes::Int32VectorType)
         elementSize = 4;
     else if (coalesceGroup[0]->getType() == LLVMTypes::Int64VectorType)
@@ -3405,13 +3357,9 @@ lCoalesceGathers(const std::vector<llvm::CallInst *> &coalesceGroup) {
     memory. */
 static bool
 lInstructionMayWriteToMemory(llvm::Instruction *inst) {
-#ifdef LLVM_2_9
-    if (llvm::isa<llvm::StoreInst>(inst))
-#else
     if (llvm::isa<llvm::StoreInst>(inst) ||
         llvm::isa<llvm::AtomicRMWInst>(inst) ||
         llvm::isa<llvm::AtomicCmpXchgInst>(inst))
-#endif // !LLVM_2_9
         // FIXME: we could be less conservative and try to allow stores if
         // we are sure that the pointers don't overlap..
         return true;
@@ -3436,6 +3384,8 @@ lInstructionMayWriteToMemory(llvm::Instruction *inst) {
 
 bool
 GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("GatherCoalescePass");
+
     llvm::Function *gatherFuncs[] = {
         m->module->getFunction("__pseudo_gather_base_offsets32_32"),
         m->module->getFunction("__pseudo_gather_base_offsets64_32"),
@@ -3486,7 +3436,7 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
         // Then and only then do we have a common base pointer with all
         // offsets from that constants (in which case we can potentially
         // coalesce).
-        if (lIsMaskAllOn(mask) == false)
+        if (lGetMaskStatus(mask) != ALL_ON)
             continue;
 
         if (!LLVMVectorValuesAllEqual(variableOffsets))
@@ -3570,6 +3520,8 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
         }
     }
 
+    DEBUG_END_PASS("GatherCoalescePass");
+
     return modifiedAny;
 }
 
@@ -3615,6 +3567,8 @@ struct LowerGSInfo {
 
 bool
 PseudoGSToGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("PseudoGSToGSPass");
+
     LowerGSInfo lgsInfo[] = {
         LowerGSInfo("__pseudo_gather_base_offsets32_8",  "__gather_base_offsets32_i8",  true),
         LowerGSInfo("__pseudo_gather_base_offsets32_16", "__gather_base_offsets32_i16", true),
@@ -3698,6 +3652,8 @@ PseudoGSToGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         goto restart;
     }
 
+    DEBUG_END_PASS("PseudoGSToGSPass");
+
     return modifiedAny;
 }
 
@@ -3739,8 +3695,11 @@ public:
 
 char IsCompileTimeConstantPass::ID = 0;
 
+
 bool
 IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("IsCompileTimeConstantPass");
+
     llvm::Function *funcs[] = {
         m->module->getFunction("__is_compile_time_constant_mask"),
         m->module->getFunction("__is_compile_time_constant_uniform_int32"),
@@ -3798,6 +3757,8 @@ IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         }
     }
 
+    DEBUG_END_PASS("IsCompileTimeConstantPass");
+
     return modifiedAny;
 }
 
@@ -3828,6 +3789,10 @@ public:
     MakeInternalFuncsStaticPass(bool last = false) : ModulePass(ID) {
     }
 
+    void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
+        AU.setPreservesCFG();
+    }
+
     const char *getPassName() const { return "Make internal funcs \"static\""; }
     bool runOnModule(llvm::Module &m);
 };
@@ -3870,6 +3835,7 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
         "__scatter32_i32", "__scatter32_i64",
         "__scatter64_i8", "__scatter64_i16",
         "__scatter64_i32", "__scatter64_i64",
+        "__keep_funcs_live",
     };
 
     bool modifiedAny = false;
diff --git a/parse.yy b/parse.yy
index 1fa8336f..33e19dbc 100644
--- a/parse.yy
+++ b/parse.yy
@@ -173,8 +173,11 @@ struct ForeachDimension {
 }
 
 
-%token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT TOKEN_INT64_CONSTANT
-%token TOKEN_UINT64_CONSTANT TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL
+%token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT 
+%token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT 
+%token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT 
+%token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT
+%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL
 %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL
 %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP 
 %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP
@@ -196,7 +199,7 @@ struct ForeachDimension {
 %token TOKEN_CIF TOKEN_CDO TOKEN_CFOR TOKEN_CWHILE TOKEN_CBREAK
 %token TOKEN_CCONTINUE TOKEN_CRETURN TOKEN_SYNC TOKEN_PRINT TOKEN_ASSERT
 
-%type <expr> primary_expression postfix_expression
+%type <expr> primary_expression postfix_expression integer_dotdotdot
 %type <expr> unary_expression cast_expression funcall_expression launch_expression
 %type <expr> multiplicative_expression additive_expression shift_expression
 %type <expr> relational_expression equality_expression and_expression
@@ -250,6 +253,12 @@ struct ForeachDimension {
 
 string_constant
     : TOKEN_STRING_LITERAL { $$ = new std::string(*yylval.stringVal); }
+    | string_constant TOKEN_STRING_LITERAL
+    {
+        std::string *s = (std::string *)$1;  // reuse the string built so far
+        *s += *yylval.stringVal;             // append the adjacent literal in place
+        $$ = s;                              // same allocation: no copy, $1 not leaked
+    }
     ;
 
 primary_expression
@@ -382,7 +391,7 @@ argument_expression_list
       {
           ExprList *argList = dynamic_cast<ExprList *>($1);
           if (argList == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
               argList = new ExprList(@3);
           }
           argList->exprs.push_back($3);
@@ -540,8 +549,8 @@ rate_qualified_type_specifier
         if ($2 == NULL)
             $$ = NULL;
         else {
-            int soaWidth = $1;
-            const StructType *st = dynamic_cast<const StructType *>($2);
+            int soaWidth = (int)$1;
+            const StructType *st = CastType<StructType>($2);
             if (st == NULL) {
                 Error(@1, "\"soa\" qualifier is illegal with non-struct type \"%s\".",
                       $2->GetString().c_str());
@@ -614,15 +623,17 @@ declaration_statement
     : declaration     
     {
         if ($1 == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(@1, m->errorCount > 0);
             $$ = NULL;
         }
         else if ($1->declSpecs->storageClass == SC_TYPEDEF) {
             for (unsigned int i = 0; i < $1->declarators.size(); ++i) {
                 if ($1->declarators[i] == NULL)
-                    Assert(m->errorCount > 0);
+                    AssertPos(@1, m->errorCount > 0);
                 else
-                    m->AddTypeDef($1->declarators[i]->GetSymbol());
+                    m->AddTypeDef($1->declarators[i]->name,
+                                  $1->declarators[i]->type,
+                                  $1->declarators[i]->pos);
             }
             $$ = NULL;
         }
@@ -778,7 +789,7 @@ init_declarator_list
       {
           std::vector<Declarator *> *dl = (std::vector<Declarator *> *)$1;
           if (dl == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
               dl = new std::vector<Declarator *>;
           }
           if ($3 != NULL)
@@ -801,7 +812,6 @@ storage_class_specifier
     : TOKEN_TYPEDEF { $$ = SC_TYPEDEF; }
     | TOKEN_EXTERN { $$ = SC_EXTERN; }
     | TOKEN_EXTERN TOKEN_STRING_C_LITERAL  { $$ = SC_EXTERN_C; }
-    | TOKEN_EXPORT { $$ = SC_EXPORT; }
     | TOKEN_STATIC { $$ = SC_STATIC; }
     ;
 
@@ -843,9 +853,9 @@ struct_or_union_specifier
     : struct_or_union struct_or_union_name '{' struct_declaration_list '}' 
       {
           if ($4 != NULL) {
-              std::vector<const Type *> elementTypes;
-              std::vector<std::string> elementNames;
-              std::vector<SourcePos> elementPositions;
+              llvm::SmallVector<const Type *, 8> elementTypes;
+              llvm::SmallVector<std::string, 8> elementNames;
+              llvm::SmallVector<SourcePos, 8> elementPositions;
               GetStructTypesNamesPositions(*$4, &elementTypes, &elementNames,
                                            &elementPositions);
               StructType *st = new StructType($2, elementTypes, elementNames,
@@ -859,12 +869,11 @@ struct_or_union_specifier
     | struct_or_union '{' struct_declaration_list '}' 
       {
           if ($3 != NULL) {
-              std::vector<const Type *> elementTypes;
-              std::vector<std::string> elementNames;
-              std::vector<SourcePos> elementPositions;
+              llvm::SmallVector<const Type *, 8> elementTypes;
+              llvm::SmallVector<std::string, 8> elementNames;
+              llvm::SmallVector<SourcePos, 8> elementPositions;
               GetStructTypesNamesPositions(*$3, &elementTypes, &elementNames,
                                            &elementPositions);
-              // FIXME: should be unbound
               $$ = new StructType("", elementTypes, elementNames, elementPositions,
                                   false, Variability::Unbound, @1);
           }
@@ -882,12 +891,11 @@ struct_or_union_specifier
     | struct_or_union struct_or_union_name
       { 
           const Type *st = m->symbolTable->LookupType($2); 
-          if (!st) {
-              std::vector<std::string> alternates = m->symbolTable->ClosestTypeMatch($2);
-              std::string alts = lGetAlternates(alternates);
-              Error(@2, "Struct type \"%s\" unknown.%s", $2, alts.c_str());
+          if (st == NULL) {
+              st = new UndefinedStructType($2, Variability::Unbound, false, @2);
+              m->symbolTable->AddType($2, st, @2);
           }
-          else if (dynamic_cast<const StructType *>(st) == NULL)
+          else if (CastType<StructType>(st) == NULL)
               Error(@2, "Type \"%s\" is not a struct type! (%s)", $2,
                     st->GetString().c_str());
           $$ = st;
@@ -910,7 +918,7 @@ struct_declaration_list
       {
           std::vector<StructDeclaration *> *sdl = (std::vector<StructDeclaration *> *)$1;
           if (sdl == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
               sdl = new std::vector<StructDeclaration *>;
           }
           if ($2 != NULL)
@@ -976,6 +984,11 @@ specifier_qualifier_list
                       "function declarations.");
                 $$ = $2;
             }
+            else if ($1 == TYPEQUAL_EXPORT) {
+                Error(@1, "\"export\" qualifier is illegal outside of "
+                      "function declarations.");
+                $$ = $2;
+            }
             else
                 FATAL("Unhandled type qualifier in parser.");
         }
@@ -1000,7 +1013,7 @@ struct_declarator_list
       {
           std::vector<Declarator *> *sdl = (std::vector<Declarator *> *)$1;
           if (sdl == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
               sdl = new std::vector<Declarator *>;
           }
           if ($3 != NULL)
@@ -1047,7 +1060,7 @@ enum_specifier
               $$ = NULL;
           }
           else {
-              const EnumType *enumType = dynamic_cast<const EnumType *>(type);
+              const EnumType *enumType = CastType<EnumType>(type);
               if (enumType == NULL) {
                   Error(@2, "Type \"%s\" is not an enum type (%s).", $2,
                         type->GetString().c_str());
@@ -1074,7 +1087,7 @@ enumerator_list
       {
           std::vector<Symbol *> *symList = $1;
           if (symList == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
               symList = new std::vector<Symbol *>;
           }
           if ($3 != NULL)
@@ -1108,6 +1121,7 @@ type_qualifier
     | TOKEN_UNIFORM    { $$ = TYPEQUAL_UNIFORM; }
     | TOKEN_VARYING    { $$ = TYPEQUAL_VARYING; }
     | TOKEN_TASK       { $$ = TYPEQUAL_TASK; }
+    | TOKEN_EXPORT     { $$ = TYPEQUAL_EXPORT; }
     | TOKEN_INLINE     { $$ = TYPEQUAL_INLINE; }
     | TOKEN_SIGNED     { $$ = TYPEQUAL_SIGNED; }
     | TOKEN_UNSIGNED   { $$ = TYPEQUAL_UNSIGNED; }
@@ -1160,7 +1174,7 @@ direct_declarator
     : TOKEN_IDENTIFIER
       {
           Declarator *d = new Declarator(DK_BASE, @1);
-          d->sym = new Symbol(yytext, @1);
+          d->name = yytext;
           $$ = d;
       }
     | '(' declarator ')' 
@@ -1335,8 +1349,10 @@ type_name
     {
         if ($1 == NULL || $2 == NULL)
             $$ = NULL;
-        else
-            $$ = $2->GetType($1, NULL);
+        else {
+            $2->InitFromType($1, NULL);
+            $$ = $2->type;
+        }
     }
     ;
 
@@ -1471,7 +1487,7 @@ initializer_list
       {
           ExprList *exprList = $1;
           if (exprList == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
               exprList = new ExprList(@3);
           }
           exprList->exprs.push_back($3);
@@ -1542,7 +1558,7 @@ statement_list
       {
           StmtList *sl = (StmtList *)$1;
           if (sl == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
               sl = new StmtList(@2);
           }
           sl->Add($2);
@@ -1614,11 +1630,34 @@ foreach_active_identifier
     }
     ;
 
+integer_dotdotdot
+    : TOKEN_INT32DOTDOTDOT_CONSTANT {
+        $$ = new ConstExpr(AtomicType::UniformInt32->GetAsConstType(),
+                           (int32_t)yylval.intVal, @1); 
+    }
+    | TOKEN_UINT32DOTDOTDOT_CONSTANT {
+        $$ = new ConstExpr(AtomicType::UniformUInt32->GetAsConstType(),
+                           (uint32_t)yylval.intVal, @1); 
+    }
+    | TOKEN_INT64DOTDOTDOT_CONSTANT {
+        $$ = new ConstExpr(AtomicType::UniformInt64->GetAsConstType(),
+                           (int64_t)yylval.intVal, @1); 
+    }
+    | TOKEN_UINT64DOTDOTDOT_CONSTANT {
+        $$ = new ConstExpr(AtomicType::UniformUInt64->GetAsConstType(),
+                           (uint64_t)yylval.intVal, @1); 
+    }
+    ;
+
 foreach_dimension_specifier
     : foreach_identifier '=' assignment_expression TOKEN_DOTDOTDOT assignment_expression
     {
         $$ = new ForeachDimension($1, $3, $5);
     }
+    | foreach_identifier '=' integer_dotdotdot assignment_expression
+    {
+        $$ = new ForeachDimension($1, $3, $4);
+    }
     ;
 
 foreach_dimension_list
@@ -1631,7 +1670,7 @@ foreach_dimension_list
     {
         std::vector<ForeachDimension *> *dv = $1;
         if (dv == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(@1, m->errorCount > 0);
             dv = new std::vector<ForeachDimension *>;
         }
         if ($3 != NULL)
@@ -1669,7 +1708,7 @@ iteration_statement
      {
          std::vector<ForeachDimension *> *dims = $3;
          if (dims == NULL) {
-             Assert(m->errorCount > 0);
+             AssertPos(@3, m->errorCount > 0);
              dims = new std::vector<ForeachDimension *>;
          }
          for (unsigned int i = 0; i < dims->size(); ++i)
@@ -1679,7 +1718,7 @@ iteration_statement
      {
          std::vector<ForeachDimension *> *dims = $3;
          if (dims == NULL) {
-             Assert(m->errorCount > 0);
+             AssertPos(@3, m->errorCount > 0);
              dims = new std::vector<ForeachDimension *>;
          }
 
@@ -1697,7 +1736,7 @@ iteration_statement
      {
          std::vector<ForeachDimension *> *dims = $3;
          if (dims == NULL) {
-             Assert(m->errorCount > 0);
+             AssertPos(@3, m->errorCount > 0);
              dims = new std::vector<ForeachDimension *>;
          }
 
@@ -1708,7 +1747,7 @@ iteration_statement
      {
          std::vector<ForeachDimension *> *dims = $3;
          if (dims == NULL) {
-             Assert(m->errorCount > 0);
+             AssertPos(@3, m->errorCount > 0);
              dims = new std::vector<ForeachDimension *>;
          }
 
@@ -1804,6 +1843,7 @@ external_declaration
             for (unsigned int i = 0; i < $1->declarators.size(); ++i)
                 lAddDeclaration($1->declSpecs, $1->declarators[i]);
     }
+    | ';'
     ;
 
 function_definition
@@ -1817,11 +1857,18 @@ function_definition
     } 
     compound_statement
     {
-        std::vector<Symbol *> args;
         if ($2 != NULL) {
-            Symbol *sym = $2->GetFunctionInfo($1, &args);
-            if (sym != NULL)
-                m->AddFunctionDefinition(sym, args, $4);
+            $2->InitFromDeclSpecs($1);
+            const FunctionType *funcType = CastType<FunctionType>($2->type);
+            if (funcType == NULL)
+                AssertPos(@1, m->errorCount > 0);
+            else if ($1->storageClass == SC_TYPEDEF)
+                Error(@1, "Illegal \"typedef\" provided with function definition.");
+            else {
+                Stmt *code = $4;
+                if (code == NULL) code = new StmtList(@4);
+                m->AddFunctionDefinition($2->name, funcType, code);
+            }
         }
         m->symbolTable->PopScope(); // push in lAddFunctionParams();
     }
@@ -1931,35 +1978,27 @@ lAddDeclaration(DeclSpecs *ds, Declarator *decl) {
         // Error happened earlier during parsing
         return;
 
+    decl->InitFromDeclSpecs(ds);
     if (ds->storageClass == SC_TYPEDEF)
-        m->AddTypeDef(decl->GetSymbol());
+        m->AddTypeDef(decl->name, decl->type, decl->pos);
     else {
-        const Type *t = decl->GetType(ds);
-        if (t == NULL) {
+        if (decl->type == NULL) {
             Assert(m->errorCount > 0);
             return;
         }
 
-        Symbol *sym = decl->GetSymbol();
-        if (sym == NULL) {
-            Assert(m->errorCount > 0);
-            return;
-        }
-
-        const FunctionType *ft = dynamic_cast<const FunctionType *>(t);
+        decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
+        
+        const FunctionType *ft = CastType<FunctionType>(decl->type);
         if (ft != NULL) {
-            sym->type = ft;
-            sym->storageClass = ds->storageClass;
             bool isInline = (ds->typeQualifiers & TYPEQUAL_INLINE);
-            m->AddFunctionDeclaration(sym, isInline);
+            m->AddFunctionDeclaration(decl->name, ft, ds->storageClass,
+                                      isInline, decl->pos);
         }
         else {
-            if (sym->type == NULL)
-                Assert(m->errorCount > 0);
-            else
-                sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
             bool isConst = (ds->typeQualifiers & TYPEQUAL_CONST) != 0;
-            m->AddGlobalVariable(sym, decl->initExpr, isConst);
+            m->AddGlobalVariable(decl->name, decl->type, decl->initExpr,
+                                 isConst, decl->storageClass, decl->pos);
         }
     }
 }
@@ -1973,7 +2012,7 @@ lAddFunctionParams(Declarator *decl) {
     m->symbolTable->PushScope();
 
     if (decl == NULL) {
-        Assert(m->errorCount > 0);
+        Assert(m->errorCount > 0); // decl is NULL here, so decl->pos must not be read
         return;
     }
 
@@ -1981,27 +2020,24 @@ lAddFunctionParams(Declarator *decl) {
     while (decl->kind != DK_FUNCTION && decl->child != NULL)
         decl = decl->child;
     if (decl->kind != DK_FUNCTION) {
-        Assert(m->errorCount > 0);
+        AssertPos(decl->pos, m->errorCount > 0);
         return;
     }
 
     // now loop over its parameters and add them to the symbol table
     for (unsigned int i = 0; i < decl->functionParams.size(); ++i) {
         Declaration *pdecl = decl->functionParams[i];
-        if (pdecl == NULL || pdecl->declarators.size() == 0)
-            // zero size declarators array corresponds to an anonymous 
-            // parameter
-            continue;
-        Assert(pdecl->declarators.size() == 1);
-        Symbol *sym = pdecl->declarators[0]->GetSymbol();
-        if (sym == NULL || sym->type == NULL)
-            Assert(m->errorCount > 0);
+        Assert(pdecl != NULL && pdecl->declarators.size() == 1);
+        Declarator *declarator = pdecl->declarators[0];
+        if (declarator == NULL)
+            AssertPos(decl->pos, m->errorCount > 0);
         else {
-            sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
+            Symbol *sym = new Symbol(declarator->name, declarator->pos,
+                                     declarator->type, declarator->storageClass);
 #ifndef NDEBUG
             bool ok = m->symbolTable->AddVariable(sym);
             if (ok == false)
-                Assert(m->errorCount > 0);
+                AssertPos(decl->pos, m->errorCount > 0);
 #else
             m->symbolTable->AddVariable(sym);
 #endif
@@ -2064,8 +2100,6 @@ lGetStorageClassString(StorageClass sc) {
         return "";
     case SC_EXTERN:
         return "extern";
-    case SC_EXPORT:
-        return "export";
     case SC_STATIC:
         return "static";
     case SC_TYPEDEF:
@@ -2157,7 +2191,7 @@ lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
         if (enums[i]->constValue != NULL) {
             /* Already has a value, so first update nextVal with it. */
             int count = enums[i]->constValue->AsUInt32(&nextVal);
-            Assert(count == 1);
+            AssertPos(enums[i]->pos, count == 1);
             ++nextVal;
 
             /* When the source file as being parsed, the ConstExpr for any
@@ -2170,7 +2204,7 @@ lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
                                               enums[i]->pos);
             castExpr = Optimize(castExpr);
             enums[i]->constValue = dynamic_cast<ConstExpr *>(castExpr);
-            Assert(enums[i]->constValue != NULL);
+            AssertPos(enums[i]->pos, enums[i]->constValue != NULL);
         }
         else {
             enums[i]->constValue = new ConstExpr(enumType, nextVal++, 
diff --git a/run_tests.py b/run_tests.py
index 724e1037..03fda1ad 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -17,6 +17,10 @@ import shlex
 import platform
 import tempfile
 
+# disable fancy error/warning printing with ANSI colors, so grepping for error
+# messages doesn't get confused
+os.environ["TERM"] = "dumb"
+
 # This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard
 # git history has a workaround for that issue.
 
@@ -28,8 +32,10 @@ parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order
                   default=False, action="store_true")
 parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics",
                   default=None)
+parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)",
+                  default="")
 parser.add_option('-t', '--target', dest='target',
-                  help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16)',
+                  help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16, generic-32, generic-64)',
                   default="sse4")
 parser.add_option('-a', '--arch', dest='arch',
                   help='Set architecture (x86, x86-64)',
@@ -53,6 +59,10 @@ if not is_windows:
 else:
     ispc_exe = "../Release/ispc.exe"
 
+ispc_exe += " " + options.ispc_flags
+
+print ispc_exe
+
 is_generic_target = (options.target.find("generic-") != -1 and
                      options.target != "generic-1")
 if is_generic_target and options.include_file == None:
@@ -65,6 +75,12 @@ if is_generic_target and options.include_file == None:
     elif options.target == "generic-16":
         sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n")
         options.include_file = "examples/intrinsics/generic-16.h"
+    elif options.target == "generic-32":
+        sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-32.h\n")
+        options.include_file = "examples/intrinsics/generic-32.h"
+    elif options.target == "generic-64":
+        sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-64.h\n")
+        options.include_file = "examples/intrinsics/generic-64.h"
 
 if options.compiler_exe == None:
     if is_windows:
diff --git a/stdlib.ispc b/stdlib.ispc
index f5984277..ea4c6b98 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -1,6 +1,6 @@
 // -*- mode: c++ -*-
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -355,7 +355,8 @@ static inline uniform bool all(bool v) {
 #else
     int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask);
 #endif
-    return __movmsk(match) == (1 << programCount) - 1;
+    return __movmsk(match) == ((programCount == 64) ? ~0ull : 
+                               ((1ull << programCount) - 1));
 }
 
 __declspec(safe) 
@@ -388,14 +389,14 @@ __declspec(safe)
 static inline uniform int popcnt(bool v) {
     // As with any() and all(), only count across the active lanes
 #ifdef ISPC_TARGET_GENERIC
-    return __popcnt_int32(__movmsk(v & __mask));
+    return __popcnt_int64(__movmsk(v & __mask));
 #else
-    return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask));
+    return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask));
 #endif
 }
 
 __declspec(safe) 
-static inline uniform int lanemask() {
+static inline uniform unsigned int64 lanemask() {
     return __movmsk(__mask);
 }
 
@@ -746,6 +747,125 @@ static inline void prefetch_nt(const void * varying ptr) {
     }
 }
 
+///////////////////////////////////////////////////////////////////////////
+// non-short-circuiting alternatives
+
+__declspec(safe,cost1)
+static inline bool and(bool a, bool b) {
+    return a && b;
+}
+
+__declspec(safe,cost1)
+static inline uniform bool and(uniform bool a, uniform bool b) {
+    return a && b;
+}
+
+__declspec(safe,cost1)
+static inline bool or(bool a, bool b) {
+    return a || b;
+}
+
+__declspec(safe,cost1)
+static inline uniform bool or(uniform bool a, uniform bool b) {
+    return a || b;
+}
+
+__declspec(safe,cost1)
+static inline int8 select(bool c, int8 a, int8 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int8 select(uniform bool c, int8 a, int8 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform int8 select(uniform bool c, uniform int8 a,
+                                  uniform int8 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int16 select(bool c, int16 a, int16 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int16 select(uniform bool c, int16 a, int16 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform int16 select(uniform bool c, uniform int16 a,
+                                   uniform int16 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int32 select(bool c, int32 a, int32 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int32 select(uniform bool c, int32 a, int32 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform int32 select(uniform bool c, uniform int32 a,
+                                   uniform int32 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int64 select(bool c, int64 a, int64 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int64 select(uniform bool c, int64 a, int64 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform int64 select(uniform bool c, uniform int64 a,
+                                   uniform int64 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline float select(bool c, float a, float b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline float select(uniform bool c, float a, float b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform float select(uniform bool c, uniform float a,
+                                   uniform float b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline double select(bool c, double a, double b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline double select(uniform bool c, double a, double b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform double select(uniform bool c, uniform double a,
+                                    uniform double b) {
+    return c ? a : b;
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // Horizontal ops / reductions
 
@@ -1469,22 +1589,17 @@ static inline void memory_barrier() {
 
 #define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE)                        \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
-    memory_barrier();                                                   \
     TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
-    memory_barrier();                                                   \
     return ret;                                                         \
 }                                                                       \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                                uniform TA value) {      \
-    memory_barrier();                                                   \
     uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
-    memory_barrier();                                                   \
     return ret;                                                         \
 }                                                                       \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
     uniform TA * uniform ptrArray[programCount];                        \
     ptrArray[programIndex] = ptr;                                       \
-    memory_barrier();                                                   \
     TA ret;                                                             \
     __foreach_active (i) {                                              \
         uniform TA * uniform p = ptrArray[i];                           \
@@ -1492,23 +1607,21 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
         uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
         ret = insert(ret, i, r);                                        \
     }                                                                   \
-    memory_barrier();                                                   \
     return ret;                                                         \
 }                                                                       \
 
 #define DEFINE_ATOMIC_SWAP(TA,TB)                                       \
 static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
-    memory_barrier();                                                   \
     uniform int i = 0;                                                  \
     TA ret[programCount];                                               \
     TA memVal;                                                          \
     uniform int lastSwap;                                               \
-    uniform int mask = lanemask();                                      \
+    uniform unsigned int64 mask = lanemask();                           \
     /* First, have the first running program instance (if any) perform  \
        the swap with memory with its value of "value"; record the       \
        value returned. */                                               \
     for (; i < programCount; ++i) {                                     \
-        if ((mask & (1 << i)) == 0)                                     \
+        if ((mask & (1ull << i)) == 0)                                  \
             continue;                                                   \
         memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
         lastSwap = i;                                                   \
@@ -1520,7 +1633,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
        current instance had executed a hardware atomic swap right before \
        the last one that did a swap. */                                 \
     for (; i < programCount; ++i) {                                     \
-        if ((mask & (1 << i)) == 0)                                     \
+        if ((mask & (1ull << i)) == 0)                                  \
             continue;                                                   \
         ret[lastSwap] = extract(value, i);                              \
         lastSwap = i;                                                   \
@@ -1528,20 +1641,16 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
     /* And the last instance that wanted to swap gets the value we      \
        originally got back from memory... */                            \
     ret[lastSwap] = memVal;                                             \
-    memory_barrier();                                                   \
     return ret[programIndex];                                           \
 }                                                                       \
 static inline uniform TA atomic_swap_global(uniform TA * uniform ptr,   \
                                             uniform TA value) {         \
-    memory_barrier();                                                   \
     uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value);   \
-    memory_barrier();                                                   \
     return ret;                                                         \
 }                                                                       \
 static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
     uniform TA * uniform ptrArray[programCount];                        \
     ptrArray[programIndex] = ptr;                                       \
-    memory_barrier();                                                   \
     TA ret;                                                             \
     __foreach_active (i) {                                              \
         uniform TA * uniform p = ptrArray[i];                           \
@@ -1549,7 +1658,6 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
         uniform TA r = __atomic_swap_uniform_##TB##_global(p, v);       \
         ret = insert(ret, i, r);                                        \
     }                                                                   \
-    memory_barrier();                                                   \
     return ret;                                                         \
 }                                                                       \
 
@@ -1557,25 +1665,19 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
     uniform TA oneval = reduce_##OPA(value);                            \
     TA ret;                                                             \
-    if (lanemask() != 0) {                                              \
-        memory_barrier();                                               \
+    if (lanemask() != 0)                                                \
         ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval);      \
-        memory_barrier();                                               \
-    }                                                                   \
     return ret;                                                         \
 }                                                                       \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                                uniform TA value) {      \
-    memory_barrier();                                                   \
     uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
-    memory_barrier();                                                   \
     return ret;                                                         \
 }                                                                       \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
                                        TA value) {                      \
     uniform TA * uniform ptrArray[programCount];                        \
     ptrArray[programIndex] = ptr;                                       \
-    memory_barrier();                                                   \
     TA ret;                                                             \
     __foreach_active (i) {                                              \
         uniform TA * uniform p = ptrArray[i];                           \
@@ -1583,7 +1685,6 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
         uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
         ret = insert(ret, i, r);                                        \
     }                                                                   \
-    memory_barrier();                                                   \
     return ret;                                                         \
 }
 
@@ -1638,25 +1739,20 @@ DEFINE_ATOMIC_SWAP(double,double)
 #define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE)                           \
 static inline uniform TA atomic_compare_exchange_global(               \
          uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
-    memory_barrier();                                                      \
     uniform TA ret =                                                    \
         __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
-    memory_barrier();                                                   \
     return ret;                                                         \
 }                                                                       \
 static inline TA atomic_compare_exchange_global(                           \
          uniform TA * uniform ptr, TA oldval, TA newval) {                 \
-    memory_barrier();                                                      \
     TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval,  \
                                                      (MASKTYPE)__mask);    \
-    memory_barrier();                                                      \
     return ret;                                                            \
 } \
 static inline TA atomic_compare_exchange_global(               \
          uniform TA * varying ptr, TA oldval, TA newval) { \
     uniform TA * uniform ptrArray[programCount];                        \
     ptrArray[programIndex] = ptr;                                       \
-    memory_barrier();                                                   \
     TA ret;                                                             \
     __foreach_active (i) {                                              \
         uniform TA r =                                                  \
@@ -1665,7 +1761,6 @@ static inline TA atomic_compare_exchange_global(               \
                                                             extract(newval, i)); \
         ret = insert(ret, i, r);                                        \
     }                                                                   \
-    memory_barrier();                                                   \
     return ret;                                                         \
 }
 
@@ -1678,6 +1773,49 @@ ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)
 
 #undef ATOMIC_DECL_CMPXCHG
 
+// void * variants of swap and compare exchange
+
+static inline void *atomic_swap_global(void ** uniform ptr,  // the pointer to the slot is uniform-qualified
+                                       void * value) {       // replacement pointer; no uniform qualifier
+    return (void *)atomic_swap_global((intptr_t * uniform)ptr,  // forward to the overload selected by the
+                                      (intptr_t)value);         // intptr_t casts; cast its result back to void *
+}
+
+static inline void * uniform atomic_swap_global(void ** uniform ptr,     // all-uniform variant: uniform ptr,
+                                                void * uniform value) {  // uniform value, uniform result
+    return (void * uniform)atomic_swap_global((intptr_t * uniform)ptr,  // same forwarding, uniform casts
+                                              (uniform intptr_t)value);
+}
+
+static inline void *atomic_swap_global(void ** ptr, void * value) {  // here ptr carries no uniform qualifier
+    return (void *)atomic_swap_global((intptr_t *)ptr,  // same forwarding through intptr_t casts
+                                      (intptr_t)value);
+}
+
+static inline void * // void * wrapper: uniform-qualified ptr; oldval/newval have no uniform qualifier
+atomic_compare_exchange_global(void ** uniform ptr, 
+                               void * oldval, void * newval) {
+    return (void *)atomic_compare_exchange_global((intptr_t * uniform)ptr,  // forward to the overload selected
+                                                  (intptr_t)oldval,         // by the intptr_t casts and cast
+                                                  (intptr_t)newval);        // its result back to void *
+}
+
+static inline void * uniform // all-uniform variant: uniform ptr, oldval, newval and result
+atomic_compare_exchange_global(void ** uniform ptr, void * uniform oldval, 
+                               void * uniform newval) { 
+    return (void * uniform)atomic_compare_exchange_global((intptr_t * uniform)ptr,  // same forwarding,
+                                                          (uniform intptr_t)oldval, // with uniform casts
+                                                          (uniform intptr_t)newval);
+}
+
+static inline void * // variant where ptr carries no uniform qualifier either
+atomic_compare_exchange_global(void ** ptr, void * oldval,
+                               void * newval) {
+    return (void *)atomic_compare_exchange_global((intptr_t *)ptr,  // same forwarding through intptr_t casts
+                                                  (intptr_t)oldval,
+                                                  (intptr_t)newval);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // local atomics
 
@@ -1849,6 +1987,49 @@ LOCAL_CMPXCHG(double)
 #undef LOCAL_ATOMIC
 #undef LOCAL_CMPXCHG
 
+// void * variants of swap and compare exchange
+
+static inline void *atomic_swap_local(void ** uniform ptr,  // the pointer to the slot is uniform-qualified
+                                      void * value) {       // replacement pointer; no uniform qualifier
+    return (void *)atomic_swap_local((intptr_t * uniform)ptr,  // forward to the overload selected by the
+                                      (intptr_t)value);        // intptr_t casts; cast its result back to void *
+}
+
+static inline void * uniform atomic_swap_local(void ** uniform ptr,     // all-uniform variant: uniform ptr,
+                                               void * uniform value) {  // uniform value, uniform result
+    return (void * uniform)atomic_swap_local((intptr_t * uniform)ptr,  // same forwarding, uniform casts
+                                              (uniform intptr_t)value);
+}
+
+static inline void *atomic_swap_local(void ** ptr, void * value) {  // here ptr carries no uniform qualifier
+    return (void *)atomic_swap_local((intptr_t *)ptr,  // same forwarding through intptr_t casts
+                                      (intptr_t)value);
+}
+
+static inline void * // void * wrapper: uniform-qualified ptr; oldval/newval have no uniform qualifier
+atomic_compare_exchange_local(void ** uniform ptr, 
+                              void * oldval, void * newval) {
+    return (void *)atomic_compare_exchange_local((intptr_t * uniform)ptr,  // forward to the overload selected
+                                                  (intptr_t)oldval,        // by the intptr_t casts and cast
+                                                  (intptr_t)newval);       // its result back to void *
+}
+
+static inline void * uniform // all-uniform variant: uniform ptr, oldval, newval and result
+atomic_compare_exchange_local(void ** uniform ptr, void * uniform oldval, 
+                              void * uniform newval) { 
+    return (void * uniform)atomic_compare_exchange_local((intptr_t * uniform)ptr,  // same forwarding,
+                                                          (uniform intptr_t)oldval, // with uniform casts
+                                                          (uniform intptr_t)newval);
+}
+
+static inline void * // variant where ptr carries no uniform qualifier either
+atomic_compare_exchange_local(void ** ptr, void * oldval,
+                              void * newval) {
+    return (void *)atomic_compare_exchange_local((intptr_t *)ptr,  // same forwarding through intptr_t casts
+                                                  (intptr_t)oldval,
+                                                  (intptr_t)newval);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // Transcendentals (float precision)
 
@@ -2735,7 +2916,10 @@ static inline uniform float atan2(uniform float y, uniform float x) {
 
 __declspec(safe)
 static inline float exp(float x_full) {
-    if (__math_lib == __math_lib_svml) {
+    if (__have_native_transcendentals) {
+        return __exp_varying_float(x_full);
+    }
+    else if (__math_lib == __math_lib_svml) {
         return __svml_exp(x_full);
     }
     else if (__math_lib == __math_lib_system) {
@@ -2814,7 +2998,10 @@ static inline float exp(float x_full) {
 
 __declspec(safe)
 static inline uniform float exp(uniform float x_full) {
-    if (__math_lib == __math_lib_system ||
+    if (__have_native_transcendentals) {
+        return __exp_uniform_float(x_full);
+    }
+    else if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
         return __stdlib_expf(x_full);
     }
@@ -2936,7 +3123,10 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo
 
 __declspec(safe)
 static inline float log(float x_full) {
-    if (__math_lib == __math_lib_svml) {
+    if (__have_native_transcendentals) {
+        return __log_varying_float(x_full);
+    }
+    else if (__math_lib == __math_lib_svml) {
         return __svml_log(x_full);
     }
     else if (__math_lib == __math_lib_system) {
@@ -3024,7 +3214,10 @@ static inline float log(float x_full) {
 
 __declspec(safe)
 static inline uniform float log(uniform float x_full) {
-    if (__math_lib == __math_lib_system ||
+    if (__have_native_transcendentals) {
+        return __log_uniform_float(x_full);
+    }
+    else if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
         return __stdlib_logf(x_full);
     }
@@ -3105,7 +3298,10 @@ static inline uniform float log(uniform float x_full) {
 
 __declspec(safe)
 static inline float pow(float a, float b) {
-    if (__math_lib == __math_lib_svml) {
+    if (__have_native_transcendentals) {
+        return __pow_varying_float(a, b);
+    }
+    else if (__math_lib == __math_lib_svml) {
         return __svml_pow(a, b);
     }
     else if (__math_lib == __math_lib_system) {
@@ -3124,6 +3320,9 @@ static inline float pow(float a, float b) {
 
 __declspec(safe)
 static inline uniform float pow(uniform float a, uniform float b) {
+    if (__have_native_transcendentals) {
+        return __pow_uniform_float(a, b);
+    }
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
         return __stdlib_powf(a, b);
@@ -3551,8 +3750,9 @@ static inline int16 float_to_half(float f) {
         //   like recursive filters in DSP - not a typical half-float application. Whether
         //   FP16 denormals are rare in practice, I don't know. Whatever slow path your HW
         //   may or may not have for denormals, this may well hit it.
-        int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask;
-        fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
+        float fscale = floatbits(fint & round_mask) * floatbits(magic);
+        fscale = min(fscale, floatbits((31 << 23) - 0x1000));
+        int32 fint2 = intbits(fscale) - round_mask;
 
         if (fint < f32infty)
             o = fint2 >> 13; // Take the bits!
@@ -3648,6 +3848,133 @@ static inline int16 float_to_half_fast(float f) {
     }
 }
 
+///////////////////////////////////////////////////////////////////////////
+// float -> srgb8
+
+// https://gist.github.com/2246678, from Fabian "rygorous" Giesen.
+//
+// The basic ideas are still the same, only this time, we squeeze
+// everything into the table, even the linear part of the range; since we
+// are approximating the function as piecewise linear anyway, this is
+// fairly easy.
+//
+// In the exact version of the conversion, any value that produces an
+// output float less than 0.5 will be rounded to an integer of
+// zero. Inverting the linear part of the transform, we get:
+//
+//   log2(0.5 / (255 * 12.92)) =~ -12.686
+//
+// which in turn means that any value smaller than about 2^(-12.686) will
+// return 0.  What this means is that we can adapt the clamping code to
+// just clamp to [2^(-13), 1-eps] and we're covered. This means our table
+// needs to cover a range of 13 different exponents from -13 to -1.
+//
+// The table lookup, storage and interpolation work exactly the same way
+// as in the earlier variants in the gist referenced above.
+//
+// Max error for the whole function (integer-rounded result minus "exact"
+// value, as computed in floats using the official formula): 0.544403 at
+// 0x3e9f8000
+
+__declspec(safe)
+static inline int
+float_to_srgb8(float in)  // returns the 8-bit sRGB code (0..255) for a linear value clamped to [2^(-13), 1-eps]
+{
+    static const uniform unsigned int table[104] = {  // 13 exponents x 8 entries; bias in high 16 bits, scale in low 16
+        0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d,
+        0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
+        0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a,
+        0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
+        0x010e0033, 0x01280033, 0x01410033, 0x015b0033,
+        0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
+        0x01dc0067, 0x020f0067, 0x02430067, 0x02760067,
+        0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
+        0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce,
+        0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
+        0x06970158, 0x07420142, 0x07e30130, 0x087b0120,
+        0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
+        0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180,
+        0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
+        0x11070264, 0x1238023e, 0x1357021d, 0x14660201,
+        0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
+        0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad,
+        0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
+        0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392,
+        0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
+        0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5,
+        0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
+        0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d,
+        0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
+        0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f,
+        0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
+    };
+
+    static const uniform unsigned int almost_one = 0x3f7fffff;
+    
+    // Clamp to [2^(-13), 1-eps]; these two inputs yield 0 and 255, respectively.
+    in = max(in, floatbits(0x39000000u));  // 0x39000000 is 2^(-13); a lower bound of 0 would let the index below wrap
+    in = min(in, floatbits(almost_one));
+
+    // Do the table lookup and unpack bias, scale
+    unsigned int tab = table[(intbits(in) - 0x39000000u) >> 20];
+    unsigned int bias = (tab >> 16) << 9;
+    unsigned int scale = tab & 0xffff;
+
+    // Grab next-highest mantissa bits and perform linear interpolation
+    unsigned int t = (intbits(in) >> 12) & 0xff;
+    return (bias + scale*t) >> 16;
+}
+
+
+__declspec(safe)
+static inline uniform int
+float_to_srgb8(uniform float in)  // uniform overload: 8-bit sRGB code (0..255) for a value clamped to [2^(-13), 1-eps]
+{
+    static const uniform unsigned int table[104] = {  // 13 exponents x 8 entries; bias in high 16 bits, scale in low 16
+        0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d,
+        0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
+        0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a,
+        0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
+        0x010e0033, 0x01280033, 0x01410033, 0x015b0033,
+        0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
+        0x01dc0067, 0x020f0067, 0x02430067, 0x02760067,
+        0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
+        0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce,
+        0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
+        0x06970158, 0x07420142, 0x07e30130, 0x087b0120,
+        0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
+        0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180,
+        0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
+        0x11070264, 0x1238023e, 0x1357021d, 0x14660201,
+        0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
+        0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad,
+        0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
+        0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392,
+        0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
+        0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5,
+        0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
+        0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d,
+        0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
+        0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f,
+        0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
+    };
+
+    static const uniform unsigned int almost_one = 0x3f7fffff;
+    
+    // Clamp to [2^(-13), 1-eps]; these two inputs yield 0 and 255, respectively.
+    in = max(in, floatbits(0x39000000u));  // 0x39000000 is 2^(-13); a lower bound of 0 would let the index below wrap
+    in = min(in, floatbits(almost_one));
+
+    // Do the table lookup and unpack bias, scale
+    uniform unsigned int tab = table[(intbits(in) - 0x39000000u) >> 20];
+    uniform unsigned int bias = (tab >> 16) << 9;
+    uniform unsigned int scale = tab & 0xffff;
+
+    // Grab next-highest mantissa bits and perform linear interpolation
+    uniform unsigned int t = (intbits(in) >> 12) & 0xff;
+    return (bias + scale*t) >> 16;
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // RNG stuff
 
@@ -3699,60 +4026,13 @@ static inline uniform float frandom(uniform RNGState * uniform state)
     return floatbits(0x3F800000 | irand)-1.0f;
 }
 
-static inline uniform unsigned int __seed4(varying RNGState * uniform state, 
-                                           uniform int start,
-                                           uniform unsigned int seed) {
-    uniform unsigned int c1 = 0xf0f0f0f0;
-    uniform unsigned int c2 = 0x0f0f0f0f;
-
-    state->z1 = insert(state->z1, start + 0, seed);
-    state->z1 = insert(state->z1, start + 1, seed ^ c1);
-    state->z1 = insert(state->z1, start + 2, (seed << 3) ^ c1);
-    state->z1 = insert(state->z1, start + 3, (seed << 2) ^ c2);
-
-    seed += 131;
-    state->z2 = insert(state->z2, start + 0, seed);
-    state->z2 = insert(state->z2, start + 1, seed ^ c1);
-    state->z2 = insert(state->z2, start + 2, (seed << 3) ^ c1);
-    state->z2 = insert(state->z2, start + 3, (seed << 2) ^ c2);
-
-    seed ^= extract(state->z2, 2);
-    state->z3 = insert(state->z3, start + 0, seed);
-    state->z3 = insert(state->z3, start + 1, seed ^ c1);
-    state->z3 = insert(state->z3, start + 2, (seed << 3) ^ c1);
-    state->z3 = insert(state->z3, start + 3, (seed << 2) ^ c2);
-
-    seed <<= 4;
-    seed += 3;
-    seed ^= extract(state->z1, 3);
-    state->z4 = insert(state->z4, start + 0, seed);
-    state->z4 = insert(state->z4, start + 1, seed ^ c1);
-    state->z4 = insert(state->z4, start + 2, (seed << 3) ^ c1);
-    state->z4 = insert(state->z4, start + 3, (seed << 2) ^ c2);
-
-    return seed;
-}
-
 static inline void seed_rng(varying RNGState * uniform state, 
-                            uniform unsigned int seed) {
-    if (programCount == 1) {
-        state->z1 = seed;
-        state->z2 = seed ^ 0xbeeff00d;
-        state->z3 = ((seed & 0xffff) << 16) | (seed >> 16);
-        state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |
-                     ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24);
-    }
-    else {
-        seed = __seed4(state, 0, seed);
-        if (programCount == 8)
-            __seed4(state, 4, seed ^ 0xbeeff00d);
-        if (programCount == 16) {
-            __seed4(state, 4,  seed ^ 0xbeeff00d);
-            __seed4(state, 8,  ((seed & 0xffff) << 16) | (seed >> 16));
-            __seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |
-                                ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
-        }
-    }
+                            unsigned int seed) {  // seed no longer carries the uniform qualifier
+    state->z1 = seed;                                    // z1: the seed unchanged
+    state->z2 = seed ^ 0xbeeff00d;                       // z2: seed XORed with a fixed constant
+    state->z3 = ((seed & 0xffff) << 16) | (seed >> 16);  // z3: seed with its two 16-bit halves swapped
+    state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |       // z4: seed with its four
+                 ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24);  // bytes in reversed order
 }
 
 static inline void seed_rng(uniform RNGState * uniform state, 
diff --git a/stmt.cpp b/stmt.cpp
index 9aad4291..6049d4e2 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,7 @@
 #include "util.h"
 #include "expr.h"
 #include "type.h"
+#include "func.h"
 #include "sym.h"
 #include "module.h"
 #include "llvmutil.h"
@@ -121,7 +122,7 @@ DeclStmt::DeclStmt(const std::vector<VariableDeclaration> &v, SourcePos p)
 
 static bool
 lHasUnsizedArrays(const Type *type) {
-    const ArrayType *at = dynamic_cast<const ArrayType *>(type);
+    const ArrayType *at = CastType<ArrayType>(type);
     if (at == NULL)
         return false;
 
@@ -139,7 +140,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
 
     for (unsigned int i = 0; i < vars.size(); ++i) {
         Symbol *sym = vars[i].sym;
-        Assert(sym != NULL);
+        AssertPos(pos, sym != NULL);
         if (sym->type == NULL)
             continue;
         Expr *initExpr = vars[i].init;
@@ -167,16 +168,30 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
         }
 
         // References must have initializer expressions as well.
-        if (dynamic_cast<const ReferenceType *>(sym->type) && initExpr == NULL) {
-            Error(sym->pos,
-                  "Must provide initializer for reference-type variable \"%s\".",
-                  sym->name.c_str());
-            continue;
+        if (IsReferenceType(sym->type) == true) {
+            if (initExpr == NULL) {
+                Error(sym->pos, "Must provide initializer for reference-type "
+                      "variable \"%s\".", sym->name.c_str());
+                continue;
+            }
+            if (IsReferenceType(initExpr->GetType()) == false) {
+                const Type *initLVType = initExpr->GetLValueType();
+                if (initLVType == NULL) {
+                    Error(initExpr->pos, "Initializer for reference-type variable "
+                          "\"%s\" must have an lvalue type.", sym->name.c_str());
+                    continue;
+                }
+                if (initLVType->IsUniformType() == false) {
+                    Error(initExpr->pos, "Initializer for reference-type variable "
+                          "\"%s\" must have a uniform lvalue type.", sym->name.c_str());
+                    continue;
+                }
+            }
         }
 
-        LLVM_TYPE_CONST llvm::Type *llvmType = sym->type->LLVMType(g->ctx);
+        llvm::Type *llvmType = sym->type->LLVMType(g->ctx);
         if (llvmType == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
             return;
         }
 
@@ -282,8 +297,8 @@ DeclStmt::TypeCheck() {
         // the int->float type conversion is in there and we don't return
         // an int as the constValue later...
         const Type *type = vars[i].sym->type;
-        if (dynamic_cast<const AtomicType *>(type) != NULL ||
-            dynamic_cast<const EnumType *>(type) != NULL) {
+        if (CastType<AtomicType>(type) != NULL ||
+            CastType<EnumType>(type) != NULL) {
             // If it's an expr list with an atomic type, we'll later issue
             // an error.  Need to leave vars[i].init as is in that case so
             // it is in fact caught later, though.
@@ -463,12 +478,12 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
         lEmitIfStatements(ctx, trueStmts, "if: expr mixed, true statements");
         // under varying control flow,, returns can't stop instruction
         // emission, so this better be non-NULL...
-        Assert(ctx->GetCurrentBasicBlock()); 
+        AssertPos(ctx->GetDebugPos(), ctx->GetCurrentBasicBlock()); 
     }
     if (falseStmts) {
         ctx->SetInternalMaskAndNot(oldMask, test);
         lEmitIfStatements(ctx, falseStmts, "if: expr mixed, false statements");
-        Assert(ctx->GetCurrentBasicBlock());
+        AssertPos(ctx->GetDebugPos(), ctx->GetCurrentBasicBlock());
     }
 }
 
@@ -549,7 +564,7 @@ IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
             (costIsAcceptable || g->opt.disableCoherentControlFlow)) {
             ctx->StartVaryingIf(oldMask);
             emitMaskedTrueAndFalse(ctx, oldMask, ltest);
-            Assert(ctx->GetCurrentBasicBlock());
+            AssertPos(pos, ctx->GetCurrentBasicBlock());
             ctx->EndIf();
         }
         else {
@@ -572,7 +587,7 @@ IfStmt::emitMaskAllOn(FunctionEmitContext *ctx, llvm::Value *ltest,
     // compiler see what's going on so that subsequent optimizations for
     // code emitted here can operate with the knowledge that the mask is
     // definitely all on (until it modifies the mask itself).
-    Assert(!g->opt.disableCoherentControlFlow);
+    AssertPos(pos, !g->opt.disableCoherentControlFlow);
     if (!g->opt.disableMaskAllOnOptimizations)
         ctx->SetInternalMask(LLVMMaskAllOn);
     llvm::Value *oldFunctionMask = ctx->GetFunctionMask();
@@ -622,7 +637,7 @@ IfStmt::emitMaskAllOn(FunctionEmitContext *ctx, llvm::Value *ltest,
     emitMaskedTrueAndFalse(ctx, LLVMMaskAllOn, ltest);
     // In this case, return/break/continue isn't allowed to jump and end
     // emission.
-    Assert(ctx->GetCurrentBasicBlock());
+    AssertPos(pos, ctx->GetCurrentBasicBlock());
     ctx->EndIf();
     ctx->BranchInst(bDone);
 
@@ -651,7 +666,7 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
         // Emit statements for true
         ctx->SetCurrentBasicBlock(bRunTrue);
         lEmitIfStatements(ctx, trueStmts, "if: expr mixed, true statements");
-        Assert(ctx->GetCurrentBasicBlock()); 
+        AssertPos(pos, ctx->GetCurrentBasicBlock()); 
         ctx->BranchInst(bNext);
         ctx->SetCurrentBasicBlock(bNext);
     }
@@ -668,7 +683,7 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
         // Emit code for false
         ctx->SetCurrentBasicBlock(bRunFalse);
         lEmitIfStatements(ctx, falseStmts, "if: expr mixed, false statements");
-        Assert(ctx->GetCurrentBasicBlock());
+        AssertPos(pos, ctx->GetCurrentBasicBlock());
         ctx->BranchInst(bNext);
         ctx->SetCurrentBasicBlock(bNext);
     }
@@ -822,7 +837,7 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
             ctx->SetFunctionMask(LLVMMaskAllOn);
         if (bodyStmts)
             bodyStmts->EmitCode(ctx);
-        Assert(ctx->GetCurrentBasicBlock());
+        AssertPos(pos, ctx->GetCurrentBasicBlock());
         ctx->SetFunctionMask(oldFunctionMask);
         ctx->BranchInst(btest);
 
@@ -830,7 +845,7 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
         ctx->SetCurrentBasicBlock(bMixed);
         if (bodyStmts)
             bodyStmts->EmitCode(ctx);
-        Assert(ctx->GetCurrentBasicBlock());
+        AssertPos(pos, ctx->GetCurrentBasicBlock());
         ctx->BranchInst(btest);
     }
     else {
@@ -971,7 +986,7 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
     // it and then jump into the loop test code.  (Also start a new scope
     // since the initiailizer may be a declaration statement).
     if (init) {
-        Assert(dynamic_cast<StmtList *>(init) == NULL);
+        AssertPos(pos, dynamic_cast<StmtList *>(init) == NULL);
         ctx->StartScope();
         init->EmitCode(ctx);
     }
@@ -1000,7 +1015,7 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
         if (doCoherentCheck)
             Warning(test->pos, "Uniform condition supplied to cfor/cwhile "
                     "statement.");
-        Assert(ltest->getType() == LLVMTypes::BoolType);
+        AssertPos(pos, ltest->getType() == LLVMTypes::BoolType);
         ctx->BranchInst(bloop, bexit, ltest);
     }
     else {
@@ -1036,7 +1051,7 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
             ctx->SetFunctionMask(LLVMMaskAllOn);
         if (stmts)
             stmts->EmitCode(ctx);
-        Assert(ctx->GetCurrentBasicBlock());
+        AssertPos(pos, ctx->GetCurrentBasicBlock());
         ctx->SetFunctionMask(oldFunctionMask);
         ctx->BranchInst(bstep);
 
@@ -1349,8 +1364,8 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
     ctx->SetFunctionMask(LLVMMaskAllOn);
 
     // This should be caught during typechecking
-    Assert(startExprs.size() == dimVariables.size() && 
-           endExprs.size() == dimVariables.size());
+    AssertPos(pos, startExprs.size() == dimVariables.size() && 
+              endExprs.size() == dimVariables.size());
     int nDims = (int)dimVariables.size();
 
     ///////////////////////////////////////////////////////////////////////
@@ -1689,7 +1704,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
         ctx->SetContinueTarget(bbFullBodyContinue);
         ctx->AddInstrumentationPoint("foreach loop body (all on)");
         stmts->EmitCode(ctx);
-        Assert(ctx->GetCurrentBasicBlock() != NULL);
+        AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL);
         ctx->BranchInst(bbFullBodyContinue);
     }
     ctx->SetCurrentBasicBlock(bbFullBodyContinue); {
@@ -2079,7 +2094,7 @@ SwitchStmt::EmitCode(FunctionEmitContext *ctx) const {
 
     const Type *type;
     if (expr == NULL || ((type = expr->GetType()) == NULL)) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return;
     }
 
@@ -2097,7 +2112,7 @@ SwitchStmt::EmitCode(FunctionEmitContext *ctx) const {
 
     llvm::Value *exprValue = expr->GetValue(ctx);
     if (exprValue == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return;
     }
 
@@ -2173,8 +2188,8 @@ SwitchStmt::EstimateCost() const {
 ///////////////////////////////////////////////////////////////////////////
 // ReturnStmt
 
-ReturnStmt::ReturnStmt(Expr *v, bool cc, SourcePos p) 
-    : Stmt(p), val(v), 
+ReturnStmt::ReturnStmt(Expr *e, bool cc, SourcePos p) 
+    : Stmt(p), expr(e), 
       doCoherenceCheck(cc && !g->opt.disableCoherentControlFlow) {
 }
 
@@ -2189,8 +2204,29 @@ ReturnStmt::EmitCode(FunctionEmitContext *ctx) const {
         return;
     }
 
+    // Make sure we're not trying to return a reference to something where
+    // that doesn't make sense
+    const Function *func = ctx->GetFunction();
+    const Type *returnType = func->GetReturnType();
+    if (IsReferenceType(returnType) == true &&
+        IsReferenceType(expr->GetType()) == false) {
+        const Type *lvType = expr->GetLValueType();
+        if (lvType == NULL) {
+            Error(expr->pos, "Illegal to return non-lvalue from function "
+                  "returning reference type \"%s\".",
+                  returnType->GetString().c_str());
+            return;
+        }
+        else if (lvType->IsUniformType() == false) {
+            Error(expr->pos, "Illegal to return varying lvalue type from "
+                  "function returning a reference type \"%s\".",
+                  returnType->GetString().c_str());
+            return;
+        }
+    }
+
     ctx->SetDebugPos(pos);
-    ctx->CurrentLanesReturned(val, doCoherenceCheck);
+    ctx->CurrentLanesReturned(expr, doCoherenceCheck);
 }
 
 
@@ -2210,7 +2246,8 @@ void
 ReturnStmt::Print(int indent) const {
     printf("%*c%sReturn Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
     pos.Print();
-    if (val) val->Print();
+    if (expr)
+        expr->Print();
     else printf("(void)");
     printf("\n");
 }
@@ -2228,6 +2265,9 @@ GotoStmt::GotoStmt(const char *l, SourcePos gotoPos, SourcePos ip)
 
 void
 GotoStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (!ctx->GetCurrentBasicBlock()) 
+        return;
+
     if (ctx->VaryingCFDepth() > 0) {
         Error(pos, "\"goto\" statements are only legal under \"uniform\" "
               "control flow.");
@@ -2241,10 +2281,22 @@ GotoStmt::EmitCode(FunctionEmitContext *ctx) const {
 
     llvm::BasicBlock *bb = ctx->GetLabeledBasicBlock(label);
     if (bb == NULL) {
-        // TODO: use the string distance stuff to suggest alternatives if
-        // there are some with names close to the label name we have here..
-        Error(identifierPos, "No label named \"%s\" found in current function.",
-              label.c_str());
+        /* Label wasn't found. Look for suggestions that are close */
+        std::vector<std::string> labels = ctx->GetLabels();
+        std::vector<std::string> matches = MatchStrings(label, labels);
+        std::string match_output;
+        if (! matches.empty()) {
+            /* Print up to 5 matches. Don't want to spew too much */
+            match_output += "\nDid you mean:";
+            for (unsigned int i=0; i<matches.size() && i<5; i++)
+                match_output += "\n " + matches[i] + "?";
+        }
+
+        /* Label wasn't found. Emit an error */
+        Error(identifierPos, 
+                "No label named \"%s\" found in current function.%s",
+              label.c_str(), match_output.c_str());
+
         return;
     }
 
@@ -2290,7 +2342,7 @@ LabeledStmt::LabeledStmt(const char *n, Stmt *s, SourcePos p)
 void
 LabeledStmt::EmitCode(FunctionEmitContext *ctx) const {
     llvm::BasicBlock *bblock = ctx->GetLabeledBasicBlock(name);
-    Assert(bblock != NULL);
+    AssertPos(pos, bblock != NULL);
 
     // End the current basic block with a jump to our basic block and then
     // set things up for emission to continue there.  Note that the current
@@ -2409,7 +2461,7 @@ lEncodeType(const Type *t) {
     if (Type::Equal(t, AtomicType::VaryingUInt64)) return 'V';
     if (Type::Equal(t, AtomicType::UniformDouble)) return 'd';
     if (Type::Equal(t, AtomicType::VaryingDouble)) return 'D';
-    if (dynamic_cast<const PointerType *>(t) != NULL) {
+    if (CastType<PointerType>(t) != NULL) {
         if (t->IsUniformType())
             return 'p';
         else
@@ -2429,7 +2481,7 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
     if (type == NULL)
         return NULL;
 
-    if (dynamic_cast<const ReferenceType *>(type) != NULL) {
+    if (CastType<ReferenceType>(type) != NULL) {
         expr = new RefDerefExpr(expr, expr->pos);
         type = expr->GetType();
         if (type == NULL)
@@ -2457,7 +2509,7 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
     else {
         argTypes.push_back(t);
 
-        LLVM_TYPE_CONST llvm::Type *llvmExprType = type->LLVMType(g->ctx);
+        llvm::Type *llvmExprType = type->LLVMType(g->ctx);
         llvm::Value *ptr = ctx->AllocaInst(llvmExprType, "print_arg");
         llvm::Value *val = expr->GetValue(ctx);
         if (!val)
@@ -2478,6 +2530,9 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
  */
 void
 PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (!ctx->GetCurrentBasicBlock()) 
+        return;
+
     ctx->SetDebugPos(pos);
 
     // __do_print takes 5 arguments; we'll get them stored in the args[] array
@@ -2494,7 +2549,7 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
     std::string argTypes;
 
     if (values == NULL) {
-        LLVM_TYPE_CONST llvm::Type *ptrPtrType = 
+        llvm::Type *ptrPtrType = 
             llvm::PointerType::get(LLVMTypes::VoidPointerType, 0);
         args[4] = llvm::Constant::getNullValue(ptrPtrType);
     }
@@ -2506,7 +2561,7 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
         int nArgs = elist ? elist->exprs.size() : 1;
 
         // Allocate space for the array of pointers to values to be printed 
-        LLVM_TYPE_CONST llvm::Type *argPtrArrayType = 
+        llvm::Type *argPtrArrayType = 
             llvm::ArrayType::get(LLVMTypes::VoidPointerType, nArgs);
         llvm::Value *argPtrArray = ctx->AllocaInst(argPtrArrayType,
                                                    "print_arg_ptrs");
@@ -2542,7 +2597,7 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
 
     // Now we can emit code to call __do_print()
     llvm::Function *printFunc = m->module->getFunction("__do_print");
-    Assert(printFunc);
+    AssertPos(pos, printFunc);
 
     llvm::Value *mask = ctx->GetFullMask();
     // Set up the rest of the parameters to it
@@ -2583,6 +2638,9 @@ AssertStmt::AssertStmt(const std::string &msg, Expr *e, SourcePos p)
 
 void
 AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (!ctx->GetCurrentBasicBlock()) 
+        return;
+
     if (expr == NULL)
         return;
     const Type *type = expr->GetType();
@@ -2595,7 +2653,7 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
     llvm::Function *assertFunc = 
         isUniform ? m->module->getFunction("__do_assert_uniform") :
                     m->module->getFunction("__do_assert_varying");
-    Assert(assertFunc != NULL);
+    AssertPos(pos, assertFunc != NULL);
 
     char *errorString;
     if (asprintf(&errorString, "%s:%d:%d: Assertion failed: %s\n", 
@@ -2658,20 +2716,23 @@ DeleteStmt::DeleteStmt(Expr *e, SourcePos p)
 
 void
 DeleteStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (!ctx->GetCurrentBasicBlock()) 
+        return;
+
     const Type *exprType;
     if (expr == NULL || ((exprType = expr->GetType()) == NULL)) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return;
     }
 
     llvm::Value *exprValue = expr->GetValue(ctx);
     if (exprValue == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return;
     }
 
     // Typechecking should catch this
-    Assert(dynamic_cast<const PointerType *>(exprType) != NULL);
+    AssertPos(pos, CastType<PointerType>(exprType) != NULL);
 
     if (exprType->IsUniformType()) {
         // For deletion of a uniform pointer, we just need to cast the
@@ -2680,7 +2741,7 @@ DeleteStmt::EmitCode(FunctionEmitContext *ctx) const {
         exprValue = ctx->BitCastInst(exprValue, LLVMTypes::VoidPointerType,
                                      "ptr_to_void");
         llvm::Function *func = m->module->getFunction("__delete_uniform");
-        Assert(func != NULL);
+        AssertPos(pos, func != NULL);
 
         ctx->CallInst(func, NULL, exprValue, "");
     }
@@ -2690,7 +2751,7 @@ DeleteStmt::EmitCode(FunctionEmitContext *ctx) const {
         // only need to extend to 64-bit values on 32-bit targets before
         // calling it.
         llvm::Function *func = m->module->getFunction("__delete_varying");
-        Assert(func != NULL);
+        AssertPos(pos, func != NULL);
         if (g->target.is32Bit)
             exprValue = ctx->ZExtInst(exprValue, LLVMTypes::Int64VectorType,
                                       "ptr_to_64");
@@ -2711,7 +2772,7 @@ DeleteStmt::TypeCheck() {
     if (expr == NULL || ((exprType = expr->GetType()) == NULL))
         return NULL;
 
-    if (dynamic_cast<const PointerType *>(exprType) == NULL) {
+    if (CastType<PointerType>(exprType) == NULL) {
         Error(pos, "Illegal to delete non-pointer type \"%s\".",
               exprType->GetString().c_str());
         return NULL;
@@ -2743,7 +2804,7 @@ DeleteStmt::EstimateCost() const {
 Stmt *
 CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
     if (iterSym == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
         return NULL;
     }
 
@@ -2770,11 +2831,11 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
     // First, call __movmsk(__mask)) to get the mask as a set of bits.
     // This should be hoisted out of the loop
     Symbol *maskSym = m->symbolTable->LookupVariable("__mask");
-    Assert(maskSym != NULL);
+    AssertPos(pos, maskSym != NULL);
     Expr *maskVecExpr = new SymbolExpr(maskSym, pos);
     std::vector<Symbol *> mmFuns;
     m->symbolTable->LookupFunction("__movmsk", &mmFuns);
-    Assert(mmFuns.size() == 2);
+    AssertPos(pos, mmFuns.size() == (g->target.maskBitCount == 32 ? 2 : 1));
     FunctionSymbolExpr *movmskFunc = new FunctionSymbolExpr("__movmsk", mmFuns,
                                                             pos);
     ExprList *movmskArgs = new ExprList(maskVecExpr, pos);
@@ -2782,7 +2843,7 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
                                                         pos);
 
     // Compute the per lane mask to test the mask bits against: (1 << iter)
-    ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, 1,
+    ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt64, int64_t(1),
                                        iterSym->pos);
     Expr *shiftLaneExpr = new BinaryExpr(BinaryExpr::Shl, oneExpr, symExpr, 
                                          pos);
@@ -2802,4 +2863,3 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
     // And return a for loop that wires it all together.
     return new ForStmt(initStmt, testExpr, stepStmt, laneCheckIf, false, pos);
 }
-
diff --git a/stmt.h b/stmt.h
index da418ec7..88115ab2 100644
--- a/stmt.h
+++ b/stmt.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -265,7 +265,7 @@ public:
     statement in the program. */
 class ReturnStmt : public Stmt {
 public:
-    ReturnStmt(Expr *v, bool cc, SourcePos p);
+    ReturnStmt(Expr *e, bool cc, SourcePos p);
 
     void EmitCode(FunctionEmitContext *ctx) const;
     void Print(int indent) const;
@@ -273,7 +273,7 @@ public:
     Stmt *TypeCheck();
     int EstimateCost() const;
 
-    Expr *val;
+    Expr *expr;
     /** This indicates whether the generated code will check to see if no
         more program instances are currently running after the return, in
         which case the code can possibly jump to the end of the current
diff --git a/sym.cpp b/sym.cpp
index f60dc1aa..42d1f66f 100644
--- a/sym.cpp
+++ b/sym.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -56,12 +56,6 @@ Symbol::Symbol(const std::string &n, SourcePos p, const Type *t,
 }
 
 
-std::string
-Symbol::MangledName() const {
-    return name + type->Mangle();
-}
-
-
 ///////////////////////////////////////////////////////////////////////////
 // SymbolTable
 
@@ -72,27 +66,31 @@ SymbolTable::SymbolTable() {
 
 SymbolTable::~SymbolTable() {
     // Otherwise we have mismatched push/pop scopes
-    Assert(variables.size() == 1 && types.size() == 1);
+    Assert(variables.size() == 1);
     PopScope();
 }
 
 
 void
 SymbolTable::PushScope() { 
-    variables.push_back(new SymbolMapType);
-    types.push_back(new TypeMapType);
+    SymbolMapType *sm;
+    if (freeSymbolMaps.size() > 0) {
+        sm = freeSymbolMaps.back();
+        freeSymbolMaps.pop_back();
+        sm->erase(sm->begin(), sm->end());
+    }
+    else
+        sm = new SymbolMapType;
+
+    variables.push_back(sm);
 }
 
 
 void
 SymbolTable::PopScope() { 
     Assert(variables.size() > 1);
-    delete variables.back();
+    freeSymbolMaps.push_back(variables.back());
     variables.pop_back();
-
-    Assert(types.size() > 1);
-    delete types.back();
-    types.pop_back();
 }
 
 
@@ -147,7 +145,7 @@ SymbolTable::LookupVariable(const char *name) {
 
 bool
 SymbolTable::AddFunction(Symbol *symbol) {
-    const FunctionType *ft = dynamic_cast<const FunctionType *>(symbol->type);
+    const FunctionType *ft = CastType<FunctionType>(symbol->type);
     Assert(ft != NULL);
     if (LookupFunction(symbol->name.c_str(), ft) != NULL)
         // A function of the same name and type has already been added to
@@ -192,26 +190,17 @@ SymbolTable::LookupFunction(const char *name, const FunctionType *type) {
 
 bool
 SymbolTable::AddType(const char *name, const Type *type, SourcePos pos) {
-    // Like AddVariable(), we go backwards through the type maps, working
-    // from innermost scope to outermost.
-    for (int i = types.size()-1; i >= 0; --i) {
-        TypeMapType &sm = *(types[i]);
-        if (sm.find(name) != sm.end()) {
-            if (i == (int)types.size() - 1) {
-                Error(pos, "Ignoring redefinition of type \"%s\".", name);
-                return false;
-            }
-            else {
-                Warning(pos, "Type \"%s\" shadows type declared in outer scope.", name);
-                TypeMapType &sm = *(types.back());
-                sm[name] = type;
-                return true;
-            }
-        }
+    const Type *t = LookupType(name);
+    if (t != NULL && CastType<UndefinedStructType>(t) == NULL) {
+        // If we have a previous declaration of anything other than an
+        // UndefinedStructType with this struct name, issue an error.  If
+        // we have an UndefinedStructType, then we'll fall through to the
+        // code below that adds the definition to the type map.
+        Error(pos, "Ignoring redefinition of type \"%s\".", name);
+        return false;
     }
 
-    TypeMapType &sm = *(types.back());
-    sm[name] = type;
+    types[name] = type;
     return true;
 }
 
@@ -219,11 +208,9 @@ SymbolTable::AddType(const char *name, const Type *type, SourcePos pos) {
 const Type *
 SymbolTable::LookupType(const char *name) const {
-    // Again, search through the type maps backward to get scoping right.
+    // Types live in a single, unscoped map, so one direct lookup suffices.
-    for (int i = types.size()-1; i >= 0; --i) {
-        TypeMapType &sm = *(types[i]);
-        if (sm.find(name) != sm.end())
-            return sm[name];
-    }
+    TypeMapType::const_iterator iter = types.find(name);
+    if (iter != types.end())
+        return iter->second;
     return NULL;
 }
 
@@ -288,21 +275,19 @@ SymbolTable::closestTypeMatch(const char *str, bool structsVsEnums) const {
     const int maxDelta = 2;
     std::vector<std::string> matches[maxDelta+1];
 
-    for (unsigned int i = 0; i < types.size(); ++i) {
-        TypeMapType::const_iterator iter;
-        for (iter = types[i]->begin(); iter != types[i]->end(); ++iter) {
-            // Skip over either StructTypes or EnumTypes, depending on the
-            // value of the structsVsEnums parameter
-            bool isEnum = (dynamic_cast<const EnumType *>(iter->second) != NULL);
-            if (isEnum && structsVsEnums)
-                continue;
-            else if (!isEnum && !structsVsEnums)
-                continue;
+    TypeMapType::const_iterator iter;
+    for (iter = types.begin(); iter != types.end(); ++iter) {
+        // Skip over either StructTypes or EnumTypes, depending on the
+        // value of the structsVsEnums parameter
+        bool isEnum = (CastType<EnumType>(iter->second) != NULL);
+        if (isEnum && structsVsEnums)
+            continue;
+        else if (!isEnum && !structsVsEnums)
+            continue;
 
-            int dist = StringEditDistance(str, iter->first, maxDelta+1);
-            if (dist <= maxDelta)
-                matches[dist].push_back(iter->first);
-        }
+        int dist = StringEditDistance(str, iter->first, maxDelta+1);
+        if (dist <= maxDelta)
+            matches[dist].push_back(iter->first);
     }
 
     for (int i = 0; i <= maxDelta; ++i) {
@@ -342,16 +327,12 @@ SymbolTable::Print() {
 
     depth = 0;
     fprintf(stderr, "Named types:\n---------------\n");
-    for (unsigned int i = 0; i < types.size(); ++i) {
-        TypeMapType &sm = *types[i];
-        TypeMapType::iterator siter = sm.begin();
-        while (siter != sm.end()) {
-            fprintf(stderr, "%*c", depth, ' ');
-            fprintf(stderr, "%s -> %s\n", siter->first.c_str(),
-                    siter->second->GetString().c_str());
-            ++siter;
-        }
-        depth += 4;
+    TypeMapType::iterator siter = types.begin();
+    while (siter != types.end()) {
+        fprintf(stderr, "%*c", depth, ' ');
+        fprintf(stderr, "%s -> %s\n", siter->first.c_str(),
+                siter->second->GetString().c_str());
+        ++siter;
     }
 }
 
@@ -382,14 +363,16 @@ SymbolTable::RandomSymbol() {
 
 const Type *
 SymbolTable::RandomType() {
-    int v = ispcRand() % types.size();
-    if (types[v]->size() == 0)
-        return NULL;
-    int count = ispcRand() % types[v]->size();
-    TypeMapType::iterator iter = types[v]->begin();
+    // Nothing to pick from: return NULL rather than dereferencing end().
+    if (types.empty())
+        return NULL;
+    // Advance a randomly chosen number of steps in [0, size-1] so that the
+    // iterator always stops on a valid entry.
+    int count = ispcRand() % types.size();
+    TypeMapType::iterator iter = types.begin();
     while (count-- > 0) {
         ++iter;
-        Assert(iter != types[v]->end());
+        Assert(iter != types.end());
     }
     return iter->second;
 }
diff --git a/sym.h b/sym.h
index fa452326..07bbe187 100644
--- a/sym.h
+++ b/sym.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -67,15 +67,8 @@ public:
     Symbol(const std::string &name, SourcePos pos, const Type *t = NULL,
            StorageClass sc = SC_NONE);
 
-    /** This method should only be called for function symbols; for them,
-        it returns a mangled version of the function name with the argument
-        types encoded into the returned name.  This is used to generate
-        unique symbols in object files for overloaded functions.
-     */
-    std::string MangledName() const;
-
     SourcePos pos;            /*!< Source file position where the symbol was defined */
-    const std::string name;   /*!< Symbol's name */
+    std::string name;         /*!< Symbol's name */
     llvm::Value *storagePtr;  /*!< For symbols with storage associated with
                                    them (i.e. variables but not functions),
                                    this member stores a pointer to its
@@ -208,6 +201,9 @@ public:
     /** Adds the named type to the symbol table.  This is used for both
         struct definitions (where <tt>struct Foo</tt> causes type \c Foo to
         be added to the symbol table) as well as for <tt>typedef</tt>s.
+        If \c name was previously registered only through a forward
+        declaration ("struct Foo;") and is thus an UndefinedStructType,
+        this method replaces that entry with the type provided here.
 
         @param name Name of the type to be added
         @param type Type that \c name represents
@@ -264,6 +260,8 @@ private:
     typedef std::map<std::string, Symbol *> SymbolMapType;
     std::vector<SymbolMapType *> variables;
 
+    std::vector<SymbolMapType *> freeSymbolMaps;
+
     /** Function declarations are *not* scoped.  (C99, for example, allows
         an implementation to maintain function declarations in a single
         namespace.)  A STL \c vector is used to store the function symbols
@@ -272,12 +270,10 @@ private:
     typedef std::map<std::string, std::vector<Symbol *> > FunctionMapType;
     FunctionMapType functions;
 
-    /** Type definitions can also be scoped.  A new \c TypeMapType
-        is added to the back of the \c types \c vector each time a new scope
-        is entered.  (And it's removed when the scope exits).
+    /** Type definitions can't currently be scoped.
      */
     typedef std::map<std::string, const Type *> TypeMapType;
-    std::vector<TypeMapType *> types;
+    TypeMapType types;
 };
 
 
diff --git a/test_static.cpp b/test_static.cpp
index a8ec4a79..e798f960 100644
--- a/test_static.cpp
+++ b/test_static.cpp
@@ -102,15 +102,21 @@ void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
 
 int main(int argc, char *argv[]) {
     int w = width();
-    assert(w <= 16);
+    assert(w <= 64);
 
-    float returned_result[16];
-    for (int i = 0; i < 16; ++i)
+    float returned_result[64];
+    float vfloat[64];
+    double vdouble[64];
+    int vint[64], vint2[64];
+
+    for (int i = 0; i < 64; ++i) {
         returned_result[i] = -1e20;
-    float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
-    double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
-    int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
-    int vint2[16] = { 5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
+        vfloat[i] = i+1;
+        vdouble[i] = i+1;
+        vint[i] = 2*(i+1);
+        vint2[i] = i+5;
+    }
+
     float b = 5.;
 
 #if (TEST_SIG == 0)
@@ -131,8 +137,8 @@ int main(int argc, char *argv[]) {
 #error "Unknown or unset TEST_SIG value"
 #endif    
 
-    float expected_result[16];
-    memset(expected_result, 0, 16*sizeof(float));
+    float expected_result[64];
+    memset(expected_result, 0, 64*sizeof(float));
     result(expected_result);
 
     int errors = 0;
diff --git a/tests/acos.ispc b/tests/acos.ispc
index 45173782..225fda97 100644
--- a/tests/acos.ispc
+++ b/tests/acos.ispc
@@ -6,14 +6,16 @@ bool ok(float x, float ref) { return (abs(x - ref) < 1e-6) || abs((x-ref)/ref) <
 
 export void f_v(uniform float RET[]) {
     uniform float vals[8] = { 0, 1, 0.5, -1, -.87, -.25, 1e-3, -.99999999 };
+    // r must hold all 8 results: the foreach and the check loop below both
+    // index r[0..7] regardless of programCount.
     uniform float r[8];
     foreach (i = 0 ... 8)
-        r[i] = cos(acos(vals[i]));
+        r[i] = cos(acos(vals[i % 8]));
 
     int errors = 0;
     for (uniform int i = 0; i < 8; ++i) {
-        if (ok(r[i], vals[i]) == false) {
-            print("error @ %: got %, expected %\n", i, r[i], vals[i]);
+        if (ok(r[i], vals[i%8]) == false) {
+            print("error @ %: got %, expected %\n", i, r[i], vals[i%8]);
             ++errors;
         }
     }
diff --git a/tests/aossoa-1.ispc b/tests/aossoa-1.ispc
index 6323d80f..59964d6d 100644
--- a/tests/aossoa-1.ispc
+++ b/tests/aossoa-1.ispc
@@ -3,7 +3,9 @@ export uniform int width() { return programCount; }
 
 export void f_v(uniform float RET[]) {
 #define width 3
-#define maxProgramCount 16
+#define maxProgramCount 64
+    assert(programCount <= maxProgramCount);
+
 //CO    const uniform int width = 3;
 //CO    const uniform int maxProgramCount = 16;
     uniform float a[width*maxProgramCount], r[width*maxProgramCount];
diff --git a/tests/aossoa-2.ispc b/tests/aossoa-2.ispc
index b23a25e4..9ff82226 100644
--- a/tests/aossoa-2.ispc
+++ b/tests/aossoa-2.ispc
@@ -3,7 +3,9 @@ export uniform int width() { return programCount; }
 
 export void f_v(uniform float RET[]) {
 #define width 4
-#define maxProgramCount 16
+#define maxProgramCount 64
+    assert(programCount <= maxProgramCount);
+
 //CO    const uniform int width = 4;
 //CO    const uniform int maxProgramCount = 16;
     uniform float a[width*maxProgramCount], r[width*maxProgramCount];
diff --git a/tests/aossoa-5.ispc b/tests/aossoa-5.ispc
index 883786c0..eb4fed3a 100644
--- a/tests/aossoa-5.ispc
+++ b/tests/aossoa-5.ispc
@@ -3,7 +3,9 @@ export uniform int width() { return programCount; }
 
 export void f_v(uniform float RET[]) {
 #define width 3
-#define maxProgramCount 16
+#define maxProgramCount 64
+    assert(programCount <= maxProgramCount);
+
 //CO    const uniform int width = 3;
 //CO    const uniform int maxProgramCount = 16;
     uniform int a[width*maxProgramCount], r[width*maxProgramCount];
diff --git a/tests/aossoa-6.ispc b/tests/aossoa-6.ispc
index 34d35348..b64cd10b 100644
--- a/tests/aossoa-6.ispc
+++ b/tests/aossoa-6.ispc
@@ -3,7 +3,9 @@ export uniform int width() { return programCount; }
 
 export void f_v(uniform float RET[]) {
 #define width 4
-#define maxProgramCount 16
+#define maxProgramCount 64
+    assert(programCount <= maxProgramCount);
+
 //CO    const uniform int width = 4;
 //CO    const uniform int maxProgramCount = 16;
     uniform int a[width*maxProgramCount], r[width*maxProgramCount];
diff --git a/tests/array-gather-ifs.ispc b/tests/array-gather-ifs.ispc
index d635e10f..626d7c3b 100644
--- a/tests/array-gather-ifs.ispc
+++ b/tests/array-gather-ifs.ispc
@@ -5,9 +5,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    uniform float x[45];
+    uniform float x[programCount+15];
     uniform int i;
-    for (i = 0; i < 45; ++i)
+    for (i = 0; i < programCount+15; ++i)
         x[i] = i;
 
     float ret;
diff --git a/tests/array-gather-multi-unif.ispc b/tests/array-gather-multi-unif.ispc
index d876f314..016ecbfd 100644
--- a/tests/array-gather-multi-unif.ispc
+++ b/tests/array-gather-multi-unif.ispc
@@ -10,7 +10,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     for (uniform int i = 0; i < 29+b; ++i)
         for (uniform int j = 0; j < 29+b; ++j)
             x[i][j] = 0;
-    x[a][a] = a;
+    if (a < 34)
+        x[a][a] = a;
     RET[programIndex] = x[4][4] + x[1][1] + x[b][b] + x[0][0];
 }
 
diff --git a/tests/array-gather-simple.ispc b/tests/array-gather-simple.ispc
index 8835b2f0..5abc7f97 100644
--- a/tests/array-gather-simple.ispc
+++ b/tests/array-gather-simple.ispc
@@ -12,8 +12,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    RET[0] = 1; RET[4] = 5; RET[8] = 9; RET[12] = 13;
-    RET[1] = RET[5] = RET[9] = RET[13] = 0;
-    RET[2] = 6; RET[6] = 14; RET[10] = 22; RET[14] = 30;
-    RET[3] = RET[7] = RET[11] = RET[15] = 3;
+    for (uniform int i = 0; i < programCount; i += 4) {
+        RET[i] = i+1;
+        RET[i+1] = 0;
+        RET[i+2] = 2 * (i+3);
+        RET[i+3] = 3;
+    }
 }
diff --git a/tests/array-gather-unif-runflags.ispc b/tests/array-gather-unif-runflags.ispc
index f2936f05..c13f5c29 100644
--- a/tests/array-gather-unif-runflags.ispc
+++ b/tests/array-gather-unif-runflags.ispc
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    uniform float x[45];
+    uniform float x[programCount+5];
     uniform int i;
-    for (i = 0; i < 45; ++i)
+    for (i = 0; i < programCount+5; ++i)
         x[i] = i+b;
     a -= 1;
     if (a == 3) a = 0;
diff --git a/tests/array-gather-unif.ispc b/tests/array-gather-unif.ispc
index 3e040ad3..7ff35c9d 100644
--- a/tests/array-gather-unif.ispc
+++ b/tests/array-gather-unif.ispc
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    uniform float x[45];
+    uniform float x[programCount+5];
     uniform int i;
-    for (i = 0; i < 45; ++i)
+    for (i = 0; i < programCount+5; ++i)
         x[i] = i+b;
     RET[programIndex] = x[a];
 }
diff --git a/tests/array-gather-vary.ispc b/tests/array-gather-vary.ispc
index bbbdd85d..6e8c2f5e 100644
--- a/tests/array-gather-vary.ispc
+++ b/tests/array-gather-vary.ispc
@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    float x[55];
+    float x[programCount+10];
     uniform int i;
-    for (i = 0; i < 45; ++i)
+    for (i = 0; i < programCount+10; ++i)
         x[i] = a+b;
     RET[programIndex] = x[a];
 }
 
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 6 + programIndex;;
+    RET[programIndex] = 6 + programIndex;
 }
diff --git a/tests/array-mixed-unif-vary-indexing-2.ispc b/tests/array-mixed-unif-vary-indexing-2.ispc
index edd53c84..8143ca29 100644
--- a/tests/array-mixed-unif-vary-indexing-2.ispc
+++ b/tests/array-mixed-unif-vary-indexing-2.ispc
@@ -15,6 +15,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         x[a][b-1] = 0;
     else
         x[a][b-1] = 1;
+
+    a = min(a, 46);
+
     RET[programIndex] = x[3][a];
 }
 
diff --git a/tests/array-mixed-unif-vary-indexing-3.ispc b/tests/array-mixed-unif-vary-indexing-3.ispc
index 686f121e..ab3a7a7c 100644
--- a/tests/array-mixed-unif-vary-indexing-3.ispc
+++ b/tests/array-mixed-unif-vary-indexing-3.ispc
@@ -4,9 +4,10 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    uniform float x[47][47];
-    for (uniform int i = 0; i < 47; ++i)
-        for (uniform int j = 0; j < 47; ++j)
+    assert(programCount <= 64);
+    uniform float x[70][70];
+    for (uniform int i = 0; i < 70; ++i)
+        for (uniform int j = 0; j < 70; ++j)
             x[i][j] = 2+b-5;
 
     // all are 2 except (4,2) = 0, (4,...) = 1, (4,programCount-1)=2
diff --git a/tests/array-mixed-unif-vary-indexing.ispc b/tests/array-mixed-unif-vary-indexing.ispc
index ebe932ad..96fc0870 100644
--- a/tests/array-mixed-unif-vary-indexing.ispc
+++ b/tests/array-mixed-unif-vary-indexing.ispc
@@ -10,6 +10,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         for (uniform int j = 0; j < 47; ++j)
             x[i][j] = 2+b-5;
 
+    a = min(a,46);
     x[a][b-1] = 0;
     RET[programIndex] = x[2][a];
 }
diff --git a/tests/array-multidim-gather-scatter.ispc b/tests/array-multidim-gather-scatter.ispc
index 8a2f3947..1528b070 100644
--- a/tests/array-multidim-gather-scatter.ispc
+++ b/tests/array-multidim-gather-scatter.ispc
@@ -11,7 +11,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
     uniform int index[4] = { 0, 1, 2, 4 };
     float v = index[programIndex & 0x3];
-    x[a][v] = 0;
+    x[min(a,39)][v] = 0;
     RET[programIndex] = x[v+1][v];
 }
 
diff --git a/tests/array-pointer-duality-1.ispc b/tests/array-pointer-duality-1.ispc
index 1550c294..4fa51cba 100644
--- a/tests/array-pointer-duality-1.ispc
+++ b/tests/array-pointer-duality-1.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     uniform float a[programCount+4];
-    for (unsigned int i = 0; i < programCount+4; ++i)
+    for (uniform int i = 0; i < programCount+4; ++i)
         a[i] = aFOO[min((int)i, programCount)];
 
     RET[programIndex] = *(a + 2);
diff --git a/tests/array-scatter-unif-2.ispc b/tests/array-scatter-unif-2.ispc
index ef0c659b..89181be6 100644
--- a/tests/array-scatter-unif-2.ispc
+++ b/tests/array-scatter-unif-2.ispc
@@ -4,9 +4,8 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    uniform float x[100];
-    // HACK to avoid @llvm.memset...
-    for (uniform int i = 0; i < b*20; ++i)
+    uniform float x[2*programCount];
+    for (uniform int i = 0; i < 2*programCount; ++i)
         x[i] = 0;
     
     x[2*(a-1)] = b;
diff --git a/tests/array-scatter-unif-3.ispc b/tests/array-scatter-unif-3.ispc
index 8aad3110..703af6fa 100644
--- a/tests/array-scatter-unif-3.ispc
+++ b/tests/array-scatter-unif-3.ispc
@@ -4,9 +4,8 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    uniform float x[100];
-    // HACK to avoid @llvm.memset...
-    for (uniform int i = 0; i < b*20; ++i)
+    uniform float x[2*programCount];
+    for (uniform int i = 0; i < 2*programCount; ++i)
         x[i] = 0;
 
     x[2*(a-1)] = b;
diff --git a/tests/array-scatter-unif.ispc b/tests/array-scatter-unif.ispc
index a553d703..6b5e75d0 100644
--- a/tests/array-scatter-unif.ispc
+++ b/tests/array-scatter-unif.ispc
@@ -5,8 +5,8 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    uniform float x[40];
-    for (uniform int i = 0; i < 40; ++i)
+    uniform float x[programCount+5];
+    for (uniform int i = 0; i < programCount+5; ++i)
         x[i] = 0.;
     x[a] = 2;
     RET[programIndex] = x[4] + x[0] + x[5];
diff --git a/tests/array-scatter-vary.ispc b/tests/array-scatter-vary.ispc
index 07527519..d9d9bc37 100644
--- a/tests/array-scatter-vary.ispc
+++ b/tests/array-scatter-vary.ispc
@@ -4,9 +4,8 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    float x[30];
-    // HACK to avoid @llvm.memset...
-    for (uniform int i = 0; i < b*6; ++i)
+    float x[2*programCount];
+    for (uniform int i = 0; i < 2*programCount; ++i)
         x[i] = 0;
     x[a] = a;
     RET[programIndex] = x[4] + x[0] + x[5];
diff --git a/tests/array-struct-gather.ispc b/tests/array-struct-gather.ispc
index 7a18acba..d3799800 100644
--- a/tests/array-struct-gather.ispc
+++ b/tests/array-struct-gather.ispc
@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }
 
 
 struct Foo {
-    uniform float x[17];
+    uniform float x[programCount+1];
 };
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
     uniform Foo foo;
     uniform int i;
-    for (i = 0; i < 17; ++i)
+    for (i = 0; i < programCount+1; ++i)
         foo.x[i] = i;
 
     if ((int)a & 1)
diff --git a/tests/asin.ispc b/tests/asin.ispc
index a6839b09..4ad23b3a 100644
--- a/tests/asin.ispc
+++ b/tests/asin.ispc
@@ -8,12 +8,12 @@ export void f_v(uniform float RET[]) {
     uniform float vals[8] = { 0, 1, 0.5, -1, -.87, -.25, 1e-3, -.99999999 };
     uniform float r[8];
     foreach (i = 0 ... 8)
-        r[i] = sin(asin(vals[i]));
+        r[i] = sin(asin(vals[i%8]));
 
     int errors = 0;
     for (uniform int i = 0; i < 8; ++i) {
-        if (ok(r[i], vals[i]) == false) {
-            print("error @ %: got %, expected %\n", i, r[i], vals[i]);
+        if (ok(r[i], vals[i%8]) == false) {
+            print("error @ %: got %, expected %\n", i, r[i], vals[i%8]);
             ++errors;
         }
     }
diff --git a/tests/atomics-12.ispc b/tests/atomics-12.ispc
index 0596a85f..c27ad99c 100644
--- a/tests/atomics-12.ispc
+++ b/tests/atomics-12.ispc
@@ -6,14 +6,14 @@ uniform unsigned int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
     float b = 0;
-    if (programIndex & 1)
+    if (programIndex < 30 && programIndex & 1)
         b = atomic_or_global(&s, (1 << programIndex));
     RET[programIndex] = s;
 }
 
 export void result(uniform float RET[]) {
     uniform int sum = 0;
-    for (uniform int i = 0; i < programCount; ++i)
+    for (uniform int i = 0; i < min(30, programCount); ++i)
         if (i & 1)
             sum += (1 << i);
     RET[programIndex] = sum;
diff --git a/tests/atomics-13.ispc b/tests/atomics-13.ispc
index fe9a5d1e..86faaddb 100644
--- a/tests/atomics-13.ispc
+++ b/tests/atomics-13.ispc
@@ -5,12 +5,12 @@ uniform unsigned int32 s = 0;
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
-    float b = 0;
-    if (programIndex & 1)
+    int32 b = 0;
+    if (programIndex < 32 && programIndex & 1)
         b = atomic_or_global(&s, (1 << programIndex));
     RET[programIndex] = popcnt(reduce_max((int32)b));
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = programCount == 1 ? 0 : ((programCount/2) - 1);
+    RET[programIndex] = programCount == 1 ? 0 : ((min(32, programCount)/2) - 1);
 }
diff --git a/tests/atomics-14.ispc b/tests/atomics-14.ispc
index c4551039..7beb3e75 100644
--- a/tests/atomics-14.ispc
+++ b/tests/atomics-14.ispc
@@ -7,14 +7,14 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
     float b = 0;
     if (programIndex & 1)
-        b = atomic_or_global(&s, (1 << programIndex));
+        b = atomic_or_global(&s, (1ull << programIndex));
     RET[programIndex] = (s>>20);
 }
 
 export void result(uniform float RET[]) {
-    uniform int sum = 0;
+    uniform int64 sum = 0;
     for (uniform int i = 0; i < programCount; ++i)
         if (i & 1)
-            sum += (1 << i);
+            sum += (1ull << i);
     RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20;
 }
diff --git a/tests/atomics-4.ispc b/tests/atomics-4.ispc
index 83e9fbf0..30b343d1 100644
--- a/tests/atomics-4.ispc
+++ b/tests/atomics-4.ispc
@@ -5,10 +5,10 @@ uniform int32 s = 0;
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
-    float b = atomic_or_global(&s, (1<<programIndex));
+    float b = atomic_or_global(&s, (1<<min(programIndex,30)));
     RET[programIndex] = s;
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = (1<<programCount)-1;
+    RET[programIndex] = (1<<min(programCount,31))-1;
 }
diff --git a/tests/coalesce-1.ispc b/tests/coalesce-1.ispc
index 205f8d05..acfe8cdf 100644
--- a/tests/coalesce-1.ispc
+++ b/tests/coalesce-1.ispc
@@ -5,7 +5,8 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     uniform float * uniform buf = uniform new uniform float[32*32];
     for (uniform int i = 0; i < 32*32; ++i)
         buf[i] = i;
-
+    
+    assert(programIndex <= 64);
     RET[programIndex] = buf[64-programIndex];
 }
 
diff --git a/tests/count-leading-trailing-zeros-4.ispc b/tests/count-leading-trailing-zeros-4.ispc
index 5cef2b7a..475c18ca 100644
--- a/tests/count-leading-trailing-zeros-4.ispc
+++ b/tests/count-leading-trailing-zeros-4.ispc
@@ -3,10 +3,10 @@ export uniform int width() { return programCount; }
 
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    int32 i = (1 << programIndex);
+    int32 i = (1 << (programIndex % 28));
     RET[programIndex] = count_leading_zeros(i);
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 31-programIndex;
+    RET[programIndex] = 31-(programIndex%28);
 }
diff --git a/tests/count-leading-trailing-zeros-5.ispc b/tests/count-leading-trailing-zeros-5.ispc
index d68dc5e4..2fe8161e 100644
--- a/tests/count-leading-trailing-zeros-5.ispc
+++ b/tests/count-leading-trailing-zeros-5.ispc
@@ -3,10 +3,10 @@ export uniform int width() { return programCount; }
 
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    unsigned int64 i = ((unsigned int64)1 << (40+programIndex));
+    unsigned int64 i = ((unsigned int64)1 << min(63, 40+programIndex));
     RET[programIndex] = count_trailing_zeros(i);
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 40+programIndex;
+    RET[programIndex] = min(63, 40+programIndex);
 }
diff --git a/tests/exclusive-scan-add-1.ispc b/tests/exclusive-scan-add-1.ispc
index b4b0cea9..6e4e83ab 100644
--- a/tests/exclusive-scan-add-1.ispc
+++ b/tests/exclusive-scan-add-1.ispc
@@ -5,8 +5,17 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     RET[programIndex] = exclusive_scan_add(programIndex); 
 }
 
-export void result(uniform float RET[]) {
-    uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28,
-                             36, 45, 55, 66, 78, 91, 105, 120 };
-    RET[programIndex] = result[programIndex]; 
+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = es(programIndex); 
 }
diff --git a/tests/exclusive-scan-add-10.ispc b/tests/exclusive-scan-add-10.ispc
index c5e1aa18..154744b5 100644
--- a/tests/exclusive-scan-add-10.ispc
+++ b/tests/exclusive-scan-add-10.ispc
@@ -10,11 +10,19 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 
+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+
 export void result(uniform float RET[]) {
-    uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12, 
-                               0, 20, 0, 30, 0, 42, 0, 56 };
-    if (programIndex & 1)
-        RET[programIndex] = result[programIndex]; 
-    else
+    RET[programIndex] = es((programIndex & 1) ? (programIndex+1) : 0);
+    if ((programIndex & 1) == 0)
         RET[programIndex] = -1;
 }
diff --git a/tests/exclusive-scan-add-2.ispc b/tests/exclusive-scan-add-2.ispc
index b8a9258f..fe10ffd8 100644
--- a/tests/exclusive-scan-add-2.ispc
+++ b/tests/exclusive-scan-add-2.ispc
@@ -5,8 +5,17 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     RET[programIndex] = exclusive_scan_add(aFOO[programIndex]);
 }
 
-export void result(uniform float RET[]) {
-    uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28,
-                             36, 45, 55, 66, 78, 91, 105, 120, 136 };
-    RET[programIndex] = result[programIndex]; 
+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = es(programIndex+1); 
 }
diff --git a/tests/exclusive-scan-add-3.ispc b/tests/exclusive-scan-add-3.ispc
index 2d883c6c..b62d9252 100644
--- a/tests/exclusive-scan-add-3.ispc
+++ b/tests/exclusive-scan-add-3.ispc
@@ -9,8 +9,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0 };
+    uniform int result[] = { 0, 1, 3 };
     RET[programIndex] = -1;
     if (programIndex <= 1)
         RET[programIndex] = result[programIndex]; 
diff --git a/tests/exclusive-scan-add-5.ispc b/tests/exclusive-scan-add-5.ispc
index bb4d50db..3cc20292 100644
--- a/tests/exclusive-scan-add-5.ispc
+++ b/tests/exclusive-scan-add-5.ispc
@@ -9,12 +9,20 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     }
 }
 
+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+
 
 export void result(uniform float RET[]) {
-    uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12, 
-                               0, 20, 0, 30, 0, 42, 0, 56 };
-    if (programIndex & 1)
-        RET[programIndex] = result[programIndex]; 
-    else
+    RET[programIndex] = es((programIndex & 1) ? (programIndex+1) : 0);
+    if ((programIndex & 1) == 0)
         RET[programIndex] = -1;
 }
diff --git a/tests/exclusive-scan-add-6.ispc b/tests/exclusive-scan-add-6.ispc
index 46908efe..fcee2568 100644
--- a/tests/exclusive-scan-add-6.ispc
+++ b/tests/exclusive-scan-add-6.ispc
@@ -5,8 +5,17 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     RET[programIndex] = exclusive_scan_add((float)programIndex); 
 }
 
-export void result(uniform float RET[]) {
-    uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28,
-                             36, 45, 55, 66, 78, 91, 105, 120 };
-    RET[programIndex] = result[programIndex]; 
+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = es(programIndex); 
 }
diff --git a/tests/exclusive-scan-add-7.ispc b/tests/exclusive-scan-add-7.ispc
index ee0b0fcd..f7caf2d5 100644
--- a/tests/exclusive-scan-add-7.ispc
+++ b/tests/exclusive-scan-add-7.ispc
@@ -5,8 +5,17 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     RET[programIndex] = exclusive_scan_add((double)aFOO[programIndex]);
 }
 
-export void result(uniform float RET[]) {
-    uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28,
-                             36, 45, 55, 66, 78, 91, 105, 120, 136 };
-    RET[programIndex] = result[programIndex]; 
+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = es(programIndex+1); 
 }
diff --git a/tests/exclusive-scan-and-2.ispc b/tests/exclusive-scan-and-2.ispc
index 59a9900e..5d2bcd1f 100644
--- a/tests/exclusive-scan-and-2.ispc
+++ b/tests/exclusive-scan-and-2.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     RET[programIndex] = -1;
     int32 a = ~(1 << programIndex);
-    if ((programIndex & 1) == 0) {
+    if ((programIndex < 32) && (programIndex & 1) == 0) {
         RET[programIndex] = exclusive_scan_and(a);
     }
 }
@@ -12,7 +12,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 
 export void result(uniform float RET[]) {
     RET[programIndex] = -1;
-    if ((programIndex & 1) == 0 && programIndex > 0) {
+    if ((programIndex & 1) == 0 && programIndex > 0 && programIndex < 32) {
         int val = 0xffffffff;
         for (int i = 0; i < programIndex-1; i += 2)
             val &= ~(1<<i);
diff --git a/tests/exclusive-scan-or-1.ispc b/tests/exclusive-scan-or-1.ispc
index 3d09984d..bd2b7598 100644
--- a/tests/exclusive-scan-or-1.ispc
+++ b/tests/exclusive-scan-or-1.ispc
@@ -3,11 +3,11 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     RET[programIndex] = -1;
-    int32 a = (1 << programIndex);
+    int32 a = (1 << (min(programIndex, 30)));
     RET[programIndex] = exclusive_scan_or(a);
 }
 
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = (1 << programIndex) - 1;
+    RET[programIndex] = (1 << (min(programIndex, 31))) - 1;
 }
diff --git a/tests/foreach-mask-1.ispc b/tests/foreach-mask-1.ispc
index 2f462b48..ee4b1b1e 100644
--- a/tests/foreach-mask-1.ispc
+++ b/tests/foreach-mask-1.ispc
@@ -10,8 +10,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 
     // make sure we reset the func mask in the foreach loop...
     if ((int)aFOO[programIndex] & 1)
-        foreach (i = 0 ... programCount+3)
-            val[i] += aFOO[i] - 1;
+        foreach (i = 0 ... programCount+3) {
+            int ic = min(i, programCount-1);
+            val[i] += aFOO[ic] - 1 + i-ic;
+        }
 
     RET[programIndex] = val[3+programIndex]; 
 }
diff --git a/tests/foreach-mask.ispc b/tests/foreach-mask.ispc
index 0d01b16a..f6000a71 100644
--- a/tests/foreach-mask.ispc
+++ b/tests/foreach-mask.ispc
@@ -5,8 +5,10 @@ export uniform int width() { return programCount; }
 // make sure we reset the func mask in the foreach loop...
 
 void update(uniform float val[], const uniform float a[]) {
-    foreach (i = 0 ... programCount+3)
-        val[i] += a[i] - 1;
+    foreach (i = 0 ... programCount+3) {
+        int ic = min(i, programCount-1);
+        val[i] += a[ic] - 1 + i-ic;
+    }
 }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
diff --git a/tests/frexp-double-1.ispc b/tests/frexp-double-1.ispc
index db6f742e..6c38b05e 100644
--- a/tests/frexp-double-1.ispc
+++ b/tests/frexp-double-1.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    double a = (1<<programIndex) * 1.5;
+    double a = (1<< (programIndex % 28)) * 1.5;
     if (programIndex & 1)
         a = -a;
     int exponent;
@@ -12,5 +12,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 1+programIndex;
+    RET[programIndex] = 1+(programIndex % 28);
 }
diff --git a/tests/frexp-double.ispc b/tests/frexp-double.ispc
index e2728197..ba4831d7 100644
--- a/tests/frexp-double.ispc
+++ b/tests/frexp-double.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    double a = (1<<programIndex) * 1.5;
+    double a = (1<< (programIndex%28)) * 1.5;
     if (programIndex & 1)
         a = -a;
     int exponent;
diff --git a/tests/frexp-float-1.ispc b/tests/frexp-float-1.ispc
index 0d971086..7d5fc1d2 100644
--- a/tests/frexp-float-1.ispc
+++ b/tests/frexp-float-1.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    float a = (1<<programIndex) * 1.5;
+    float a = (1<< (programIndex%28)) * 1.5;
     if (programIndex & 1)
         a = -a;
     int exponent;
@@ -12,5 +12,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 1+programIndex;
+    RET[programIndex] = 1+(programIndex%28);
 }
diff --git a/tests/frexp-float.ispc b/tests/frexp-float.ispc
index 2d248448..ec54e4be 100644
--- a/tests/frexp-float.ispc
+++ b/tests/frexp-float.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    float a = (1<<programIndex) * 1.5;
+    float a = (1<< (programIndex%28)) * 1.5;
     if (programIndex & 1)
         a = -a;
     int exponent;
diff --git a/tests/func-anon-param.ispc b/tests/func-anon-param.ispc
new file mode 100644
index 00000000..8bf97065
--- /dev/null
+++ b/tests/func-anon-param.ispc
@@ -0,0 +1,15 @@
+
+
+export uniform int width() { return programCount; }
+
+float foo(float &) { return 1; }
+float bar(uniform float []) { return 2; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float x = 0;
+    RET[programIndex] = foo(x) + bar(aFOO);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 3;
+}
diff --git a/tests/func-overload-max.ispc b/tests/func-overload-max.ispc
new file mode 100644
index 00000000..37360030
--- /dev/null
+++ b/tests/func-overload-max.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = 1. / aFOO[programIndex]; 
+    RET[programIndex] = max(0, a); 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1. / (1+programIndex);
+}
diff --git a/tests/func-overload-refs.ispc b/tests/func-overload-refs.ispc
new file mode 100644
index 00000000..89184812
--- /dev/null
+++ b/tests/func-overload-refs.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+float foo(float &a) { return 1; }
+float foo(const float &a) { return 2; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float x = 0;
+    RET[programIndex] = foo(x); 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/func-ptr-initializer.ispc b/tests/func-ptr-initializer.ispc
new file mode 100644
index 00000000..96537391
--- /dev/null
+++ b/tests/func-ptr-initializer.ispc
@@ -0,0 +1,28 @@
+
+export uniform int width() { return programCount; }
+
+
+typedef float (*func)();
+
+float foo();
+float bar();
+
+struct X { func f, g; };
+
+static uniform X x = { foo, &bar };
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    RET[programIndex] = x.f() + x.g();
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programIndex;
+}
+
+float foo() {
+    return 2 * programIndex;
+}
+
+float bar() {
+    return -programIndex;
+}
diff --git a/tests/funcptr-null-1.ispc b/tests/funcptr-null-1.ispc
index 05798918..784b5ada 100644
--- a/tests/funcptr-null-1.ispc
+++ b/tests/funcptr-null-1.ispc
@@ -15,7 +15,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
     float b = aFOO[0]-1;
     uniform FuncType func = foo;
-    RET[programIndex] = (func ? func : bar)(a, b);
+    RET[programIndex] = (func ? func : &bar)(a, b);
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/funcptr-null-3.ispc b/tests/funcptr-null-3.ispc
index 8e228315..3fd74da0 100644
--- a/tests/funcptr-null-3.ispc
+++ b/tests/funcptr-null-3.ispc
@@ -14,7 +14,7 @@ static float bar(float a, float b) {
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
     float b = aFOO[0]-1;
-    FuncType func = foo;
+    FuncType func = &foo;
     if (a == 2)
         func = NULL;
     if (func != NULL)
diff --git a/tests/funcptr-null-6.ispc b/tests/funcptr-null-6.ispc
index cf92c4a7..45bcfcdd 100644
--- a/tests/funcptr-null-6.ispc
+++ b/tests/funcptr-null-6.ispc
@@ -16,7 +16,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     float b = aFOO[0]-1;
     FuncType func = NULL;
     if (a == 2)
-        func = foo;
+        func = &foo;
     if (!func)
         RET[programIndex] = -1;
     else
diff --git a/tests/funcptr-uniform-2.ispc b/tests/funcptr-uniform-2.ispc
index 849c9492..59d54b40 100644
--- a/tests/funcptr-uniform-2.ispc
+++ b/tests/funcptr-uniform-2.ispc
@@ -14,7 +14,7 @@ static float bar(float a, float b) {
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
     float b = aFOO[0]-1;
-    uniform FuncType func = bar;
+    uniform FuncType func = &bar;
     if (aFOO[0] == 1)
         func = foo;
     RET[programIndex] = func(a, b);
diff --git a/tests/global-decl-define.ispc b/tests/global-decl-define.ispc
new file mode 100644
index 00000000..44fb92a7
--- /dev/null
+++ b/tests/global-decl-define.ispc
@@ -0,0 +1,14 @@
+
+
+export uniform int width() { return programCount; }
+
+extern int foo;
+int foo = 1;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    RET[programIndex] = foo; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/gs-double-improve-multidim-1.ispc b/tests/gs-double-improve-multidim-1.ispc
index c001896e..21bd2d5c 100644
--- a/tests/gs-double-improve-multidim-1.ispc
+++ b/tests/gs-double-improve-multidim-1.ispc
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
-    uniform double udx[25][25];
-    for (uniform int i = 0; i < 25; ++i)
-        for (uniform int j = 0; j < 25; ++j)
+    uniform double udx[5][programCount+5];
+    for (uniform int i = 0; i < 5; ++i)
+        for (uniform int j = 0; j < programCount+5; ++j)
             udx[i][j] = 10*i+j;
 
     int x = 1;
diff --git a/tests/gs-double-improve-multidim-struct-1.ispc b/tests/gs-double-improve-multidim-struct-1.ispc
index 07962eea..93b0ae7f 100644
--- a/tests/gs-double-improve-multidim-struct-1.ispc
+++ b/tests/gs-double-improve-multidim-struct-1.ispc
@@ -4,15 +4,15 @@ export uniform int width() { return programCount; }
 
 
 struct Foo {
-    uniform double udx[5][35];
+    uniform double udx[5][programCount+5];
 };
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
-    uniform Foo f[5];
-    for (uniform int i = 0; i < 5; ++i)
+    uniform Foo f[3];
+    for (uniform int i = 0; i < 3; ++i)
         for (uniform int j = 0; j < 5; ++j)
-            for (uniform int k = 0; k < 35; ++k)
+            for (uniform int k = 0; k < programCount+5; ++k)
                 f[i].udx[j][k] = 100*i+10*j+k;
 
     int x = 1;
diff --git a/tests/gs-double-improve-progindex-plus-const.ispc b/tests/gs-double-improve-progindex-plus-const.ispc
index 72eaee4e..07416cd1 100644
--- a/tests/gs-double-improve-progindex-plus-const.ispc
+++ b/tests/gs-double-improve-progindex-plus-const.ispc
@@ -5,9 +5,11 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
     uniform double udx[17] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 };
-    int x = programIndex + 1;
+    int x = (programIndex + 1) % 17;
     RET[programIndex] = udx[x];
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = 2+programIndex; }
+export void result(uniform float RET[]) { 
+    RET[programIndex] = 1+((1+programIndex)%17); 
+}
 
diff --git a/tests/gs-double-improve-progindex-plus-unif.ispc b/tests/gs-double-improve-progindex-plus-unif.ispc
index 51ea9983..f605998f 100644
--- a/tests/gs-double-improve-progindex-plus-unif.ispc
+++ b/tests/gs-double-improve-progindex-plus-unif.ispc
@@ -6,7 +6,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
     uniform double udx[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 };
     int x = -5 + programIndex + (int)b;
-    RET[programIndex] = udx[x];
+    RET[programIndex] = udx[x % 16];
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = 1+programIndex; }
+export void result(uniform float RET[]) { RET[programIndex] = 1+(programIndex%16); }
diff --git a/tests/gs-double-improve-progindex.ispc b/tests/gs-double-improve-progindex.ispc
index cce9137e..dfc007af 100644
--- a/tests/gs-double-improve-progindex.ispc
+++ b/tests/gs-double-improve-progindex.ispc
@@ -4,7 +4,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
-    uniform double udx[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 };
+    uniform double udx[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        udx[i] = i+1;
     int x = programIndex;
     RET[programIndex] = udx[x];
 }
diff --git a/tests/gs-improve-multidim-1.ispc b/tests/gs-improve-multidim-1.ispc
index d85c1541..d978e97a 100644
--- a/tests/gs-improve-multidim-1.ispc
+++ b/tests/gs-improve-multidim-1.ispc
@@ -5,9 +5,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
-    uniform float udx[20][20];
-    for (uniform int i = 0; i < 20; ++i)
-        for (uniform int j = 0; j < 20; ++j)
+    uniform float udx[2][programCount+5];
+    for (uniform int i = 0; i < 2; ++i)
+        for (uniform int j = 0; j < programCount+5; ++j)
             udx[i][j] = 100*i+j;
 
     int x = 1;
diff --git a/tests/gs-improve-multidim-struct-1.ispc b/tests/gs-improve-multidim-struct-1.ispc
index 6b45e853..5c395820 100644
--- a/tests/gs-improve-multidim-struct-1.ispc
+++ b/tests/gs-improve-multidim-struct-1.ispc
@@ -4,15 +4,15 @@ export uniform int width() { return programCount; }
 
 
 struct Foo {
-    uniform float udx[25][25];
+    uniform float udx[5][programCount+5];
 };
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
-    uniform Foo f[5];
-    for (uniform int i = 0; i < 5; ++i)
-        for (uniform int j = 0; j < 25; ++j)
-            for (uniform int k = 0; k < 25; ++k)
+    uniform Foo f[3];
+    for (uniform int i = 0; i < 3; ++i)
+        for (uniform int j = 0; j < 5; ++j)
+            for (uniform int k = 0; k < programCount+5; ++k)
                 f[i].udx[j][k] = 1000*i+100*j+k;
 
     int x = 1;
diff --git a/tests/gs-improve-progindex-plus-const.ispc b/tests/gs-improve-progindex-plus-const.ispc
index fc350b2a..4035cae4 100644
--- a/tests/gs-improve-progindex-plus-const.ispc
+++ b/tests/gs-improve-progindex-plus-const.ispc
@@ -5,7 +5,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
-    uniform float udx[17] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 };
+    uniform float udx[programCount+1];
+    for (uniform int i = 0; i < programCount + 1; ++i)
+        udx[i] = i+1;
     int x = programIndex + 1;
     RET[programIndex] = udx[x];
 }
diff --git a/tests/gs-improve-progindex-plus-unif.ispc b/tests/gs-improve-progindex-plus-unif.ispc
index e651adf8..0e1de841 100644
--- a/tests/gs-improve-progindex-plus-unif.ispc
+++ b/tests/gs-improve-progindex-plus-unif.ispc
@@ -5,7 +5,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
-    uniform float udx[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 };
+    uniform float udx[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        udx[i] = i+1;
     int x = -5 + programIndex + (int)b;
     RET[programIndex] = udx[x];
 }
diff --git a/tests/gs-improve-progindex.ispc b/tests/gs-improve-progindex.ispc
index 3845f72d..8e771ea1 100644
--- a/tests/gs-improve-progindex.ispc
+++ b/tests/gs-improve-progindex.ispc
@@ -5,7 +5,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
-    uniform float udx[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 };
+    uniform float udx[programCount+1];
+    for (uniform int i = 0; i < programCount + 1; ++i)
+        udx[i] = i+1;
     int x = programIndex;
     RET[programIndex] = udx[x];
 }
diff --git a/tests/half-3.ispc b/tests/half-3.ispc
index 47de0eee..2c7b4096 100644
--- a/tests/half-3.ispc
+++ b/tests/half-3.ispc
@@ -10,6 +10,8 @@ export void f_v(uniform float RET[]) {
         h = float_to_half(f);
 
         int mismatches = (f == f && i != h);
+        if (any(mismatches != 0))
+            print("mismatch: orig int16 % -> float % -> half %\n", i, f, h);
         errors += reduce_add(mismatches);
     }
 
diff --git a/tests/intptr.ispc b/tests/intptr.ispc
new file mode 100644
index 00000000..7eb9eef7
--- /dev/null
+++ b/tests/intptr.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_v(uniform float RET[]) {
+    RET[programIndex] = sizeof(uniform intptr_t);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 
+#if (ISPC_POINTER_SIZE==32)
+        4
+#elif (ISPC_POINTER_SIZE==64)
+        8
+#else
+#error Unknown pointer size
+#endif
+        ;
+}
diff --git a/tests/ldexp-double.ispc b/tests/ldexp-double.ispc
index f2454826..6b3ed734 100644
--- a/tests/ldexp-double.ispc
+++ b/tests/ldexp-double.ispc
@@ -3,14 +3,15 @@ export uniform int width() { return programCount; }
 
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    double a = 1 << (programIndex);
+    double a = 1 << (programIndex % 28);
     if (programIndex & 1)
         a = -a;
     RET[programIndex] = ldexp(a, 2);
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = (1 << (programIndex + 2));
+    int pi = programIndex % 28;
+    RET[programIndex] = (1 << (pi + 2));
     if (programIndex & 1)
         RET[programIndex] = -RET[programIndex];
 }
diff --git a/tests/ldexp-float.ispc b/tests/ldexp-float.ispc
index aa6b5a0f..a2ec9a27 100644
--- a/tests/ldexp-float.ispc
+++ b/tests/ldexp-float.ispc
@@ -3,14 +3,15 @@ export uniform int width() { return programCount; }
 
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    float a = 1 << (programIndex);
+    float a = 1 << (programIndex % 28);
     if (programIndex & 1)
         a = -a;
     RET[programIndex] = ldexp(a, 2);
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = (1 << (programIndex + 2));
+    int pi = programIndex % 28;
+    RET[programIndex] = (1 << (pi + 2));
     if (programIndex & 1)
         RET[programIndex] = -RET[programIndex];
 }
diff --git a/tests/local-atomics-10.ispc b/tests/local-atomics-10.ispc
index 77eb1387..b27874e6 100644
--- a/tests/local-atomics-10.ispc
+++ b/tests/local-atomics-10.ispc
@@ -13,5 +13,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = programCount == 1 ? 1 : 2;
+    RET[programIndex] = (programCount == 1) ? 1 : 2;
 }
diff --git a/tests/local-atomics-12.ispc b/tests/local-atomics-12.ispc
index fc7938ce..23a30af5 100644
--- a/tests/local-atomics-12.ispc
+++ b/tests/local-atomics-12.ispc
@@ -6,14 +6,14 @@ uniform unsigned int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
     float b = 0;
-    if (programIndex & 1)
+    if (programIndex < 29 && (programIndex & 1))
         b = atomic_or_local(&s, (1 << programIndex));
     RET[programIndex] = s;
 }
 
 export void result(uniform float RET[]) {
     uniform int sum = 0;
-    for (uniform int i = 0; i < programCount; ++i)
+    for (uniform int i = 0; i < min(programCount, 29); ++i)
         if (i & 1)
             sum += (1 << i);
     RET[programIndex] = sum;
diff --git a/tests/local-atomics-13.ispc b/tests/local-atomics-13.ispc
index 632e34ea..36fd1f1c 100644
--- a/tests/local-atomics-13.ispc
+++ b/tests/local-atomics-13.ispc
@@ -5,12 +5,12 @@ uniform unsigned int32 s = 0;
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
-    float b = 0;
-    if (programIndex & 1)
+    int32 b = 0;
+    if (programIndex < 28 && (programIndex & 1))
         b = atomic_or_local(&s, (1 << programIndex));
-    RET[programIndex] = popcnt(reduce_max((int32)b));
+    RET[programIndex] = popcnt(reduce_max(b));
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = programCount == 1 ? 0 : ((programCount/2) - 1);
+    RET[programIndex] = (programCount == 1) ? 0 : ((min(28, programCount)/2) - 1);
 }
diff --git a/tests/local-atomics-14.ispc b/tests/local-atomics-14.ispc
index a5f7e63f..4cf81809 100644
--- a/tests/local-atomics-14.ispc
+++ b/tests/local-atomics-14.ispc
@@ -6,14 +6,14 @@ uniform unsigned int64 s = 0xffffffffff000000;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
     float b = 0;
-    if (programIndex & 1)
+    if (programIndex < 32 && (programIndex & 1))
         b = atomic_or_local(&s, (1 << programIndex));
     RET[programIndex] = (s>>20);
 }
 
 export void result(uniform float RET[]) {
     uniform int sum = 0;
-    for (uniform int i = 0; i < programCount; ++i)
+    for (uniform int i = 0; i < min(32, programCount); ++i)
         if (i & 1)
             sum += (1 << i);
     RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20;
diff --git a/tests/local-atomics-4.ispc b/tests/local-atomics-4.ispc
index 651cf4c6..f7f6a04a 100644
--- a/tests/local-atomics-4.ispc
+++ b/tests/local-atomics-4.ispc
@@ -5,10 +5,12 @@ uniform int32 s = 0;
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
-    float b = atomic_or_local(&s, (1<<programIndex));
+    float b = 0;
+    if (programIndex < 29)
+        atomic_or_local(&s, (1<<programIndex));
     RET[programIndex] = s;
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = (1<<programCount)-1;
+    RET[programIndex] = (1<<min(29,programCount))-1;
 }
diff --git a/tests/local-atomics-7.ispc b/tests/local-atomics-7.ispc
index 0d1b541d..1ac11b52 100644
--- a/tests/local-atomics-7.ispc
+++ b/tests/local-atomics-7.ispc
@@ -5,10 +5,13 @@ uniform int32 s = 0;
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     int32 a = aFOO[programIndex]; 
-    float b = atomic_min_local(&s, a);
+    float b = 0;
+    if (programIndex < 32)
+        atomic_min_local(&s, a);
     RET[programIndex] = reduce_min(b);
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = reduce_min(programIndex);
+    int pi = (programIndex < 32) ? programIndex : 0;
+    RET[programIndex] = reduce_min(pi);
 }
diff --git a/tests/local-atomics-swap.ispc b/tests/local-atomics-swap.ispc
index 64ae712a..1a6a7012 100644
--- a/tests/local-atomics-swap.ispc
+++ b/tests/local-atomics-swap.ispc
@@ -13,5 +13,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);
+    RET[programIndex] = 1234 + reduce_add((programIndex & 1) ? programIndex : 0);
 }
diff --git a/tests/masked-scatter-struct.ispc b/tests/masked-scatter-struct.ispc
index bb36a60b..19df4bd0 100644
--- a/tests/masked-scatter-struct.ispc
+++ b/tests/masked-scatter-struct.ispc
@@ -5,8 +5,8 @@ struct Foo { float x; float y; };
 
 export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
     float a = aa[programIndex];
-    uniform Foo foo[32];
-    for (uniform int i = 0; i < 32; ++i) {
+    uniform Foo foo[programCount];
+    for (uniform int i = 0; i < programCount; ++i) {
         foo[i].x = i;
         foo[i].y = -1234 + i;
     }
diff --git a/tests/masked-scatter-vector.ispc b/tests/masked-scatter-vector.ispc
index adda440e..676cbfff 100644
--- a/tests/masked-scatter-vector.ispc
+++ b/tests/masked-scatter-vector.ispc
@@ -5,8 +5,8 @@ typedef int<3> int3;
 
 export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
     float a = aa[programIndex];
-    uniform int3 array[32];
-    for (uniform int i = 0; i < 6*b + 2; ++i) {
+    uniform int3 array[programCount];
+    for (uniform int i = 0; i < programCount + 5 - b; ++i) {
         for (uniform int j = 0; j < 3; ++j)
             array[i][j] = i+100*j;
     }
diff --git a/tests/masked-struct-scatter-varying.ispc b/tests/masked-struct-scatter-varying.ispc
index 928197a3..8211aa67 100644
--- a/tests/masked-struct-scatter-varying.ispc
+++ b/tests/masked-struct-scatter-varying.ispc
@@ -5,8 +5,8 @@ struct Foo { float x; float y; };
 
 export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
     float a = aa[programIndex];
-    Foo foo[32];
-    for (uniform int i = 0; i < 32; ++i)
+    Foo foo[programCount+1];
+    for (uniform int i = 0; i < programCount+1; ++i)
         foo[i].x = i;
     varying Foo fv = foo[a];
     fv.x += 1000;
diff --git a/tests/max-int-1.ispc b/tests/max-int-1.ispc
index 7ad02713..f1492b8b 100644
--- a/tests/max-int-1.ispc
+++ b/tests/max-int-1.ispc
@@ -6,8 +6,8 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
     int i = (int)a;
-    RET[programIndex] = max((int)20, i);
+    RET[programIndex] = max((int)200, i);
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = 20.; }
+export void result(uniform float RET[]) { RET[programIndex] = 200.; }
 
diff --git a/tests/min-float.ispc b/tests/min-float.ispc
index 3577daab..caedd962 100644
--- a/tests/min-float.ispc
+++ b/tests/min-float.ispc
@@ -5,7 +5,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
-    RET[programIndex] = min(a, 20.f);
+    RET[programIndex] = min(a, 200.f);
 }
 
 export void result(uniform float RET[]) { RET[programIndex] = 1+programIndex; }
diff --git a/tests/min-int.ispc b/tests/min-int.ispc
index 50df3e19..483b9b41 100644
--- a/tests/min-int.ispc
+++ b/tests/min-int.ispc
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
     int i = (int)a;
-    RET[programIndex] = min((int)20, i);
+    RET[programIndex] = min((int)200, i);
 }
 
 export void result(uniform float RET[]) { RET[programIndex] = 1+programIndex; }
diff --git a/tests/min-uint-2.ispc b/tests/min-uint-2.ispc
index e8f0e8c9..9338aeb0 100644
--- a/tests/min-uint-2.ispc
+++ b/tests/min-uint-2.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float r[], uniform float a[]) {
     unsigned int i = (unsigned int)a[programIndex];
-    r[programIndex] =  min((unsigned int)20, i);
+    r[programIndex] =  min((unsigned int)200, i);
 }
 
 export void result(uniform float r[]) { 
diff --git a/tests/nested-structs-2.ispc b/tests/nested-structs-2.ispc
index cb58e588..fd30c7ef 100644
--- a/tests/nested-structs-2.ispc
+++ b/tests/nested-structs-2.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 
 
 struct Foo {
-    float f[18];
+    float f[129];
 };
 
 struct Bar {
@@ -15,7 +15,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
     uniform Bar bar;
     for (uniform int i = 0; i < 6; ++i)
-        for (uniform int j = 0; j < 18; ++j)
+        for (uniform int j = 0; j < 129; ++j)
             bar.foo[i].f[j] = 2.+b-5;
 
     bar.foo[5].f[a] = a;
diff --git a/tests/pass-varying-lvalue-to-ref.ispc b/tests/pass-varying-lvalue-to-ref.ispc
index 4089b9d9..68beba1c 100644
--- a/tests/pass-varying-lvalue-to-ref.ispc
+++ b/tests/pass-varying-lvalue-to-ref.ispc
@@ -4,8 +4,8 @@ export uniform int width() { return programCount; }
 void inc(uniform float * varying v) { ++(*v); }
 
 export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
-    uniform float foo[32];
-    for (uniform int i = 0; i < 32; ++i)
+    uniform float foo[2*programCount];
+    for (uniform int i = 0; i < 2*programCount; ++i)
         foo[i] = 10+i;
     int a = (int)aa[programIndex];
     inc(&foo[a]);
diff --git a/tests/phi-opts-1.ispc b/tests/phi-opts-1.ispc
index d4265681..fb0a5282 100644
--- a/tests/phi-opts-1.ispc
+++ b/tests/phi-opts-1.ispc
@@ -3,8 +3,9 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float sum = 0;
-    for (int i = 0; i < 16; i += programCount)
-        sum += aFOO[i+programIndex];
+    for (int i = programIndex; i < 16; i += programCount) {
+        sum += aFOO[i];
+    }
     RET[programIndex] = reduce_add(sum); 
 }
 
diff --git a/tests/popcnt-1.ispc b/tests/popcnt-1.ispc
index 9a64c113..98139ea7 100644
--- a/tests/popcnt-1.ispc
+++ b/tests/popcnt-1.ispc
@@ -8,11 +8,18 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     RET[programIndex] = popcnt((int)a);
 }
 
-export void result(uniform float RET[]) { 
-    uniform int pc[16] = { 1, 1, 2, 1,
-                           2, 2, 3, 1,
-                           2, 2, 3, 2, 
-                           3, 3, 4, 1 };
-    RET[programIndex] = pc[programIndex];
+// Bit-at-a-time population count: add the low bit, then shift right until v is 0.
+static int manualpc(int v) {
+    int bits = 0;
+    while (v != 0) {
+        bits += (v & 1);
+        v >>= 1;
+    }
+    return bits;
+}
+
+export void result(uniform float RET[]) { // expected output: manualpc() of programIndex+1 for each lane
+    assert(programCount <= 64);
+    RET[programIndex] = manualpc(programIndex+1);  // f_f stores popcnt((int)a); presumably a == programIndex+1 -- TODO confirm against the harness
 }
 
diff --git a/tests/popcnt-2.ispc b/tests/popcnt-2.ispc
index f792dde0..9a48fb7e 100644
--- a/tests/popcnt-2.ispc
+++ b/tests/popcnt-2.ispc
@@ -18,9 +18,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
     RET[programIndex] = popcnt(int4(0xf0f0f0f0, 0xff, 0x10, 0));
 }
 
-export void result(uniform float RET[]) { 
-    RET[0] = RET[4] = RET[8] = RET[12] = 16;
-    RET[1] = RET[5] = RET[9] = RET[13] = 8;
-    RET[2] = RET[6] = RET[10] = RET[14] = 1;
-    RET[3] = RET[7] = RET[11] = RET[15] = 0;
+export void result(uniform float RET[]) {  // expected popcounts of int4(0xf0f0f0f0, 0xff, 0x10, 0), repeated every 4 entries
+    for (uniform int i = 0; i < programCount; i += 4) {  // NOTE(review): writes RET[i..i+3]; assumes RET has room when programCount is not a multiple of 4 -- confirm
+        RET[i] = 16;  // popcnt(0xf0f0f0f0)
+        RET[i+1] = 8;  // popcnt(0xff)
+        RET[i+2] = 1;  // popcnt(0x10)
+        RET[i+3] = 0;  // popcnt(0)
+    }
 }
diff --git a/tests/popcnt-3.ispc b/tests/popcnt-3.ispc
index 110bf5e4..4ac8fa18 100644
--- a/tests/popcnt-3.ispc
+++ b/tests/popcnt-3.ispc
@@ -3,9 +3,9 @@ export uniform int width() { return programCount; }
 
 
 
-export void f_f(uniform float RET[4], uniform float aFOO[]) {
+export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex];
     RET[programIndex] = popcnt(a < 3);
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = programCount == 1 ? 1 : 2; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? 1 : 2; }
diff --git a/tests/ptr-cast-complex.ispc b/tests/ptr-cast-complex.ispc
new file mode 100644
index 00000000..afdbf5e7
--- /dev/null
+++ b/tests/ptr-cast-complex.ispc
@@ -0,0 +1,18 @@
+
+export uniform int width() { return programCount; }
+
+// Read a 2D uniform array through a varying pointer to 10-element rows.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int x[2][10];
+    for (uniform int row = 0; row < 2; ++row)
+        for (uniform int col = 0; col < 10; ++col)
+            x[row][col] = 10*row+col;
+
+    uniform int (* varying y)[10] = x;
+    RET[programIndex] = y[1][programIndex % 5];
+}
+
+// Row 1 of x holds 10..19, so each lane expects 10 + (programIndex % 5).
+export void result(uniform float RET[]) {
+    RET[programIndex] = 10+ (programIndex % 5);
+}
diff --git a/tests/ptr-math-variability.ispc b/tests/ptr-math-variability.ispc
new file mode 100644
index 00000000..4fa89206
--- /dev/null
+++ b/tests/ptr-math-variability.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }  // reports programCount to the caller
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float * uniform ptr = aFOO;  // uniform pointer to the start of the input array
+    RET[programIndex] = *(ptr + programIndex) - 1;  // uniform pointer plus a per-lane offset, dereferenced per lane
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programIndex;  // expects aFOO[i] - 1 == i, i.e. assumes aFOO[i] == i + 1 -- TODO confirm against the harness
+}
diff --git a/tests/ptr-null-func-arg.ispc b/tests/ptr-null-func-arg.ispc
new file mode 100644
index 00000000..fdd0cbab
--- /dev/null
+++ b/tests/ptr-null-func-arg.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }  // reports programCount to the caller
+
+bool bar(float * x) {  // returns true when the pointer argument is non-null
+    return (x != 0);
+}
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    RET[programIndex] = bar(NULL);  // NULL passed directly as a pointer-typed function argument
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // bar(NULL) compares NULL != 0, which is false, so 0 is expected
+}
diff --git a/tests/rand-distrib-1.ispc b/tests/rand-distrib-1.ispc
index a53cef12..3a23a917 100644
--- a/tests/rand-distrib-1.ispc
+++ b/tests/rand-distrib-1.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     RNGState state;
-    seed_rng(&state, 1);
+    seed_rng(&state, programIndex);
     int count[32];
     for (uniform int i = 0; i < 32; ++i)
         count[i] = (b == 5.) ? 0 : 1;
diff --git a/tests/rand-distrib.ispc b/tests/rand-distrib.ispc
index edf24f2b..393ec063 100644
--- a/tests/rand-distrib.ispc
+++ b/tests/rand-distrib.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     RNGState state;
-    seed_rng(&state, 1);
+    seed_rng(&state, programIndex);
     float sum = 0;
     uniform int iters = 40000;
     for (unsigned int i = 0; i < iters; ++i)
diff --git a/tests/recursion-forward-func-decl.ispc b/tests/recursion-forward-func-decl.ispc
index b6dd0496..01d2e1a1 100644
--- a/tests/recursion-forward-func-decl.ispc
+++ b/tests/recursion-forward-func-decl.ispc
@@ -6,8 +6,8 @@ export uniform int width() { return programCount; }
 float f(float);
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    float a = aFOO[programIndex];
-    RET[programIndex] = f(a);
+    int a = aFOO[programIndex];
+    RET[programIndex] = f(a % 16);
 }
 
 float f(float x) {
@@ -16,7 +16,7 @@ float f(float x) {
 }
 
 export void result(uniform float RET[]) { 
-    uniform float fib[16] = { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, 136 };
-    RET[programIndex] = fib[programIndex];
+    uniform float fib[16] = { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, 0 };
+    RET[programIndex] = fib[programIndex % 16];
 }
 
diff --git a/tests/recursion.ispc b/tests/recursion.ispc
index 30de649d..21730608 100644
--- a/tests/recursion.ispc
+++ b/tests/recursion.ispc
@@ -11,12 +11,12 @@ float f(float x) {
 }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    float a = aFOO[programIndex];
-    RET[programIndex] = f(a);
+    int a = aFOO[programIndex];
+    RET[programIndex] = f(a % 16);
 }
 
 export void result(uniform float RET[]) { 
-    uniform float fib[16] = { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, 136 };
-    RET[programIndex] = fib[programIndex];
+    uniform float fib[16] = { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, 0 };
+    RET[programIndex] = fib[programIndex % 16];
 }
 
diff --git a/tests/reduce-add-double-1.ispc b/tests/reduce-add-double-1.ispc
index 9ff50b2c..4d40509f 100644
--- a/tests/reduce-add-double-1.ispc
+++ b/tests/reduce-add-double-1.ispc
@@ -13,11 +13,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    uniform int x = -1234;
-    if (programCount == 1) x = 1;
-    else if (programCount == 4) x = 4;
-    else if (programCount == 8) x = 16;
-    else if (programCount == 16) x = 64;
+    uniform int x = 0;
+    for (uniform int i = 1; i <= programCount; i += 2)
+        x += i;
     RET[programIndex] = x;
 }
 
diff --git a/tests/reduce-add-double-2.ispc b/tests/reduce-add-double-2.ispc
index 1be0d10b..aed4099c 100644
--- a/tests/reduce-add-double-2.ispc
+++ b/tests/reduce-add-double-2.ispc
@@ -12,11 +12,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    uniform int x = -1234;
-    if (programCount == 1) x = 1;
-    else if (programCount == 4) x = 10;
-    else if (programCount == 8) x = 36;
-    else if (programCount == 16) x = 136;
+    uniform int x = 0;
+    for (uniform int i = 1; i <= programCount; ++i)
+        x += i;
     RET[programIndex] = x;
 }
 
diff --git a/tests/reduce-add-float-1.ispc b/tests/reduce-add-float-1.ispc
index dd373849..627e67d6 100644
--- a/tests/reduce-add-float-1.ispc
+++ b/tests/reduce-add-float-1.ispc
@@ -13,11 +13,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    uniform int x = -1234;
-    if (programCount == 1) x = 1;
-    else if (programCount == 4) x = 4;
-    else if (programCount == 8) x = 16;
-    else if (programCount == 16) x = 64;
+    uniform int x = 0;
+    for (uniform int i = 1; i <= programCount; i += 2)
+        x += i;
     RET[programIndex] = x;
 }
 
diff --git a/tests/reduce-add-float-2.ispc b/tests/reduce-add-float-2.ispc
index 53aa85aa..473b4bdd 100644
--- a/tests/reduce-add-float-2.ispc
+++ b/tests/reduce-add-float-2.ispc
@@ -12,11 +12,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    uniform int x = -1234;
-    if (programCount == 1) x = 1;
-    else if (programCount == 4) x = 10;
-    else if (programCount == 8) x = 36;
-    else if (programCount == 16) x = 136;
+    uniform int x = 0;
+    for (uniform int i = 1; i <= programCount; ++i)
+        x += i;
     RET[programIndex] = x;
 }
 
diff --git a/tests/reduce-add-int-1.ispc b/tests/reduce-add-int-1.ispc
index 9ac887c6..5351a81c 100644
--- a/tests/reduce-add-int-1.ispc
+++ b/tests/reduce-add-int-1.ispc
@@ -13,11 +13,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    uniform int x = -1234;
-    if (programCount == 1) x = 1;
-    else if (programCount == 4) x = 4;
-    else if (programCount == 8) x = 16;
-    else if (programCount == 16) x = 64;
+    uniform int x = 0;
+    for (uniform int i = 1; i <= programCount; i += 2)
+        x += i;
     RET[programIndex] = x;
 }
 
diff --git a/tests/reduce-add-int.ispc b/tests/reduce-add-int.ispc
index 01ff745c..d5478374 100644
--- a/tests/reduce-add-int.ispc
+++ b/tests/reduce-add-int.ispc
@@ -13,11 +13,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    uniform int x = -1234;
-    if (programCount == 1) x = 1;
-    else if (programCount == 4) x = 10;
-    else if (programCount == 8) x = 36;
-    else if (programCount == 16) x = 136;
+    uniform int x = 0;
+    for (uniform int i = 1; i <= programCount; ++i)
+        x += i;
     RET[programIndex] = x;
 }
 
diff --git a/tests/reduce-add-int64-1.ispc b/tests/reduce-add-int64-1.ispc
index cdc88bc3..e7df3b23 100644
--- a/tests/reduce-add-int64-1.ispc
+++ b/tests/reduce-add-int64-1.ispc
@@ -13,11 +13,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    uniform int x = -1234;
-    if (programCount == 1) x = 1;
-    else if (programCount == 4) x = 4;
-    else if (programCount == 8) x = 16;
-    else if (programCount == 16) x = 64;
+    uniform int x = 0;
+    for (uniform int i = 1; i <= programCount; i += 2)
+        x += i;
     RET[programIndex] = x;
 }
 
diff --git a/tests/reduce-add-int64.ispc b/tests/reduce-add-int64.ispc
index 894dddea..5c85cfef 100644
--- a/tests/reduce-add-int64.ispc
+++ b/tests/reduce-add-int64.ispc
@@ -12,11 +12,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    uniform int x = -1234;
-    if (programCount == 1) x = 1;
-    else if (programCount == 4) x = 10;
-    else if (programCount == 8) x = 36;
-    else if (programCount == 16) x = 136;
+    uniform int x = 0;
+    for (uniform int i = 1; i <= programCount; ++i)
+        x += i;
     RET[programIndex] = x;
 }
 
diff --git a/tests/reduce-add-uint-1.ispc b/tests/reduce-add-uint-1.ispc
index 291200a6..955fa11a 100644
--- a/tests/reduce-add-uint-1.ispc
+++ b/tests/reduce-add-uint-1.ispc
@@ -13,10 +13,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    uniform int x = -1234;
-    if (programCount == 1) x = 1;
-    else if (programCount == 4) x = 4;
-    else if (programCount == 8) x = 16;
-    else if (programCount == 16) x = 64;
+    uniform int x = 0;
+    for (uniform int i = 1; i <= programCount; i += 2)
+        x += i;
     RET[programIndex] = x;
 }
diff --git a/tests/reduce-add-uint64-1.ispc b/tests/reduce-add-uint64-1.ispc
index 5469a898..98ffd2c2 100644
--- a/tests/reduce-add-uint64-1.ispc
+++ b/tests/reduce-add-uint64-1.ispc
@@ -13,10 +13,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    uniform int x = -1234;
-    if (programCount == 1) x = 1;
-    else if (programCount == 4) x = 4;
-    else if (programCount == 8) x = 16;
-    else if (programCount == 16) x = 64;
+    uniform int x = 0;
+    for (uniform int i = 1; i <= programCount; i += 2)
+        x += i;
     RET[programIndex] = x;
 }
diff --git a/tests/ref-as-temporary.ispc b/tests/ref-as-temporary.ispc
new file mode 100644
index 00000000..1b167da6
--- /dev/null
+++ b/tests/ref-as-temporary.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }  // reports programCount to the caller
+
+
+int func(const int &a) { return a+1; }  // takes its argument by const reference and returns a+1
+int bar() { return 0; }  // supplies a function return value (not a named variable) as the argument
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    RET[programIndex] = func(bar());  // bar()'s return value is bound to func's const reference parameter
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;  // func(bar()) == 0 + 1
+}
diff --git a/tests/ref-vec-param-index.ispc b/tests/ref-vec-param-index.ispc
new file mode 100644
index 00000000..70256dc1
--- /dev/null
+++ b/tests/ref-vec-param-index.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }  // reports programCount to the caller
+
+float foo(uniform float<4> &vec) {  // indexes a uniform float<4>, received by reference, with a per-lane index
+    return vec[programIndex & 3];  // programIndex & 3 keeps the index in 0..3
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    uniform float<4> vec = { b, -1, 2*b, -b };
+    RET[programIndex] = foo(vec); // each lane receives element (programIndex & 3) of vec
+}
+
+export void result(uniform float RET[]) {
+    uniform float a[4] = { 5, -1, 10, -5 };  // equals { b, -1, 2*b, -b } for b == 5; assumes the harness passes b = 5 -- TODO confirm
+    RET[programIndex] = a[programIndex & 3];
+}
diff --git a/tests/scatter-struct-with-array-member.ispc b/tests/scatter-struct-with-array-member.ispc
index 081649df..07e9cce3 100644
--- a/tests/scatter-struct-with-array-member.ispc
+++ b/tests/scatter-struct-with-array-member.ispc
@@ -18,6 +18,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
     a *= -1;
     Point vp = { a, { 2*a, 3*a, 4*a }, {5*a} };
+    assert(programCount+2 < 80);
     pts[2+programIndex] = vp;
 
     RET[programIndex] = pts[programIndex].y[2];
diff --git a/tests/scatter-struct.ispc b/tests/scatter-struct.ispc
index b5d58e16..dae97b6f 100644
--- a/tests/scatter-struct.ispc
+++ b/tests/scatter-struct.ispc
@@ -17,8 +17,8 @@ void set(Foo f[], int offset, Foo val) {
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
-    Foo foo[35];
-    for (uniform int i = 0; i < 35; ++i) {
+    Foo foo[programCount+5];
+    for (uniform int i = 0; i < programCount+5; ++i) {
         foo[i].f = a;
         foo[i].a = i;
         foo[i].y = 2*a;
diff --git a/tests/scatter-vector.ispc b/tests/scatter-vector.ispc
index 8882af2b..50ac7c49 100644
--- a/tests/scatter-vector.ispc
+++ b/tests/scatter-vector.ispc
@@ -17,8 +17,8 @@ void set(uniform float3 f[], int offset, float3 val) {
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
 
-    uniform float3 foo[35];
-    for (uniform int i = 0; i < 35; ++i) {
+    uniform float3 foo[programCount+5];
+    for (uniform int i = 0; i < programCount+5; ++i) {
         foo[i].x = i;
         foo[i].y = -1;
         foo[i].z = 2*i;
diff --git a/tests/short-circuit-13.ispc b/tests/short-circuit-13.ispc
deleted file mode 100644
index fb0a94a2..00000000
--- a/tests/short-circuit-13.ispc
+++ /dev/null
@@ -1,25 +0,0 @@
-
-export uniform int width() { return programCount; }
-
-uniform int * uniform ptr;
-
-float foo(uniform float a[]) {
-    int index = (programIndex & 1) * 10000;
-    if (a[programIndex] < 10000 && a[index] == 1)
-        return 1;
-    else
-        return 1234;
-}
-
-export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    float a = aFOO[programIndex]; 
-    float a0 = aFOO[0], a1 = aFOO[1];
-    if ((programIndex & 1) == 0)
-        RET[programIndex] = foo(aFOO);
-    else
-        RET[programIndex] = 2;
-}
-
-export void result(uniform float RET[]) {
-    RET[programIndex] = (programIndex & 1) ? 2 : 1;
-}
diff --git a/tests/soa-1.ispc b/tests/soa-1.ispc
index 37a8f681..ffe391d9 100644
--- a/tests/soa-1.ispc
+++ b/tests/soa-1.ispc
@@ -13,6 +13,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         pts[i].z = 3*b*i;
     }
 
+    assert(programCount < 80);
     RET[programIndex] = pts[programIndex].y;
 }
 
diff --git a/tests/soa-10.ispc b/tests/soa-10.ispc
index 2e1c8d30..b46ec080 100644
--- a/tests/soa-10.ispc
+++ b/tests/soa-10.ispc
@@ -15,6 +15,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
     uniform Point up = pts[1];
 
+    assert(programCount < 80);
     RET[programIndex] = up.y;
 }
 
diff --git a/tests/soa-13.ispc b/tests/soa-13.ispc
index d1f7b056..3738b48b 100644
--- a/tests/soa-13.ispc
+++ b/tests/soa-13.ispc
@@ -17,6 +17,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     uniform Point up = { b, 3, 170 };
     pts[(int64)1] = up;
 
+    assert(programCount < 80);
     RET[programIndex] = pts[(int64)programIndex].z;
 }
 
diff --git a/tests/soa-15.ispc b/tests/soa-15.ispc
index 8e6bb0ce..a4dac089 100644
--- a/tests/soa-15.ispc
+++ b/tests/soa-15.ispc
@@ -15,10 +15,8 @@ static void p(uniform float *uniform ptr) {
 }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    soa<4> Point pts[10];
-//CO    uniform Point pts[40];
-//CO    foreach (i = 0 ... 40) {
-    for (uniform int i = 0; i < 40; ++i) {
+    soa<4> Point pts[30];
+    for (uniform int i = 0; i < 120; ++i) {
         pts[i].x = b*i;
         pts[i].y[0] = 2*b*i;
         pts[i].y[1] = 2*b*i+1;
@@ -26,14 +24,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         pts[i].z = 3*b*i;
     }
 
-//CO    p((uniform float * uniform)&pts[0]);
-
-//CO    print("delta %\n", ((uniform float * varying)(&pts[2+programIndex]) -
-//CO                        (uniform float * uniform)&pts[0]));
-
     float a = aFOO[programIndex]; 
     a *= -1;
     Point vp = { a, { 2*a, 3*a, 4*a }, {5*a} };
+    assert(programCount+2 < 120);
     pts[2+programIndex] = vp;
 
 //CO    p((uniform float * uniform)&pts[0]);
diff --git a/tests/soa-16.ispc b/tests/soa-16.ispc
index 0f8c9c1f..f23c39cb 100644
--- a/tests/soa-16.ispc
+++ b/tests/soa-16.ispc
@@ -15,10 +15,8 @@ static void p(uniform float *uniform ptr) {
 }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    soa<4> Point pts[10];
-//CO    uniform Point pts[40];
-//CO    foreach (i = 0 ... 40) {
-    for (uniform int i = 0; i < 40; ++i) {
+    soa<4> Point pts[30];
+    for (uniform int i = 0; i < 120; ++i) {
         pts[i].x = b*i;
         pts[i].y[0] = 2*b*i;
         pts[i].y[1] = 2*b*i+1;
@@ -26,18 +24,12 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         pts[i].z = 3*b*i;
     }
 
-//CO    p((uniform float * uniform)&pts[0]);
-
-//CO    print("delta %\n", ((uniform float * varying)(&pts[2+programIndex]) -
-//CO                        (uniform float * uniform)&pts[0]));
-
     float a = aFOO[programIndex]; 
     a *= -1;
     Point vp = { a, { 2*a, 3*a, 4*a }, {5*a} };
+    assert(programCount + 2 < 120);
     pts[2+programIndex] = vp;
 
-//CO    p((uniform float * uniform)&pts[0]);
-
     RET[programIndex] = pts[programIndex].y[2];
 }
 
diff --git a/tests/soa-17.ispc b/tests/soa-17.ispc
index 2423cf66..f25b85bd 100644
--- a/tests/soa-17.ispc
+++ b/tests/soa-17.ispc
@@ -16,10 +16,8 @@ static void p(uniform float *uniform ptr) {
 }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    soa<4> Point pts[10];
-//CO    uniform Point pts[40];
-//CO    foreach (i = 0 ... 40) {
-    for (uniform int i = 0; i < 40; ++i) {
+    soa<4> Point pts[40];
+    for (uniform int i = 0; i < 160; ++i) {
         pts[i].x = b*i;
         pts[i].y[0] = 2*b*i;
         pts[i].y[1] = 2*b*i+1;
@@ -27,19 +25,12 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         pts[i].z = 3*b*i;
     }
 
-//CO    p((uniform float * uniform)&pts[0]);
-
-//CO    print("one size %\n", sizeof(soa<4> Point));
-//CO    print("delta %\n", ((uniform int8 * varying)(&pts[2+programIndex]) -
-//CO                        (uniform int8 * uniform)&pts[0]));
-
     float a = aFOO[programIndex]; 
     a *= -1;
     Point vp = { a, { 2*a, 3*a, 4*a }, {5*a} };
+    assert(2+programIndex < 160);
     pts[2+programIndex] = vp;
 
-//CO    p((uniform float * uniform)&pts[0]);
-
     RET[programIndex] = pts[programIndex].y[2];
 }
 
diff --git a/tests/soa-18.ispc b/tests/soa-18.ispc
index 1e8b70f6..39e0a80f 100644
--- a/tests/soa-18.ispc
+++ b/tests/soa-18.ispc
@@ -17,6 +17,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     ++ptr;
     ptr->y = -programIndex;
 
+    assert(1+programCount < 80);
     RET[programIndex] = pts[1+programIndex].y;
 }
 
diff --git a/tests/soa-19.ispc b/tests/soa-19.ispc
index 3b7ad46f..f9a8103b 100644
--- a/tests/soa-19.ispc
+++ b/tests/soa-19.ispc
@@ -16,6 +16,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    assert(6+programIndex < 80);  // check the index before the store through ptr below, not after it
     soa<8> Point * ptr = &pts[6+programIndex];
     ptr->y = -programIndex;;
 
     RET[programIndex] = pts[6+programIndex].y;
 }
 
diff --git a/tests/soa-2.ispc b/tests/soa-2.ispc
index d8ec37e3..e92d7c9b 100644
--- a/tests/soa-2.ispc
+++ b/tests/soa-2.ispc
@@ -13,6 +13,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         pts[i].z = 3*b*i;
     }
 
+    assert(programCount < 80);
     RET[programIndex] = pts[programIndex].z;
 }
 
diff --git a/tests/soa-20.ispc b/tests/soa-20.ispc
index f8a1fe6f..8e87ece6 100644
--- a/tests/soa-20.ispc
+++ b/tests/soa-20.ispc
@@ -12,7 +12,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         pts[i].y = 2*b*i;
         pts[i].z = 3*b*i;
     }
-    
+
+    assert(6+programIndex < 80);
     soa<8> Point * ptr = &pts[6+programIndex];
     RET[programIndex] = ptr - pts;
 }
diff --git a/tests/soa-21.ispc b/tests/soa-21.ispc
index f19ae448..46309788 100644
--- a/tests/soa-21.ispc
+++ b/tests/soa-21.ispc
@@ -12,8 +12,8 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Foo * uniform pts = uniform new soa<8> Foo[4];
-    foreach (i = 0 ... 32) {
+    soa<8> Foo * uniform pts = uniform new soa<8> Foo[11];
+    foreach (i = 0 ... 88) {
         pts[i].x = b*i;
         pts[i].z = -b*i;
         for (uniform int j = 0; j < 10; ++j) {
@@ -23,6 +23,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         }
     }
     
+    assert(7+programCount < 88);
     soa<8> Foo * ptr = &pts[7+programIndex];
     RET[programIndex] = ptr->pts[3].z;
 }
diff --git a/tests/soa-22.ispc b/tests/soa-22.ispc
index 41795c96..60448694 100644
--- a/tests/soa-22.ispc
+++ b/tests/soa-22.ispc
@@ -12,9 +12,8 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Foo * uniform pts = uniform new soa<8> Foo[4];
-//CO    uniform Foo pts[32];
-    foreach (i = 0 ... 32) {
+    soa<8> Foo * uniform pts = uniform new soa<8> Foo[10];
+    foreach (i = 0 ... 80) {
         pts[i].x = b*i;
         pts[i].z = -b*i;
         for (uniform int j = 0; j < 3; ++j) {
@@ -27,6 +26,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         }
     }
     
+    assert(programIndex < 80);
     RET[programIndex] = pts[programIndex].pts[programIndex % 3][programIndex % 4].z;
 }
 
diff --git a/tests/soa-23.ispc b/tests/soa-23.ispc
index 24b7b679..928eba25 100644
--- a/tests/soa-23.ispc
+++ b/tests/soa-23.ispc
@@ -11,15 +11,15 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Foo * uniform pts = uniform new soa<8> Foo[4];
-//CO    uniform Foo pts[32];
-    foreach (i = 0 ... 32) {
+    soa<8> Foo * uniform pts = uniform new soa<8> Foo[10];
+    foreach (i = 0 ... 80) {
         pts[i].vec.x = b*i;
         pts[i].vec.y = -b*i;
         pts[i].vec.z = 2*b*i;
         pts[i].z = i;
     }
-    
+
+    assert(programIndex + 2 < 80);  // the read below uses index programIndex+2; the foreach above stops at 80
     RET[programIndex] = pts[programIndex+2].vec.y;
 }
 
diff --git a/tests/soa-24.ispc b/tests/soa-24.ispc
index 23835c9c..758f094e 100644
--- a/tests/soa-24.ispc
+++ b/tests/soa-24.ispc
@@ -11,9 +11,8 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Foo * uniform pts = uniform new soa<8> Foo[4];
-//CO    uniform Foo pts[32];
-    foreach (i = 0 ... 32) {
+    soa<8> Foo * uniform pts = uniform new soa<8> Foo[10];
+    foreach (i = 0 ... 80) {
         pts[i].vec.x = b*i;
         pts[i].vec.y = -b*i;
         pts[i].vec.z = 2*b*i;
@@ -21,6 +20,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     }
 
     pts[programIndex+2].vec.z *= -1;    
+    assert(programIndex < 80);
     float<3> vl = pts[programIndex].vec;
     RET[programIndex] = vl.z;
 }
diff --git a/tests/soa-25.ispc b/tests/soa-25.ispc
index d3be5253..b21e0d57 100644
--- a/tests/soa-25.ispc
+++ b/tests/soa-25.ispc
@@ -11,9 +11,8 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Foo * uniform pts = uniform new soa<8> Foo[4];
-//CO    uniform Foo pts[32];
-    foreach (i = 0 ... 32) {
+    soa<8> Foo * uniform pts = uniform new soa<8> Foo[10];
+    foreach (i = 0 ... 80) {
         pts[i].vec.x = b*i;
         pts[i].vec.y = -b*i;
         pts[i].vec.z = 2*b*i;
@@ -21,6 +20,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     }
 
     pts[2].vec.x *= -1;    
+    assert(programCount < 80);
     float<3> vl = pts[programIndex].vec;
     RET[programIndex] = vl.x;
 }
diff --git a/tests/soa-26.ispc b/tests/soa-26.ispc
index b765825a..cf1fc6d2 100644
--- a/tests/soa-26.ispc
+++ b/tests/soa-26.ispc
@@ -12,7 +12,6 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
     soa<8> Foo * uniform pts = uniform new soa<8> Foo[4];
-//CO    uniform Foo pts[32];
     for (uniform int i = 0; i < 32; ++i) {
         pts[i].vec.x = b*i;
         pts[i].vec.y = -b*i;
diff --git a/tests/soa-28.ispc b/tests/soa-28.ispc
index 92f3c4a3..a4df02dc 100644
--- a/tests/soa-28.ispc
+++ b/tests/soa-28.ispc
@@ -6,9 +6,9 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Point pts[10];
+    soa<8> Point pts[20];
 
-    foreach (i = b-5 ... 80) {
+    foreach (i = b-5 ... 160) {
         pts[i].x = b*i;
         pts[i].y = 2*b*i;
         pts[i].z = 3*b*i;
@@ -16,6 +16,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
     uniform Point up = pts[4];
 
+    assert(2*programCount < 160);
     RET[programIndex] = pts[2*programIndex].x;
 }
 
diff --git a/tests/soa-29.ispc b/tests/soa-29.ispc
index e9a5a069..9c2d6f28 100644
--- a/tests/soa-29.ispc
+++ b/tests/soa-29.ispc
@@ -6,9 +6,9 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Point pts[10];
+    soa<8> Point pts[20];
 
-    for (int i = programIndex; i < 16*b; i += programCount) {
+    for (int i = programIndex; i < 32*b; i += programCount) {
         pts[i].x = b*i;
         pts[i].y = 2*b*i;
         pts[i].z = 3*b*i;
@@ -16,6 +16,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
     uniform Point up = pts[4];
 
+    assert(2*programIndex < 160);
     RET[programIndex] = pts[2*programIndex].x;
 }
 
diff --git a/tests/soa-3.ispc b/tests/soa-3.ispc
index 0eed0bf4..2cec07a5 100644
--- a/tests/soa-3.ispc
+++ b/tests/soa-3.ispc
@@ -16,6 +16,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
         pts[i].z = 3*b*i;
     }
 
+    assert(programCount < 80);
     RET[programIndex] = pts[programIndex].y[2];
 }
 
diff --git a/tests/soa-4.ispc b/tests/soa-4.ispc
index 3af768fb..88a35393 100644
--- a/tests/soa-4.ispc
+++ b/tests/soa-4.ispc
@@ -6,14 +6,15 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Point pts[8];
-    foreach (i = 0 ... 64) {
+    soa<8> Point pts[10];
+    foreach (i = 0 ... 80) {
         pts[i].x = 0;
         pts[i].y = 0;
         pts[i].z = 0;
     }
 
     Point pv = { a, b, -a };
+    assert(8+programCount < 80);
     pts[8+programIndex] = pv;
 
     RET[programIndex] = pts[8+programIndex].z;
diff --git a/tests/soa-5.ispc b/tests/soa-5.ispc
index ae775c65..070375c6 100644
--- a/tests/soa-5.ispc
+++ b/tests/soa-5.ispc
@@ -6,14 +6,15 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Point pts[8];
-    foreach (i = 0 ... 64) {
+    soa<8> Point pts[9];
+    foreach (i = 0 ... 72) {
         pts[i].x = 0;
         pts[i].y = 0;
         pts[i].z = 0;
     }
 
     Point pv = { a, b, -a };
+    assert(6+programCount < 72);
     pts[6+programIndex] = pv;
 
     RET[programIndex] = pts[6+programIndex].x;
diff --git a/tests/soa-6.ispc b/tests/soa-6.ispc
index 4dd8439e..942b2d4e 100644
--- a/tests/soa-6.ispc
+++ b/tests/soa-6.ispc
@@ -6,14 +6,15 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Point pts[8];
-    foreach (i = 0 ... 64) {
+    soa<8> Point pts[11];
+    foreach (i = 0 ... 88) {
         pts[i].x = -42;
         pts[i].y = 0;
         pts[i].z = 0;
     }
 
     Point pv = { a, b, -a };
+    assert(8+programCount < 88);
     pts[8+programIndex] = pv;
 
     RET[programIndex] = pts[6+programIndex].x;
diff --git a/tests/soa-7.ispc b/tests/soa-7.ispc
index 042cf3dc..a7163aa4 100644
--- a/tests/soa-7.ispc
+++ b/tests/soa-7.ispc
@@ -6,14 +6,15 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Point pts[8];
-    foreach (i = 0 ... 64) {
+    soa<8> Point pts[11];
+    foreach (i = 0 ... 88) {
         pts[i].x = -42;
         pts[i].y = 0;
         pts[i].z = 0;
     }
 
     Point pv = { a, b, -a };
+    assert(8+programCount < 88);
     pts[8+programIndex].x = pv.x;
     pts[8+programIndex].y = pv.y;
     pts[8+programIndex].z = pv.z;
diff --git a/tests/soa-8.ispc b/tests/soa-8.ispc
index 0235fc2c..1e4b88d6 100644
--- a/tests/soa-8.ispc
+++ b/tests/soa-8.ispc
@@ -6,14 +6,15 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex]; 
 
-    soa<8> Point pts[8];
-    foreach (i = 0 ... 64) {
+    soa<8> Point pts[12];
+    foreach (i = 0 ... 96) {
         pts[i].x = -42;
         pts[i].y = 0;
         pts[i].z = 0;
     }
 
     Point pv = { a, b, -a };
+    assert(8+programCount < 96);
     pts[7+programIndex] = pv;
 
     RET[programIndex] = pts[8+programIndex].x;
diff --git a/tests/soa-9.ispc b/tests/soa-9.ispc
index 0249434c..872065b9 100644
--- a/tests/soa-9.ispc
+++ b/tests/soa-9.ispc
@@ -16,6 +16,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     uniform Point up = { b, 3, 170 };
     pts[1] = up;
 
+    assert(programCount < 80);
     RET[programIndex] = pts[programIndex].z;
 }
 
diff --git a/tests/struct-forward-decl-2.ispc b/tests/struct-forward-decl-2.ispc
new file mode 100644
index 00000000..2660c541
--- /dev/null
+++ b/tests/struct-forward-decl-2.ispc
@@ -0,0 +1,36 @@
+// Uses a forward-declared struct in a function prototype before the struct is defined.
+export uniform int width() { return programCount; }
+
+struct Foo;
+// Prototype only: Foo is still incomplete here, and bing() is defined at the bottom of the file.
+void bing(Foo * uniform);
+
+struct Foo {
+    int i;
+    varying float f;
+    Foo * uniform next;
+};
+
+void bar(Foo * uniform f) {
+    bing(f);
+}
+
+// Links fa -> fb, loads the input lanes into fb.f, then calls bar(&fa); bing() follows next and doubles fb.f.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform Foo fa, fb;
+    fa.next = &fb;
+    fb.f = aFOO[programIndex]; 
+    fb.i = 100;
+    bar(&fa);
+    RET[programIndex] = fb.f; 
+}
+
+// Definition of the bing() declared above: steps to f->next and doubles that node's f member.
+void bing(Foo * uniform f) {
+    f = f->next;
+    f->f *= 2;
+}
+// Writes 2 + 2*programIndex per lane (presumably the values the harness compares f_f against).
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 + 2*programIndex;
+}
diff --git a/tests/struct-forward-decl.ispc b/tests/struct-forward-decl.ispc
new file mode 100644
index 00000000..54f09be6
--- /dev/null
+++ b/tests/struct-forward-decl.ispc
@@ -0,0 +1,33 @@
+// Declares a function taking a pointer to a forward-declared struct before the struct is defined.
+export uniform int width() { return programCount; }
+
+struct Foo;
+// Prototype only: Foo is not defined until the struct block below.
+void bing(varying Foo * uniform);
+
+struct Foo {
+    float f;
+    int i;
+};
+
+void bar(varying Foo * uniform f) {
+    bing(f);
+}
+
+// Passes the address of a local Foo through bar() to bing(), which doubles f.f in place.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    Foo f;
+    f.f = aFOO[programIndex]; 
+    f.i = programIndex;
+    bar(&f);
+    RET[programIndex] = f.f; 
+}
+
+// Definition matching the prototype above: doubles the f member through the pointer.
+void bing(varying Foo * uniform f) {
+    f->f *= 2;
+}
+// Writes 2 + 2*programIndex per lane (presumably the values the harness compares f_f against).
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 + 2*programIndex;
+}
diff --git a/tests/struct-gather-2.ispc b/tests/struct-gather-2.ispc
index cfd427b7..6411ae90 100644
--- a/tests/struct-gather-2.ispc
+++ b/tests/struct-gather-2.ispc
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+5];
     uniform int i;
-    for (i = 0; i < 17; ++i)
+    for (i = 0; i < programCount+5; ++i)
         foo[i].f = i*a;
     RET[programIndex] = func(foo, (int)a);
 }
diff --git a/tests/struct-gather-3.ispc b/tests/struct-gather-3.ispc
index cfd427b7..6de2c123 100644
--- a/tests/struct-gather-3.ispc
+++ b/tests/struct-gather-3.ispc
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
     uniform int i;
-    for (i = 0; i < 17; ++i)
+    for (i = 0; i < programCount+1; ++i)
         foo[i].f = i*a;
     RET[programIndex] = func(foo, (int)a);
 }
diff --git a/tests/struct-gather.ispc b/tests/struct-gather.ispc
index efa42e1d..a92a5862 100644
--- a/tests/struct-gather.ispc
+++ b/tests/struct-gather.ispc
@@ -9,9 +9,9 @@ struct Foo {
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    Foo foo[17];
+    Foo foo[programCount+1];
     uniform int i;
-    for (i = 0; i < 17; ++i)
+    for (i = 0; i < programCount+1; ++i)
         foo[i].f = i*a;
     RET[programIndex] = foo[(int)a].f;
 }
diff --git a/tests/struct-ref-lvalue.ispc b/tests/struct-ref-lvalue.ispc
index 535a1a0f..5cfbdf31 100644
--- a/tests/struct-ref-lvalue.ispc
+++ b/tests/struct-ref-lvalue.ispc
@@ -11,8 +11,8 @@ void f(Foo foo[], float a) {
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    Foo foo[17];
-    for (uniform int i = 0; i < 17; ++i)
+    Foo foo[programCount+5];
+    for (uniform int i = 0; i < programCount+5; ++i)
         foo[i].f = a;
     f(foo, a);
     RET[programIndex] = foo[a].f;
diff --git a/tests/struct-test-114.ispc b/tests/struct-test-114.ispc
index 66c4b07c..b5d8b18b 100644
--- a/tests/struct-test-114.ispc
+++ b/tests/struct-test-114.ispc
@@ -10,12 +10,13 @@ struct Foo {
 export void f_fi(uniform float RET[], uniform float aFOO[], uniform int bFOO[]) {
     float a = aFOO[programIndex];
     int b = bFOO[programIndex];
-    varying Foo myFoo[17];
+    varying Foo myFoo[128];
     uniform int i;
-    for (i = 0; i < 17; ++i) {
+    for (i = 0; i < 128; ++i) {
         myFoo[i].x = i;
         myFoo[i].f = 2*i;
     }
+    assert(b/2 < 128);  // b/2 indexes myFoo below, which is declared with 128 entries
     RET[programIndex] = myFoo[b/2].f;
 }
 
diff --git a/tests/struct-vary-index-expr.ispc b/tests/struct-vary-index-expr.ispc
index dbf8de8f..169a9570 100644
--- a/tests/struct-vary-index-expr.ispc
+++ b/tests/struct-vary-index-expr.ispc
@@ -7,13 +7,14 @@ struct Foo { float f; };
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    Foo foo[17];
-    for (uniform int i = 0; i < 17; ++i)
+    Foo foo[programCount+1];
+    uniform int ind[programCount+1];  // filled with ind[i] = i+1 in the loop below
+    for (uniform int i = 0; i < programCount+1; ++i) {
         foo[i].f = a;
+        ind[i] = i+1;
+    }
     ++foo[a].f;
-    assert(programCount <= 16);
-    uniform int i[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 };
-    RET[programIndex] = foo[i[programIndex]].f;
+    RET[programIndex] = foo[ind[programIndex]].f;  // ind[programIndex] is programIndex+1
 }
 
 export void result(uniform float RET[]) { RET[programIndex] = 2+programIndex; }
diff --git a/tests/struct-zero-len-array-member.ispc b/tests/struct-zero-len-array-member.ispc
new file mode 100644
index 00000000..83e91854
--- /dev/null
+++ b/tests/struct-zero-len-array-member.ispc
@@ -0,0 +1,24 @@
+// Struct whose last member is a zero-length array; f_f indexes that array into separately allocated storage.
+struct Foo {
+    float x;
+    float a[0];
+};
+
+export uniform int width() { return programCount; }
+
+// Allocates nFloats varying int32 values, views them as a varying Foo, and sets a[i] = i for i < nFloats-1.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int nFloats = 3+programCount;
+    varying Foo * uniform ptr = (varying Foo * uniform)(uniform new varying int32[nFloats]);
+    memset(ptr, 0, nFloats*sizeof(int32));
+    // Loop bound is nFloats-1: presumably one allocated slot backs x and the rest back a[] -- TODO confirm layout.
+    for (uniform int i = 0; i < nFloats-1; ++i)
+        ptr->a[i] = i;
+    ptr->x = aFOO[programIndex]; 
+
+    RET[programIndex] = ptr->a[1+programIndex]; 
+}
+// Writes 1 + programIndex per lane, the value f_f stored at a[1+programIndex].
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/switch-10.ispc b/tests/switch-10.ispc
index abd94d7c..2957aee6 100644
--- a/tests/switch-10.ispc
+++ b/tests/switch-10.ispc
@@ -21,7 +21,7 @@ int switchit(int a, uniform int b) {
         }
         return -1234;
     }
-    case 32:
+    case 9999:
         *((int *)NULL) = 0;
     default:
         return 0;
diff --git a/tests/switch-11.ispc b/tests/switch-11.ispc
index daacdf76..c520f4f5 100644
--- a/tests/switch-11.ispc
+++ b/tests/switch-11.ispc
@@ -27,7 +27,7 @@ int switchit(int a, uniform int b) {
         }
         return 42;
     }
-    case 32:
+    case 9999:
         *((int *)NULL) = 0;
     default:
         return 0;
diff --git a/tests/switch-12.ispc b/tests/switch-12.ispc
index 9a803012..67e4d076 100644
--- a/tests/switch-12.ispc
+++ b/tests/switch-12.ispc
@@ -31,7 +31,7 @@ int switchit(int a, uniform int b) {
         }
         return 42;
     }
-    case 32:
+    case 9999:
         *((int *)NULL) = 0;
     default:
         return 0;
diff --git a/tests/switch-8.ispc b/tests/switch-8.ispc
index ca1848e8..24297e2f 100644
--- a/tests/switch-8.ispc
+++ b/tests/switch-8.ispc
@@ -12,7 +12,7 @@ int switchit(int a, uniform int b) {
         if (a & 1)
             break;
         return 2;
-    case 32:
+    case 9999:
         *((int *)NULL) = 0;
     default:
     case 1:
diff --git a/tests/switch-9.ispc b/tests/switch-9.ispc
index 9bfd0d03..3d3f8318 100644
--- a/tests/switch-9.ispc
+++ b/tests/switch-9.ispc
@@ -12,7 +12,7 @@ int switchit(int a, uniform int b) {
         if (a & 1)
             break;
         return 2;
-    case 32:
+    case 9999:
         *((int *)NULL) = 0;
     default:
         return 0;
diff --git a/tests/test-103.ispc b/tests/test-103.ispc
index 1c53213a..c536d032 100644
--- a/tests/test-103.ispc
+++ b/tests/test-103.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
-    RET[programIndex] = a < 17.; 
+    RET[programIndex] = a < 65.; 
 }
 
 
diff --git a/tests/test-125.ispc b/tests/test-125.ispc
index d92d671c..e4a12811 100644
--- a/tests/test-125.ispc
+++ b/tests/test-125.ispc
@@ -5,8 +5,12 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
     if (a < 3) {
-        if (all(a < 3)) 
+        if (all(a < 256)) 
             RET[programIndex] = 1;
+        else {  // debug output for the case where all(a < 256) is false; RET is not written on this path
+            print("FALSE %\n", a);
+            print("programCount %\n", programCount);
+        }
     }
     else RET[programIndex] = 0;
 }
diff --git a/tests/test-60.ispc b/tests/test-60.ispc
index fbdf8bd3..87f4fac2 100644
--- a/tests/test-60.ispc
+++ b/tests/test-60.ispc
@@ -4,12 +4,12 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    while (a < 20)
+    while (a < 64)
         ++a;
     RET[programIndex] = a;
 }
 
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 20;
+    RET[programIndex] = 64;
 }
diff --git a/tests/test-64.ispc b/tests/test-64.ispc
index 5a674a95..b9af26cc 100644
--- a/tests/test-64.ispc
+++ b/tests/test-64.ispc
@@ -19,8 +19,20 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 2;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 5;
-    RET[3] = RET[7] = RET[11] = RET[15] = 6;
+    RET[0] = RET[4] = RET[8] = RET[12] =\
+        RET[16] = RET[20] = RET[24] = RET[28] =\
+        RET[32] = RET[36] = RET[40] = RET[44] =\
+        RET[48] = RET[52] = RET[56] = RET[60] = 2;  // indices 0, 4, 8, ..., 60
+    RET[1] = RET[5] = RET[9] = RET[13] =\
+        RET[17] = RET[21] = RET[25] = RET[29] =\
+        RET[33] = RET[37] = RET[41] = RET[45] =\
+        RET[49] = RET[53] = RET[57] = RET[61] = 3;  // indices 1, 5, 9, ..., 61
+    RET[2] = RET[6] = RET[10] = RET[14] =\
+        RET[18] = RET[22] = RET[26] = RET[30] =\
+        RET[34] = RET[38] = RET[42] = RET[46] =\
+        RET[50] = RET[54] = RET[58] = RET[62] = 5;  // indices 2, 6, 10, ..., 62
+    RET[3] = RET[7] = RET[11] = RET[15] =\
+        RET[19] = RET[23] = RET[27] = RET[31] =\
+        RET[35] = RET[39] = RET[43] = RET[47] =\
+        RET[51] = RET[55] = RET[59] = RET[63] = 6;  // indices 3, 7, 11, ..., 63
 }
diff --git a/tests/test-65.ispc b/tests/test-65.ispc
index 71b8b39c..7510b0d5 100644
--- a/tests/test-65.ispc
+++ b/tests/test-65.ispc
@@ -17,8 +17,20 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 1;
-    RET[1] = RET[5] = RET[9] = RET[13] = 3;
-    RET[2] = RET[6] = RET[10] = RET[14] = 3;
-    RET[3] = RET[7] = RET[11] = RET[15] = 29;
+    RET[0] = RET[4] = RET[8] = RET[12] =\
+        RET[16] = RET[20] = RET[24] = RET[28] =\
+        RET[32] = RET[36] = RET[40] = RET[44] =\
+        RET[48] = RET[52] = RET[56] = RET[60] = 1;  // indices 0, 4, 8, ..., 60
+    RET[1] = RET[5] = RET[9] = RET[13] =\
+        RET[17] = RET[21] = RET[25] = RET[29] =\
+        RET[33] = RET[37] = RET[41] = RET[45] =\
+        RET[49] = RET[53] = RET[57] = RET[61] = 3;  // indices 1, 5, 9, ..., 61
+    RET[2] = RET[6] = RET[10] = RET[14] =\
+        RET[18] = RET[22] = RET[26] = RET[30] =\
+        RET[34] = RET[38] = RET[42] = RET[46] =\
+        RET[50] = RET[54] = RET[58] = RET[62] = 3;  // indices 2, 6, 10, ..., 62
+    RET[3] = RET[7] = RET[11] = RET[15] =\
+        RET[19] = RET[23] = RET[27] = RET[31] =\
+        RET[35] = RET[39] = RET[43] = RET[47] =\
+        RET[51] = RET[55] = RET[59] = RET[63] = 29;  // indices 3, 7, 11, ..., 63
 }
diff --git a/tests/test-66.ispc b/tests/test-66.ispc
index 8f69e8a5..a7fe0adc 100644
--- a/tests/test-66.ispc
+++ b/tests/test-66.ispc
@@ -18,8 +18,20 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[0] = RET[4] = RET[8] = RET[12] = 32;
-    RET[1] = RET[5] = RET[9] = RET[13] = 32;
-    RET[2] = RET[6] = RET[10] = RET[14] = 38;
-    RET[3] = RET[7] = RET[11] = RET[15] = 39;
+    RET[0] = RET[4] = RET[8] = RET[12] =\
+        RET[16] = RET[20] = RET[24] = RET[28] =\
+        RET[32] = RET[36] = RET[40] = RET[44] =\
+        RET[48] = RET[52] = RET[56] = RET[60] = 32;  // indices 0, 4, 8, ..., 60
+    RET[1] = RET[5] = RET[9] = RET[13] =\
+        RET[17] = RET[21] = RET[25] = RET[29] =\
+        RET[33] = RET[37] = RET[41] = RET[45] =\
+        RET[49] = RET[53] = RET[57] = RET[61] = 32;  // indices 1, 5, 9, ..., 61
+    RET[2] = RET[6] = RET[10] = RET[14] =\
+        RET[18] = RET[22] = RET[26] = RET[30] =\
+        RET[34] = RET[38] = RET[42] = RET[46] =\
+        RET[50] = RET[54] = RET[58] = RET[62] = 38;  // indices 2, 6, 10, ..., 62
+    RET[3] = RET[7] = RET[11] = RET[15] =\
+        RET[19] = RET[23] = RET[27] = RET[31] =\
+        RET[35] = RET[39] = RET[43] = RET[47] =\
+        RET[51] = RET[55] = RET[59] = RET[63] = 39;  // indices 3, 7, 11, ..., 63
 }
diff --git a/tests/transcendentals-0-2.ispc b/tests/transcendentals-0-2.ispc
index ed75d71c..dad39715 100644
--- a/tests/transcendentals-0-2.ispc
+++ b/tests/transcendentals-0-2.ispc
@@ -13,7 +13,12 @@ static float float4(uniform float a, uniform float b, uniform float c,
 export uniform int width() { return programCount; }
 
 
-bool ok(float x, float ref) { return (abs(x - ref) < 1e-6) || abs((x-ref)/ref) < 1e-5; }
+bool ok(float x, float ref) { // true where x is within 1e-4 of ref, by absolute or relative error
+    bool r = (abs(x - ref) < 1e-4) || abs((x-ref)/ref) < 1e-4;
+    if (any(r == false))  // at least one lane failed the check: print x and ref
+        print("mismatch got %, expected %\n", x, ref);
+    return r;
+}
 
 export void f_v(uniform float RET[]) {
     float v = float4((-9.424777984619141),(4.000000000000000),(10.000000000000000),(-10.000000000000000));
diff --git a/tests/transcendentals-1-3.ispc b/tests/transcendentals-1-3.ispc
index cefa3547..a4167e89 100644
--- a/tests/transcendentals-1-3.ispc
+++ b/tests/transcendentals-1-3.ispc
@@ -13,7 +13,13 @@ static float float4(uniform float a, uniform float b, uniform float c,
 export uniform int width() { return programCount; }
 
 
-bool ok(float x, float ref) { return (abs(x - ref) < 1e-6) || abs((x-ref)/ref) < 1e-5; }
+bool ok(float x, float ref) { // true where x is within 1e-5 of ref, by absolute or relative error
+    bool r = (abs(x - ref) < 1e-5) || abs((x-ref)/ref) < 1e-5;
+    if (any(r == false))  // at least one lane failed the check: print x and ref
+        print("mismatch got %, expected %\n", x, ref);
+    return r;
+}
+
 
 export void f_v(uniform float RET[]) {
     float v = float4((14.300000190734863),(-6.699999809265137),(-21.200000762939453),(9.000000000000000));
diff --git a/tests/typecast-void-funcall-1.ispc b/tests/typecast-void-funcall-1.ispc
new file mode 100644
index 00000000..c9aa0ed7
--- /dev/null
+++ b/tests/typecast-void-funcall-1.ispc
@@ -0,0 +1,17 @@
+// Calls a function returning a varying float and discards the result with a (void) cast.
+export uniform int width() { return programCount; }
+// Besides returning a+b, writes it to result[programIndex]; f_f discards the return value.
+float add(float a, float b, uniform float * uniform result) {
+    result[programIndex] = a+b;
+    return a+b;
+}
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0.; b = a; 
+    (void)add(a, b, RET);
+}
+// Writes 2 + 2*programIndex per lane (presumably the values the harness compares f_f against).
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 + 2*programIndex;
+}
diff --git a/tests/typecast-void-funcall.ispc b/tests/typecast-void-funcall.ispc
new file mode 100644
index 00000000..f2431ef9
--- /dev/null
+++ b/tests/typecast-void-funcall.ispc
@@ -0,0 +1,17 @@
+// Calls a function returning a uniform float and discards the result with a (void) cast.
+export uniform int width() { return programCount; }
+// Writes a+b to result[programIndex]; the returned constant 1 is discarded by the caller in f_f.
+uniform float add(float a, float b, uniform float * uniform result) {
+    result[programIndex] = a+b;
+    return 1;
+}
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0.; b = a; 
+    (void)add(a, b, RET);
+}
+// Writes 2 + 2*programIndex per lane (presumably the values the harness compares f_f against).
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 + 2*programIndex;
+}
diff --git a/tests/typedef-2.ispc b/tests/typedef-2.ispc
index 9217ff2c..e8117dd4 100644
--- a/tests/typedef-2.ispc
+++ b/tests/typedef-2.ispc
@@ -19,7 +19,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     for (uniform int i = 0; i < 16; ++i)
         for (uniform int j = 0; j < 16; ++j)
             bar.foo[i].x[j] = b;
-    RET[programIndex] = bar.foo[a-1].x[a-1];
+    RET[programIndex] = bar.foo[min(15, a-1)].x[min(15, a-1)];
 }
 
 export void result(uniform float RET[]) { RET[programIndex] = 5; }
diff --git a/tests/unif-struct-test-114.ispc b/tests/unif-struct-test-114.ispc
index d5c2e08a..dbee219c 100644
--- a/tests/unif-struct-test-114.ispc
+++ b/tests/unif-struct-test-114.ispc
@@ -8,12 +8,13 @@ struct Foo {
 };
 export void f_fi(uniform float RET[], uniform float a[], uniform int bFOO[]) {
     int b = bFOO[programIndex];
-    uniform struct Foo myFoo[17];
+    uniform struct Foo myFoo[256];
     uniform int i;
-    for (i = 0; i < 17; ++i) {
+    for (i = 0; i < 256; ++i) {
         myFoo[i].x = i;
         myFoo[i].f = 17+2*i;
     }
+    assert(b/2 < 256);  // b/2 indexes myFoo below, which is declared with 256 entries
     RET[programIndex] = myFoo[b/2].f;
 }
 
diff --git a/tests/varying-struct-3.ispc b/tests/varying-struct-3.ispc
index 1691e855..af64878c 100644
--- a/tests/varying-struct-3.ispc
+++ b/tests/varying-struct-3.ispc
@@ -15,7 +15,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     RET[programIndex] = g.x;
 }
 
-export void result(uniform float RET[4]) { 
+export void result(uniform float RET[]) { 
     RET[programIndex] = 15;
     RET[0] = RET[1] = 10;
 }
diff --git a/tests/varying-struct-6.ispc b/tests/varying-struct-6.ispc
index 450e76e6..08bba06c 100644
--- a/tests/varying-struct-6.ispc
+++ b/tests/varying-struct-6.ispc
@@ -18,4 +18,4 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     RET[programIndex] = bar.uf.x + bar.vf.y;
 }
 
-export void result(uniform float RET[4]) { RET[programIndex] = 6+programIndex; }
+export void result(uniform float RET[]) { RET[programIndex] = 6+programIndex; }
diff --git a/tests_errors/addr-of-1.ispc b/tests_errors/addr-of-1.ispc
new file mode 100644
index 00000000..4d770f01
--- /dev/null
+++ b/tests_errors/addr-of-1.ispc
@@ -0,0 +1,5 @@
+// Illegal to take address of non-lvalue or function
+
+void foo() {
+    int *ptr = &(1+1);
+}
diff --git a/tests_errors/deref-3.ispc b/tests_errors/deref-3.ispc
index 19d4e82d..d7e6e906 100644
--- a/tests_errors/deref-3.ispc
+++ b/tests_errors/deref-3.ispc
@@ -1,4 +1,4 @@
-// Dereference operator "->" can't be applied to non-pointer type "varying struct Foo"
+// Member operator "->" can't be applied to non-pointer type "varying struct Foo"
 
 struct Foo { int x; };
 
diff --git a/tests_errors/export-multiple-name.ispc b/tests_errors/export-multiple-name.ispc
new file mode 100644
index 00000000..11a2c896
--- /dev/null
+++ b/tests_errors/export-multiple-name.ispc
@@ -0,0 +1,5 @@
+// Illegal to have "export" function with same name as previously declared function
+
+export void foo() { }
+
+export void foo(uniform int x) { }
diff --git a/tests_errors/func-def-with-typedef.ispc b/tests_errors/func-def-with-typedef.ispc
new file mode 100644
index 00000000..a750a9be
--- /dev/null
+++ b/tests_errors/func-def-with-typedef.ispc
@@ -0,0 +1,4 @@
+// Illegal "typedef" provided with function definition
+
+typedef float foo(float a, float b) { }
+
diff --git a/tests_errors/func-param-mismatch-2.ispc b/tests_errors/func-param-mismatch-2.ispc
index 09b27064..63c0239a 100644
--- a/tests_errors/func-param-mismatch-2.ispc
+++ b/tests_errors/func-param-mismatch-2.ispc
@@ -1,4 +1,4 @@
-// Unable to find matching overload for call to function
+// Unable to find any matching overload for call to function
 
 void foo(int x);
 
diff --git a/tests_errors/func-param-mismatch-3.ispc b/tests_errors/func-param-mismatch-3.ispc
index 7e5f2b99..cb34c8a7 100644
--- a/tests_errors/func-param-mismatch-3.ispc
+++ b/tests_errors/func-param-mismatch-3.ispc
@@ -1,4 +1,4 @@
-// Unable to find matching overload for call to function
+// Unable to find any matching overload for call to function
 
 void foo(int x);
 
diff --git a/tests_errors/func-param-mismatch.ispc b/tests_errors/func-param-mismatch.ispc
index c2bac94f..44a50903 100644
--- a/tests_errors/func-param-mismatch.ispc
+++ b/tests_errors/func-param-mismatch.ispc
@@ -1,4 +1,4 @@
-// Unable to find matching overload for call to function
+// Unable to find any matching overload for call to function
 
 void foo();
 
diff --git a/tests_errors/global-decl-1.ispc b/tests_errors/global-decl-1.ispc
new file mode 100644
index 00000000..6f111bbf
--- /dev/null
+++ b/tests_errors/global-decl-1.ispc
@@ -0,0 +1,4 @@
+// Definition of variable "foo" conflicts with definition at
+
+extern int foo;
+float foo;
diff --git a/tests_errors/global-decl-2.ispc b/tests_errors/global-decl-2.ispc
new file mode 100644
index 00000000..66647ea7
--- /dev/null
+++ b/tests_errors/global-decl-2.ispc
@@ -0,0 +1,4 @@
+// Definition of variable "foo" conflicts with definition at 
+
+extern int foo;
+extern float foo;
diff --git a/tests_errors/global-redef-1.ispc b/tests_errors/global-redef-1.ispc
new file mode 100644
index 00000000..7ebb3da7
--- /dev/null
+++ b/tests_errors/global-redef-1.ispc
@@ -0,0 +1,4 @@
+// Definition of variable "foo" conflicts with definition at
+
+int foo;
+float foo;
diff --git a/tests_errors/global-redef.ispc b/tests_errors/global-redef.ispc
new file mode 100644
index 00000000..9a2df32f
--- /dev/null
+++ b/tests_errors/global-redef.ispc
@@ -0,0 +1,4 @@
+// Redefinition of variable "foo" is illegal
+
+int foo;
+int foo;
diff --git a/tests_errors/goto-5.ispc b/tests_errors/goto-5.ispc
new file mode 100644
index 00000000..5b547982
--- /dev/null
+++ b/tests_errors/goto-5.ispc
@@ -0,0 +1,6 @@
+
+void func(int x) {
+completelydifferentlabel:
+    goto label;
+}
+
diff --git a/tests_errors/goto-6.ispc b/tests_errors/goto-6.ispc
new file mode 100644
index 00000000..7412cd3a
--- /dev/null
+++ b/tests_errors/goto-6.ispc
@@ -0,0 +1,9 @@
+
+void func(int x) {
+libel:
+babel:
+lbel:
+label2:
+    goto label;
+}
+
diff --git a/tests_errors/ptr-1.ispc b/tests_errors/ptr-1.ispc
index 66a9bff4..97a88488 100644
--- a/tests_errors/ptr-1.ispc
+++ b/tests_errors/ptr-1.ispc
@@ -1,4 +1,4 @@
-// Can't convert between incompatible pointer types
+// Can't convert from pointer type "void * varying" to incompatible pointer type "uniform int32 * varying" for return statement
 
 int *foo(void *p) {
     return p;
diff --git a/tests_errors/ref-initializer-1.ispc b/tests_errors/ref-initializer-1.ispc
new file mode 100644
index 00000000..c926c793
--- /dev/null
+++ b/tests_errors/ref-initializer-1.ispc
@@ -0,0 +1,6 @@
+// Initializer for reference-type variable "x" must have an lvalue type
+
+float &func(uniform float a[], int i, float f) {
+    float &x = 1.; // a[i];
+}
+
diff --git a/tests_errors/ref-initializer-2.ispc b/tests_errors/ref-initializer-2.ispc
new file mode 100644
index 00000000..4612addf
--- /dev/null
+++ b/tests_errors/ref-initializer-2.ispc
@@ -0,0 +1,6 @@
+// Initializer for reference-type variable "x" must have a uniform lvalue type
+
+float &func(uniform float a[], int i, float f) {
+    float &x = a[i];
+}
+
diff --git a/tests_errors/ref-initializer-3.ispc b/tests_errors/ref-initializer-3.ispc
new file mode 100644
index 00000000..27833b54
--- /dev/null
+++ b/tests_errors/ref-initializer-3.ispc
@@ -0,0 +1,6 @@
+// Initializer for reference-type variable "x" must have a uniform lvalue type
+
+float &func(uniform int a[], int i, float f) {
+    float &x = a[i];
+}
+
diff --git a/tests_errors/return-ref-1.ispc b/tests_errors/return-ref-1.ispc
new file mode 100644
index 00000000..fee20b18
--- /dev/null
+++ b/tests_errors/return-ref-1.ispc
@@ -0,0 +1,5 @@
+// Illegal to return non-lvalue from function returning reference type
+
+float &func(uniform float a[], int i, float f) {
+    return 1.f;
+}
diff --git a/tests_errors/return-ref-2.ispc b/tests_errors/return-ref-2.ispc
new file mode 100644
index 00000000..6ed667c1
--- /dev/null
+++ b/tests_errors/return-ref-2.ispc
@@ -0,0 +1,5 @@
+// Illegal to return varying lvalue type from function returning a reference type
+
+float &func(uniform float a[], int i, float f) {
+    return a[i];
+}
diff --git a/tests_errors/struct-ref-undecl-1.ispc b/tests_errors/struct-ref-undecl-1.ispc
new file mode 100644
index 00000000..0d851117
--- /dev/null
+++ b/tests_errors/struct-ref-undecl-1.ispc
@@ -0,0 +1,5 @@
+// Member operator "." can't be applied to declared but not defined struct type
+
+struct Foo;
+
+int bar(Foo & foo) { return foo.x; }
diff --git a/tests_errors/struct-ref-undecl-2.ispc b/tests_errors/struct-ref-undecl-2.ispc
new file mode 100644
index 00000000..bb233ccc
--- /dev/null
+++ b/tests_errors/struct-ref-undecl-2.ispc
@@ -0,0 +1,5 @@
+// Member operator "->" can't be applied to declared but not defined struct type
+
+struct Foo;
+
+int bar(Foo * uniform foo) { return foo->x; }
diff --git a/tests_errors/struct-unsized-array.ispc b/tests_errors/struct-unsized-array.ispc
index 77553eff..7238a351 100644
--- a/tests_errors/struct-unsized-array.ispc
+++ b/tests_errors/struct-unsized-array.ispc
@@ -1,4 +1,4 @@
-// Unsized arrays aren't allowed in struct definitions
+// Unsized arrays aren't allowed except for the last member in a struct definition.
 
 struct Foo {
     float a[];
diff --git a/tests_errors/undef-struct-new.ispc b/tests_errors/undef-struct-new.ispc
new file mode 100644
index 00000000..3a9037c3
--- /dev/null
+++ b/tests_errors/undef-struct-new.ispc
@@ -0,0 +1,7 @@
+// Can't dynamically allocate storage for declared but not defined type
+
+struct Foo;
+
+Foo * uniform bar() {
+    return uniform new Foo;
+}
diff --git a/tests_errors/undef-struct-ptrmath-1.ispc b/tests_errors/undef-struct-ptrmath-1.ispc
new file mode 100644
index 00000000..861c66fb
--- /dev/null
+++ b/tests_errors/undef-struct-ptrmath-1.ispc
@@ -0,0 +1,7 @@
+// Illegal to perform pointer arithmetic on undefined struct type
+
+struct Foo;
+
+Foo * uniform bar(Foo * uniform f) {
+    return f + 1;
+}
diff --git a/tests_errors/undef-struct-ptrmath-2.ispc b/tests_errors/undef-struct-ptrmath-2.ispc
new file mode 100644
index 00000000..dfaab13c
--- /dev/null
+++ b/tests_errors/undef-struct-ptrmath-2.ispc
@@ -0,0 +1,7 @@
+// Illegal to perform pointer arithmetic on undefined struct type
+
+struct Foo;
+
+Foo * uniform bar(Foo * uniform f) {
+    return 1 + f;
+}
diff --git a/tests_errors/undef-struct-ptrmath-3.ispc b/tests_errors/undef-struct-ptrmath-3.ispc
new file mode 100644
index 00000000..1fad2ac4
--- /dev/null
+++ b/tests_errors/undef-struct-ptrmath-3.ispc
@@ -0,0 +1,7 @@
+// Illegal to perform pointer arithmetic on undefined struct type
+
+struct Foo;
+
+Foo * uniform bar(Foo * uniform f) {
+    return f-1;
+}
diff --git a/tests_errors/undef-struct-ptrmath.ispc b/tests_errors/undef-struct-ptrmath.ispc
new file mode 100644
index 00000000..39b19a4e
--- /dev/null
+++ b/tests_errors/undef-struct-ptrmath.ispc
@@ -0,0 +1,7 @@
+// Illegal to pre/post increment pointer to undefined struct type
+
+struct Foo;
+
+Foo * uniform bar(Foo * uniform f) {
+    return ++f;
+}
diff --git a/tests_errors/undef-struct-sizeof.ispc b/tests_errors/undef-struct-sizeof.ispc
new file mode 100644
index 00000000..d2a2219a
--- /dev/null
+++ b/tests_errors/undef-struct-sizeof.ispc
@@ -0,0 +1,7 @@
+// Can't compute the size of declared but not defined struct type
+
+struct Foo;
+
+uniform int bar() {
+    return sizeof(Foo);
+}
diff --git a/type.cpp b/type.cpp
index 0fb8817e..ea61109d 100644
--- a/type.cpp
+++ b/type.cpp
@@ -42,6 +42,7 @@
 #include "module.h"
 
 #include <stdio.h>
+#include <map>
 #include <llvm/Value.h>
 #include <llvm/Module.h>
 #include <llvm/Analysis/DIBuilder.h>
@@ -59,7 +60,7 @@ static bool
 lShouldPrintName(const std::string &name) {
     if (name.size() == 0)
         return false;
-    else if (name[0] != '_')
+    else if (name[0] != '_' && name[0] != '$')
         return true;
     else
         return (name.size() == 1) || (name[1] != '_');
@@ -81,11 +82,7 @@ lCreateDIArray(llvm::DIType eltType, int count) {
     llvm::Value *sub = m->diBuilder->getOrCreateSubrange(lowerBound, upperBound);
     std::vector<llvm::Value *> subs;
     subs.push_back(sub);
-#ifdef LLVM_2_9
-    llvm::DIArray subArray = m->diBuilder->getOrCreateArray(&subs[0], subs.size());
-#else
     llvm::DIArray subArray = m->diBuilder->getOrCreateArray(subs);
-#endif
 
     uint64_t size = eltType.getSizeInBits() * count;
     uint64_t align = eltType.getAlignInBits();
@@ -187,7 +184,9 @@ const AtomicType *AtomicType::Void =
 
 
 AtomicType::AtomicType(BasicType bt, Variability v, bool ic) 
-    : basicType(bt), variability(v), isConst(ic) {
+    : Type(ATOMIC_TYPE), basicType(bt), variability(v), isConst(ic) {
+    asOtherConstType = NULL;
+    asUniformType = asVaryingType = NULL;
 }
 
 
@@ -257,19 +256,27 @@ AtomicType::GetAsUnsignedType() const {
 
 const AtomicType *
 AtomicType::GetAsConstType() const {
-    if (Type::Equal(this, AtomicType::Void) || isConst == true) 
+    if (basicType == TYPE_VOID || isConst == true) 
         return this;
     
-    return new AtomicType(basicType, variability, true);
+    if (asOtherConstType == NULL) {
+        asOtherConstType = new AtomicType(basicType, variability, true);
+        asOtherConstType->asOtherConstType = this;
+    }
+    return asOtherConstType;
 }
 
 
 const AtomicType *
 AtomicType::GetAsNonConstType() const {
-    if (Type::Equal(this, AtomicType::Void) || isConst == false) 
+    if (basicType == TYPE_VOID || isConst == false) 
         return this;
 
-    return new AtomicType(basicType, variability, false);
+    if (asOtherConstType == NULL) {
+        asOtherConstType = new AtomicType(basicType, variability, false);
+        asOtherConstType->asOtherConstType = this;
+    }
+    return asOtherConstType;
 }
 
 
@@ -281,25 +288,37 @@ AtomicType::GetBaseType() const {
 
 const AtomicType *
 AtomicType::GetAsVaryingType() const {
-    Assert(Type::Equal(this, AtomicType::Void) == false);
+    Assert(basicType != TYPE_VOID);
     if (variability == Variability::Varying)
         return this;
-    return new AtomicType(basicType, Variability::Varying, isConst);
+
+    if (asVaryingType == NULL) {
+        asVaryingType = new AtomicType(basicType, Variability::Varying, isConst);
+        if (variability == Variability::Uniform)
+            asVaryingType->asUniformType = this;
+    }
+    return asVaryingType;
 }
 
 
 const AtomicType *
 AtomicType::GetAsUniformType() const {
-    Assert(Type::Equal(this, AtomicType::Void) == false);
+    Assert(basicType != TYPE_VOID);
     if (variability == Variability::Uniform)
         return this;
-    return new AtomicType(basicType, Variability::Uniform, isConst);
+
+    if (asUniformType == NULL) {
+        asUniformType = new AtomicType(basicType, Variability::Uniform, isConst);
+        if (variability == Variability::Varying)
+            asUniformType->asVaryingType = this;
+    }
+    return asUniformType;
 }
 
 
 const AtomicType *
 AtomicType::GetAsUnboundVariabilityType() const {
-    Assert(Type::Equal(this, AtomicType::Void) == false);
+    Assert(basicType != TYPE_VOID);
     if (variability == Variability::Unbound)
         return this;
     return new AtomicType(basicType, Variability::Unbound, isConst);
@@ -308,7 +327,7 @@ AtomicType::GetAsUnboundVariabilityType() const {
 
 const AtomicType *
 AtomicType::GetAsSOAType(int width) const {
-    Assert(this != AtomicType::Void);
+    Assert(basicType != TYPE_VOID);
     if (variability == Variability(Variability::SOA, width))
         return this;
     return new AtomicType(basicType, Variability(Variability::SOA, width), isConst);
@@ -418,7 +437,7 @@ AtomicType::GetCDeclaration(const std::string &name) const {
 }
 
 
-LLVM_TYPE_CONST llvm::Type *
+llvm::Type *
 AtomicType::LLVMType(llvm::LLVMContext *ctx) const {
     Assert(variability.type != Variability::Unbound);
     bool isUniform = (variability == Variability::Uniform);
@@ -518,12 +537,7 @@ AtomicType::GetDIType(llvm::DIDescriptor scope) const {
     else if (variability == Variability::Varying) {
         llvm::DIType unifType = GetAsUniformType()->GetDIType(scope);
         llvm::Value *sub = m->diBuilder->getOrCreateSubrange(0, g->target.vectorWidth-1);
-#ifdef LLVM_2_9
-        llvm::Value *suba[] = { sub };
-        llvm::DIArray subArray = m->diBuilder->getOrCreateArray(suba, 1);
-#else
         llvm::DIArray subArray = m->diBuilder->getOrCreateArray(sub);
-#endif // LLVM_2_9
         uint64_t size =  unifType.getSizeInBits()  * g->target.vectorWidth;
         uint64_t align = unifType.getAlignInBits() * g->target.vectorWidth;
         return m->diBuilder->createVectorType(size, align, unifType, subArray);
@@ -540,7 +554,7 @@ AtomicType::GetDIType(llvm::DIDescriptor scope) const {
 // EnumType
 
 EnumType::EnumType(SourcePos p) 
-    : pos(p) {
+    : Type(ENUM_TYPE), pos(p) {
     //    name = "/* (anonymous) */";
     isConst = false;
     variability = Variability(Variability::Unbound);
@@ -548,7 +562,7 @@ EnumType::EnumType(SourcePos p)
 
 
 EnumType::EnumType(const char *n, SourcePos p) 
-    : pos(p), name(n) {
+    : Type(ENUM_TYPE), pos(p), name(n) {
     isConst = false;
     variability = Variability(Variability::Unbound);
 }
@@ -734,7 +748,7 @@ EnumType::GetCDeclaration(const std::string &varName) const {
 }
 
 
-LLVM_TYPE_CONST llvm::Type *
+llvm::Type *
 EnumType::LLVMType(llvm::LLVMContext *ctx) const {
     Assert(variability != Variability::Unbound);
 
@@ -767,21 +781,19 @@ EnumType::GetDIType(llvm::DIDescriptor scope) const {
             m->diBuilder->createEnumerator(enumerators[i]->name, enumeratorValue);
         enumeratorDescriptors.push_back(descriptor);
     }
-#ifdef LLVM_2_9
-    llvm::DIArray elementArray = 
-        m->diBuilder->getOrCreateArray(&enumeratorDescriptors[0],
-                                       enumeratorDescriptors.size());
-#else
     llvm::DIArray elementArray = 
         m->diBuilder->getOrCreateArray(enumeratorDescriptors);
-#endif
 
     llvm::DIFile diFile = pos.GetDIFile();
     llvm::DIType diType =
         m->diBuilder->createEnumerationType(scope, name, diFile, pos.first_line,
                                             32 /* size in bits */,
                                             32 /* align in bits */,
-                                            elementArray);
+                                            elementArray
+#if !defined(LLVM_3_0) && !defined(LLVM_3_1)
+                                            , llvm::DIType()
+#endif
+                                            );
 
 
     switch (variability.type) {
@@ -789,12 +801,7 @@ EnumType::GetDIType(llvm::DIDescriptor scope) const {
         return diType;
     case Variability::Varying: {
         llvm::Value *sub = m->diBuilder->getOrCreateSubrange(0, g->target.vectorWidth-1);
-#ifdef LLVM_2_9
-        llvm::Value *suba[] = { sub };
-        llvm::DIArray subArray = m->diBuilder->getOrCreateArray(suba, 1);
-#else
         llvm::DIArray subArray = m->diBuilder->getOrCreateArray(sub);
-#endif // !LLVM_2_9
         uint64_t size =  diType.getSizeInBits()  * g->target.vectorWidth;
         uint64_t align = diType.getAlignInBits() * g->target.vectorWidth;
         return m->diBuilder->createVectorType(size, align, diType, subArray);
@@ -836,7 +843,7 @@ PointerType *PointerType::Void =
 
 PointerType::PointerType(const Type *t, Variability v, bool ic, bool is, 
                          bool fr)
-    : variability(v), isConst(ic), isSlice(is), isFrozen(fr) {
+    : Type(POINTER_TYPE), variability(v), isConst(ic), isSlice(is), isFrozen(fr) {
     baseType = t;
 }
 
@@ -966,42 +973,6 @@ PointerType::GetAsFrozenSlice() const {
 }
 
 
-/** Returns a structure corresponding to the pointer representation for
-    slice pointers; the first member of this structure is a uniform or
-    varying pointer, and the second element is either a uniform or varying
-    int32.
- */
-const StructType *
-PointerType::GetSliceStructType() const {
-    Assert(isSlice == true);
-
-    std::vector<const Type *> eltTypes;
-    eltTypes.push_back(GetAsNonSlice());
-    switch (variability.type) {
-    case Variability::Uniform:
-        eltTypes.push_back(AtomicType::UniformInt32);
-        break;
-    case Variability::Varying:
-        eltTypes.push_back(AtomicType::VaryingInt32);
-        break;
-    default:
-        FATAL("Unexpected variability in PointerType::GetSliceStructType()");
-    }
-
-    std::vector<std::string> eltNames;
-    std::vector<SourcePos> eltPos;
-
-    eltNames.push_back("ptr");
-    eltNames.push_back("offset");
-
-    eltPos.push_back(SourcePos());
-    eltPos.push_back(SourcePos());
-
-    return new StructType("__ptr_slice_tmp", eltTypes, eltNames, eltPos, isConst,
-                          Variability::Uniform, SourcePos());
-}
-
-
 const PointerType *
 PointerType::ResolveUnboundVariability(Variability v) const {
     if (baseType == NULL) {
@@ -1103,23 +1074,42 @@ PointerType::GetCDeclaration(const std::string &name) const {
 }
 
 
-LLVM_TYPE_CONST llvm::Type *
+llvm::Type *
 PointerType::LLVMType(llvm::LLVMContext *ctx) const {
     if (baseType == NULL) {
         Assert(m->errorCount > 0);
         return NULL;
     }
 
-    if (isSlice)
-        // Slice pointers are represented as a structure with a pointer and
-        // an integer offset; the corresponding ispc type is returned by
-        // GetSliceStructType().
-        return GetSliceStructType()->LLVMType(ctx);
+    if (isSlice) {
+        llvm::Type *types[2];
+        types[0] = GetAsNonSlice()->LLVMType(ctx);
+        
+        switch (variability.type) {
+        case Variability::Uniform:
+            types[1] = LLVMTypes::Int32Type;
+            break;
+        case Variability::Varying:
+            types[1] = LLVMTypes::Int32VectorType;
+            break;
+        case Variability::SOA:
+            types[1] = llvm::ArrayType::get(LLVMTypes::Int32Type,
+                                            variability.soaWidth);
+            break;
+        default:
+            FATAL("unexpected variability for slice pointer in "
+                  "PointerType::LLVMType");
+        }
+
+        llvm::ArrayRef<llvm::Type *> typesArrayRef =
+            llvm::ArrayRef<llvm::Type *>(types, 2);
+        return llvm::StructType::get(*g->ctx, typesArrayRef);
+    }
 
     switch (variability.type) {
     case Variability::Uniform: {
-        LLVM_TYPE_CONST llvm::Type *ptype = NULL;
-        const FunctionType *ftype = dynamic_cast<const FunctionType *>(baseType);
+        llvm::Type *ptype = NULL;
+        const FunctionType *ftype = CastType<FunctionType>(baseType);
         if (ftype != NULL) 
             // Get the type of the function variant that takes the mask as the
             // last parameter--i.e. we don't allow taking function pointers of
@@ -1157,13 +1147,15 @@ PointerType::GetDIType(llvm::DIDescriptor scope) const {
 
     llvm::DIType diTargetType = baseType->GetDIType(scope);
     int bitsSize = g->target.is32Bit ? 32 : 64;
+    int ptrAlignBits = bitsSize;
     switch (variability.type) {
     case Variability::Uniform:
-        return m->diBuilder->createPointerType(diTargetType, bitsSize);
+        return m->diBuilder->createPointerType(diTargetType, bitsSize, 
+                                               ptrAlignBits);
     case Variability::Varying: {
         // emit them as an array of pointers
         llvm::DIType eltType = m->diBuilder->createPointerType(diTargetType, 
-                                                               bitsSize);
+                                                               bitsSize, ptrAlignBits);
         return lCreateDIArray(eltType, g->target.vectorWidth);
     }
     case Variability::SOA: {
@@ -1189,21 +1181,21 @@ const Type *SequentialType::GetElementType(int index) const {
 // ArrayType
 
 ArrayType::ArrayType(const Type *c, int a) 
-    : child(c), numElements(a) {
+    : SequentialType(ARRAY_TYPE), child(c), numElements(a) {
     // 0 -> unsized array.
     Assert(numElements >= 0);
     Assert(Type::Equal(c, AtomicType::Void) == false);
 }
 
 
-LLVM_TYPE_CONST llvm::ArrayType *
+llvm::ArrayType *
 ArrayType::LLVMType(llvm::LLVMContext *ctx) const {
     if (child == NULL) {
         Assert(m->errorCount > 0);
         return NULL;
     }
 
-    LLVM_TYPE_CONST llvm::Type *ct = child->LLVMType(ctx);
+    llvm::Type *ct = child->LLVMType(ctx);
     if (ct == NULL) {
         Assert(m->errorCount > 0);
         return NULL;
@@ -1251,11 +1243,11 @@ ArrayType::IsConstType() const {
 const Type *
 ArrayType::GetBaseType() const {
     const Type *type = child;
-    const ArrayType *at = dynamic_cast<const ArrayType *>(type);
+    const ArrayType *at = CastType<ArrayType>(type);
     // Keep walking until we reach a child that isn't itself an array
     while (at) {
         type = at->child;
-        at = dynamic_cast<const ArrayType *>(type);
+        at = CastType<ArrayType>(type);
     }
     return type;
 }
@@ -1372,7 +1364,7 @@ ArrayType::GetString() const {
         else
             buf[0] = '\0';
         s += std::string("[") + std::string(buf) + std::string("]");
-        at = dynamic_cast<const ArrayType *>(at->child);
+        at = CastType<ArrayType>(at->child);
     }
     return s;
 }
@@ -1415,7 +1407,7 @@ ArrayType::GetCDeclaration(const std::string &name) const {
         else
             buf[0] = '\0';
         s += std::string("[") + std::string(buf) + std::string("]");
-        at = dynamic_cast<const ArrayType *>(at->child);
+        at = CastType<ArrayType>(at->child);
     }
 
     if (soaWidth > 0) {
@@ -1430,7 +1422,7 @@ ArrayType::GetCDeclaration(const std::string &name) const {
 
 int
 ArrayType::TotalElementCount() const {
-    const ArrayType *ct = dynamic_cast<const ArrayType *>(child);
+    const ArrayType *ct = CastType<ArrayType>(child);
     if (ct != NULL)
         return numElements * ct->TotalElementCount();
     else
@@ -1459,7 +1451,7 @@ ArrayType::GetSizedArray(int sz) const {
 
 const Type *
 ArrayType::SizeUnsizedArrays(const Type *type, Expr *initExpr) {
-    const ArrayType *at = dynamic_cast<const ArrayType *>(type);
+    const ArrayType *at = CastType<ArrayType>(type);
     if (at == NULL)
         return type;
 
@@ -1471,7 +1463,7 @@ ArrayType::SizeUnsizedArrays(const Type *type, Expr *initExpr) {
     // length of the expression list
     if (at->GetElementCount() == 0) {
         type = at->GetSizedArray(exprList->exprs.size());
-        at = dynamic_cast<const ArrayType *>(type);
+        at = CastType<ArrayType>(type);
     }
 
     // Is there another nested level of expression lists?  If not, bail out
@@ -1483,7 +1475,7 @@ ArrayType::SizeUnsizedArrays(const Type *type, Expr *initExpr) {
         return type;
 
     const Type *nextType = at->GetElementType();
-    const ArrayType *nextArrayType = dynamic_cast<const ArrayType *>(nextType);
+    const ArrayType *nextArrayType = CastType<ArrayType>(nextType);
     if (nextArrayType != NULL && nextArrayType->GetElementCount() == 0) {
         // If the recursive call to SizeUnsizedArrays at the bottom of the
         // function is going to size an unsized dimension, make sure that
@@ -1519,7 +1511,7 @@ ArrayType::SizeUnsizedArrays(const Type *type, Expr *initExpr) {
 // VectorType
 
 VectorType::VectorType(const AtomicType *b, int a) 
-    : base(b), numElements(a) {
+    : SequentialType(VECTOR_TYPE), base(b), numElements(a) {
     Assert(numElements > 0);
     Assert(base != NULL);
 }
@@ -1648,14 +1640,14 @@ VectorType::GetElementType() const {
 }
 
 
-LLVM_TYPE_CONST llvm::Type *
+llvm::Type *
 VectorType::LLVMType(llvm::LLVMContext *ctx) const {
     if (base == NULL) {
         Assert(m->errorCount > 0);
         return NULL;
     }
 
-    LLVM_TYPE_CONST llvm::Type *bt = base->LLVMType(ctx);
+    llvm::Type *bt = base->LLVMType(ctx);
     if (!bt)
         return NULL;
 
@@ -1684,12 +1676,7 @@ llvm::DIType
 VectorType::GetDIType(llvm::DIDescriptor scope) const {
     llvm::DIType eltType = base->GetDIType(scope);
     llvm::Value *sub = m->diBuilder->getOrCreateSubrange(0, numElements-1);
-#ifdef LLVM_2_9
-    llvm::Value *subs[1] = { sub };
-    llvm::DIArray subArray = m->diBuilder->getOrCreateArray(subs, 1);
-#else
     llvm::DIArray subArray = m->diBuilder->getOrCreateArray(sub);
-#endif
 
     uint64_t sizeBits = eltType.getSizeInBits() * numElements;
 
@@ -1744,12 +1731,106 @@ VectorType::getVectorMemoryCount() const {
 ///////////////////////////////////////////////////////////////////////////
 // StructType
 
-StructType::StructType(const std::string &n, const std::vector<const Type *> &elts, 
-                       const std::vector<std::string> &en,
-                       const std::vector<SourcePos> &ep,
+// We maintain a map from struct names to LLVM struct types so that we can
+// uniquely get the llvm::StructType * for a given ispc struct type.  Note
+// that we need to mangle the name a bit so that we can e.g. differentiate
+// between the uniform and varying variants of a given struct type.  This
+// is handled by lMangleStructName() below.
+static std::map<std::string, llvm::StructType *> lStructTypeMap;
+
+/** Builds the key under which a struct's LLVM type is stored in
+    lStructTypeMap: "v<vector width>", then "_uniform_", "_varying_" or
+    "_soa<width>_" according to the given variability, then the struct's
+    name.  The target's vector width is part of the key because 'varying'
+    structs compiled for different targets have different memory layouts.
+    Any other variability (e.g. unbound) is reported through FATAL().
+ */
+static std::string
+lMangleStructName(const std::string &name, Variability variability) {
+    char buf[32];
+    std::string n;
+
+    // Encode vector width
+    sprintf(buf, "v%d", g->target.vectorWidth);
+    n += buf;
+
+    // Variability
+    switch (variability.type) {
+    case Variability::Uniform:
+        n += "_uniform_";
+        break;
+    case Variability::Varying:
+        n += "_varying_";
+        break;
+    case Variability::SOA:
+        sprintf(buf, "_soa%d_", variability.soaWidth);
+        n += buf;
+        break;
+    default:
+        FATAL("Unexpected variability in lMangleStructName()");
+    }
+
+    // And stuff the name at the end....
+    n += name;
+    return n;
+}
+
+        
+StructType::StructType(const std::string &n, const llvm::SmallVector<const Type *, 8> &elts,
+                       const llvm::SmallVector<std::string, 8> &en,
+                       const llvm::SmallVector<SourcePos, 8> &ep,
                        bool ic, Variability v, SourcePos p) 
-    : name(n), elementTypes(elts), elementNames(en), elementPositions(ep),
-      variability(v), isConst(ic), pos(p) {
+    : CollectionType(STRUCT_TYPE), name(n), elementTypes(elts), elementNames(en),
+      elementPositions(ep), variability(v), isConst(ic), pos(p) {
+    oppositeConstStructType = NULL;
+    finalElementTypes.resize(elts.size(), NULL);
+
+    if (variability != Variability::Unbound) {
+        // For structs with non-unbound variability, we'll create the
+        // corresponding LLVM struct type now, if one hasn't been made
+        // already.
+
+        // Create a unique anonymous struct name if we have an anonymous
+        // struct (name == ""), or if we are creating a derived type from
+        // an anonymous struct (e.g. the varying variant--name == '$').
+        if (name == "" || name[0] == '$') {
+            char buf[32];
+            static int count = 0;
+            sprintf(buf, "$anon%d", count);
+            name = buf;
+            ++count;
+        }
+
+        // If a non-opaque LLVM struct for this type has already been
+        // created, we're done.  For an opaque struct type, we'll override
+        // the old definition now that we have a full definition.
+        std::string mname = lMangleStructName(name, variability);
+        std::map<std::string, llvm::StructType *>::iterator existing =
+            lStructTypeMap.find(mname);
+        if (existing != lStructTypeMap.end() &&
+            existing->second->isOpaque() == false)
+            return;
+
+        // Actually make the LLVM struct
+        std::vector<llvm::Type *> llvmElementTypes;
+        for (int i = 0; i < GetElementCount(); ++i) {
+            const Type *type = GetElementType(i);
+            if (type == NULL) {
+                Assert(m->errorCount > 0);
+                return;
+            }
+            llvmElementTypes.push_back(type->LLVMType(g->ctx));
+        }
+        // Look the name up again in case element resolution registered it.
+        existing = lStructTypeMap.find(mname);
+        if (existing == lStructTypeMap.end())
+            // New struct definition
+            lStructTypeMap[mname] =
+                llvm::StructType::create(*g->ctx, llvmElementTypes, mname);
+        else
+            // Definition for what was before just a declaration
+            existing->second->setBody(llvmElementTypes);
+    }
 }
 
 
@@ -1856,52 +1937,67 @@ StructType::ResolveUnboundVariability(Variability v) const {
 
 const StructType *
 StructType::GetAsConstType() const {
-    if (IsConstType()) 
+    if (isConst == true)
         return this;
-    else
-        return new StructType(name, elementTypes, elementNames, elementPositions,
-                              true, variability, pos);
+    else if (oppositeConstStructType != NULL)
+        return oppositeConstStructType;
+    else {
+        oppositeConstStructType = 
+            new StructType(name, elementTypes, elementNames, elementPositions,
+                           true, variability, pos);
+        oppositeConstStructType->oppositeConstStructType = this;
+        return oppositeConstStructType;
+    }
 }
 
 
 const StructType *
 StructType::GetAsNonConstType() const {
-    if (!IsConstType()) 
+    if (isConst == false)
         return this;
-    else
-        return new StructType(name, elementTypes, elementNames, elementPositions,
-                              false, variability, pos);
+    else if (oppositeConstStructType != NULL)
+        return oppositeConstStructType;
+    else {
+        oppositeConstStructType = 
+            new StructType(name, elementTypes, elementNames, elementPositions,
+                           false, variability, pos);
+        oppositeConstStructType->oppositeConstStructType = this;
+        return oppositeConstStructType;
+    }
 }
 
 
 std::string
 StructType::GetString() const {
     std::string ret;
-    if (isConst)   ret += "const ";
+    if (isConst)
+        ret += "const ";
     ret += variability.GetString();
     ret += " ";
 
-    // Don't print the entire struct declaration, just print the struct's name.
-    // @todo Do we need a separate method that prints the declaration?
-#if 0
-    ret += std::string("struct { ") + name;
-    for (unsigned int i = 0; i < elementTypes.size(); ++i) {
-        ret += elementTypes[i]->GetString();
-        ret += " ";
-        ret += elementNames[i];
-        ret += "; ";
+    if (name[0] == '$') {
+        // Print the whole anonymous struct declaration
+        ret += std::string("struct { ") + name;
+        for (unsigned int i = 0; i < elementTypes.size(); ++i) {
+            ret += elementTypes[i]->GetString();
+            ret += " ";
+            ret += elementNames[i];
+            ret += "; ";
+        }
+        ret += "}";
     }
-    ret += "}";
-#else
-    ret += "struct ";
-    ret += name;
-#endif
+    else {
+        ret += "struct ";
+        ret += name;
+    }
+
     return ret;
 }
 
 
-std::string
-StructType::Mangle() const {
+/** Mangle a struct name for use in function name mangling. */
+static std::string
+lMangleStruct(Variability variability, bool isConst, const std::string &name) {
     Assert(variability != Variability::Unbound);
 
     std::string ret;
@@ -1910,12 +2006,15 @@ StructType::Mangle() const {
         ret += "_c_";
     ret += variability.MangleString();
 
-    ret += name + std::string("]<");
-    for (unsigned int i = 0; i < elementTypes.size(); ++i)
-        ret += GetElementType(i)->Mangle();
-    ret += ">";
+    ret += name + std::string("]");
     return ret;
 }
+
+/** Returns the string lMangleStruct() builds from this struct's variability, constness and name. */
+std::string
+StructType::Mangle() const {
+    return lMangleStruct(variability, isConst, name);
+}
     
 
 std::string
@@ -1923,31 +2022,31 @@ StructType::GetCDeclaration(const std::string &n) const {
     std::string ret;
     if (isConst) ret += "const ";
     ret += std::string("struct ") + name;
-    if (lShouldPrintName(n))
+    if (lShouldPrintName(n)) {
         ret += std::string(" ") + n;
 
-    if (variability.soaWidth > 0) {
-        char buf[32];
-        // This has to match the naming scheme used in lEmitStructDecls()
-        // in module.cpp
-        sprintf(buf, "_SOA%d", variability.soaWidth);
-        ret += buf;
+        if (variability.soaWidth > 0) {
+            char buf[32];
+            // This has to match the naming scheme used in lEmitStructDecls()
+            // in module.cpp
+            sprintf(buf, "_SOA%d", variability.soaWidth);
+            ret += buf;
+        }
     }
 
     return ret;
 }
 
 
-LLVM_TYPE_CONST llvm::Type *
+llvm::Type *
 StructType::LLVMType(llvm::LLVMContext *ctx) const {
-    std::vector<LLVM_TYPE_CONST llvm::Type *> llvmTypes;
-    for (int i = 0; i < GetElementCount(); ++i) {
-        const Type *type = GetElementType(i);
-        if (type == NULL)
-            return NULL;
-        llvmTypes.push_back(type->LLVMType(ctx));
+    Assert(variability != Variability::Unbound);
+    std::string mname = lMangleStructName(name, variability);
+    if (lStructTypeMap.find(mname) == lStructTypeMap.end()) {
+        Assert(m->errorCount > 0);
+        return NULL;
     }
-    return llvm::StructType::get(*ctx, llvmTypes);
+    return lStructTypeMap[mname];
 }
 
 
@@ -1963,6 +2062,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const {
         llvm::DIType eltType = GetElementType(i)->GetDIType(scope);
         uint64_t eltAlign = eltType.getAlignInBits();
         uint64_t eltSize = eltType.getSizeInBits();
+        Assert(eltAlign != 0);
 
         // The alignment for the entire structure is the maximum of the
         // required alignments of its elements
@@ -1976,17 +2076,10 @@ StructType::GetDIType(llvm::DIDescriptor scope) const {
 
         llvm::DIFile diFile = elementPositions[i].GetDIFile();
         int line = elementPositions[i].first_line;
-#ifdef LLVM_2_9
-        llvm::DIType fieldType = 
-            m->diBuilder->createMemberType(elementNames[i], diFile, line,
-                                           eltSize, eltAlign, currentSize, 0,
-                                           eltType);
-#else
         llvm::DIType fieldType = 
             m->diBuilder->createMemberType(scope, elementNames[i], diFile, 
                                            line, eltSize, eltAlign, 
                                            currentSize, 0, eltType);
-#endif // LLVM_2_9
         elementLLVMTypes.push_back(fieldType);
 
         currentSize += eltSize;
@@ -1997,12 +2090,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const {
     if (currentSize > 0 && (currentSize % align))
         currentSize += align - (currentSize % align);
 
-#ifdef LLVM_2_9
-    llvm::DIArray elements = m->diBuilder->getOrCreateArray(&elementLLVMTypes[0], 
-                                                            elementLLVMTypes.size());
-#else
     llvm::DIArray elements = m->diBuilder->getOrCreateArray(elementLLVMTypes);
-#endif
     llvm::DIFile diFile = pos.GetDIFile();
     return m->diBuilder->createStructType(scope, name, diFile, pos.first_line, currentSize, 
                                           align, 0, elements);
@@ -2013,13 +2101,23 @@ const Type *
 StructType::GetElementType(int i) const {
     Assert(variability != Variability::Unbound);
     Assert(i < (int)elementTypes.size());
-    const Type *ret = elementTypes[i];
 
-    // If the element has unbound variability, resolve its variability to
-    // the struct type's variability
-    ret = ret->ResolveUnboundVariability(variability);
+    if (finalElementTypes[i] == NULL) {
+        const Type *type = elementTypes[i];
+        if (type == NULL) {
+            Assert(m->errorCount > 0);
+            return NULL;
+        }
 
-    return isConst ? ret->GetAsConstType() : ret;
+        // If the element has unbound variability, resolve its variability to
+        // the struct type's variability
+        type = type ->ResolveUnboundVariability(variability);
+        if (isConst)
+            type = type->GetAsConstType();
+        finalElementTypes[i] = type;
+    }
+
+    return finalElementTypes[i];
 }
 
 
@@ -2046,8 +2144,7 @@ StructType::checkIfCanBeSOA(const StructType *st) {
     bool ok = true;
     for (int i = 0; i < (int)st->elementTypes.size(); ++i) {
         const Type *eltType = st->elementTypes[i];
-        const StructType *childStructType = 
-            dynamic_cast<const StructType *>(eltType);
+        const StructType *childStructType = CastType<StructType>(eltType);
 
         if (childStructType != NULL)
             ok &= checkIfCanBeSOA(childStructType);
@@ -2059,7 +2156,7 @@ StructType::checkIfCanBeSOA(const StructType *st) {
                   eltType->IsUniformType() ? "uniform" : "varying");
             ok = false;
         }
-        else if (dynamic_cast<const ReferenceType *>(eltType)) {
+        else if (CastType<ReferenceType>(eltType)) {
             Error(st->elementPositions[i], "Unable to apply SOA conversion to "
                   "struct due to member \"%s\" with reference type \"%s\".",
                   st->elementNames[i].c_str(), eltType->GetString().c_str());
@@ -2070,11 +2167,176 @@ StructType::checkIfCanBeSOA(const StructType *st) {
 }
 
 
+///////////////////////////////////////////////////////////////////////////
+// UndefinedStructType
+
+UndefinedStructType::UndefinedStructType(const std::string &n, 
+                                         const Variability var, bool ic,
+                                         SourcePos p) 
+    : Type(UNDEFINED_STRUCT_TYPE), name(n), variability(var), isConst(ic), pos(p) {
+    Assert(name != "");
+    if (variability != Variability::Unbound) {
+        // Create a new opaque LLVM struct type for this struct name
+        std::string mname = lMangleStructName(name, variability);
+        if (lStructTypeMap.find(mname) == lStructTypeMap.end())
+            lStructTypeMap[mname] = llvm::StructType::create(*g->ctx, mname);
+    }
+}
+
+
+Variability
+UndefinedStructType::GetVariability() const {
+    return variability;
+}
+
+
+bool
+UndefinedStructType::IsBoolType() const {
+    return false;
+}
+
+
+bool
+UndefinedStructType::IsFloatType() const {
+    return false;
+}
+
+
+bool
+UndefinedStructType::IsIntType() const {
+    return false;
+}
+
+
+bool
+UndefinedStructType::IsUnsignedType() const {
+    return false;
+}
+
+
+bool
+UndefinedStructType::IsConstType() const {
+    return isConst;
+}
+
+
+const Type *
+UndefinedStructType::GetBaseType() const {
+    return this;
+}
+
+
+const UndefinedStructType *
+UndefinedStructType::GetAsVaryingType() const {
+    if (variability == Variability::Varying)
+        return this;
+    return new UndefinedStructType(name, Variability::Varying, isConst, pos);
+}
+
+
+const UndefinedStructType *
+UndefinedStructType::GetAsUniformType() const {
+    if (variability == Variability::Uniform)
+        return this;
+    return new UndefinedStructType(name, Variability::Uniform, isConst, pos);
+}
+
+
+const UndefinedStructType *
+UndefinedStructType::GetAsUnboundVariabilityType() const {
+    if (variability == Variability::Unbound)
+        return this;
+    return new UndefinedStructType(name, Variability::Unbound, isConst, pos);
+}
+
+
+const UndefinedStructType *
+UndefinedStructType::GetAsSOAType(int width) const {
+    FATAL("UndefinedStructType::GetAsSOAType() shouldn't be called.");
+    return NULL;
+}
+
+
+const UndefinedStructType *
+UndefinedStructType::ResolveUnboundVariability(Variability v) const {
+    if (variability != Variability::Unbound)
+        return this;
+    return new UndefinedStructType(name, v, isConst, pos);
+}
+
+
+const UndefinedStructType *
+UndefinedStructType::GetAsConstType() const {
+    if (isConst)
+        return this;
+    return new UndefinedStructType(name, variability, true, pos);
+}
+
+
+const UndefinedStructType *
+UndefinedStructType::GetAsNonConstType() const {
+    if (isConst == false)
+        return this;
+    return new UndefinedStructType(name, variability, false, pos);
+}
+
+
+std::string
+UndefinedStructType::GetString() const {
+    std::string ret;
+    if (isConst)   ret += "const ";
+    ret += variability.GetString();
+    ret += " struct ";
+    ret += name;
+    return ret;
+}
+
+
+std::string
+UndefinedStructType::Mangle() const {
+    return lMangleStruct(variability, isConst, name);
+}
+
+
+std::string
+UndefinedStructType::GetCDeclaration(const std::string &n) const {
+    std::string ret;
+    if (isConst) ret += "const ";
+    ret += std::string("struct ") + name;
+    if (lShouldPrintName(n))
+        ret += std::string(" ") + n;
+    return ret;
+}
+
+
+llvm::Type *
+UndefinedStructType::LLVMType(llvm::LLVMContext *ctx) const {
+    Assert(variability != Variability::Unbound);
+    std::string mname = lMangleStructName(name, variability);
+    if (lStructTypeMap.find(mname) == lStructTypeMap.end()) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
+    return lStructTypeMap[mname];
+}
+
+
+llvm::DIType
+UndefinedStructType::GetDIType(llvm::DIDescriptor scope) const {
+    llvm::DIFile diFile = pos.GetDIFile();
+    llvm::DIArray elements;
+    return m->diBuilder->createStructType(scope, name, diFile, pos.first_line, 
+                                          0 /* size */, 0 /* align */, 
+                                          0 /* flags */, elements); 
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // ReferenceType
 
 ReferenceType::ReferenceType(const Type *t) 
-    : targetType(t) {
+    : Type(REFERENCE_TYPE), targetType(t) {
+    asOtherConstType = NULL;
 }
 
 
@@ -2215,7 +2477,12 @@ ReferenceType::GetAsConstType() const {
     }
     if (IsConstType())
         return this;
-    return new ReferenceType(targetType->GetAsConstType());
+
+    if (asOtherConstType == NULL) {
+        asOtherConstType = new ReferenceType(targetType->GetAsConstType());
+        asOtherConstType->asOtherConstType = this;
+    }
+    return asOtherConstType;
 }
 
 
@@ -2227,7 +2494,12 @@ ReferenceType::GetAsNonConstType() const {
     }
     if (!IsConstType())
         return this;
-    return new ReferenceType(targetType->GetAsNonConstType());
+
+    if (asOtherConstType == NULL) {
+        asOtherConstType = new ReferenceType(targetType->GetAsNonConstType());
+        asOtherConstType->asOtherConstType = this;
+    }
+    return asOtherConstType;
 }
 
 
@@ -2264,7 +2536,7 @@ ReferenceType::GetCDeclaration(const std::string &name) const {
         return "";
     }
 
-    const ArrayType *at = dynamic_cast<const ArrayType *>(targetType);
+    const ArrayType *at = CastType<ArrayType>(targetType);
     if (at != NULL) {
         if (at->GetElementCount() == 0) {
             // emit unsized arrays as pointers to the base type..
@@ -2282,7 +2554,7 @@ ReferenceType::GetCDeclaration(const std::string &name) const {
     }
     else {
         std::string ret;
-        ret += targetType->GetCDeclaration("") + std::string(" *");
+        ret += targetType->GetCDeclaration("") + std::string(" &");
         if (lShouldPrintName(name))
             ret += name;
         return ret;
@@ -2290,14 +2562,14 @@ ReferenceType::GetCDeclaration(const std::string &name) const {
 }
 
 
-LLVM_TYPE_CONST llvm::Type *
+llvm::Type *
 ReferenceType::LLVMType(llvm::LLVMContext *ctx) const {
     if (targetType == NULL) {
         Assert(m->errorCount > 0);
         return NULL;
     }
 
-    LLVM_TYPE_CONST llvm::Type *t = targetType->LLVMType(ctx);
+    llvm::Type *t = targetType->LLVMType(ctx);
     if (t == NULL) {
         Assert(m->errorCount > 0);
         return NULL;
@@ -2315,32 +2587,37 @@ ReferenceType::GetDIType(llvm::DIDescriptor scope) const {
     }
 
     llvm::DIType diTargetType = targetType->GetDIType(scope);
+#if defined(LLVM_3_0) || defined(LLVM_3_1)
     return m->diBuilder->createReferenceType(diTargetType);
+#else
+    return m->diBuilder->createReferenceType(llvm::dwarf::DW_TAG_reference_type,
+                                             diTargetType);
+#endif
 }
 
 
 ///////////////////////////////////////////////////////////////////////////
 // FunctionType
 
-FunctionType::FunctionType(const Type *r, const std::vector<const Type *> &a, 
+FunctionType::FunctionType(const Type *r, const llvm::SmallVector<const Type *, 8> &a, 
                            SourcePos p)
-    : isTask(false), isExported(false), isExternC(false), returnType(r), 
-      paramTypes(a), paramNames(std::vector<std::string>(a.size(), "")),
-      paramDefaults(std::vector<ConstExpr *>(a.size(), NULL)),
-      paramPositions(std::vector<SourcePos>(a.size(), p)) {
+    : Type(FUNCTION_TYPE), isTask(false), isExported(false), isExternC(false), 
+      returnType(r), paramTypes(a), paramNames(llvm::SmallVector<std::string, 8>(a.size(), "")),
+      paramDefaults(llvm::SmallVector<Expr *, 8>(a.size(), NULL)),
+      paramPositions(llvm::SmallVector<SourcePos, 8>(a.size(), p)) {
     Assert(returnType != NULL);
     isSafe = false;
     costOverride = -1;
 }
 
 
-FunctionType::FunctionType(const Type *r, const std::vector<const Type *> &a, 
-                           const std::vector<std::string> &an, 
-                           const std::vector<ConstExpr *> &ad,
-                           const std::vector<SourcePos> &ap,
+FunctionType::FunctionType(const Type *r, const llvm::SmallVector<const Type *, 8> &a, 
+                           const llvm::SmallVector<std::string, 8> &an, 
+                           const llvm::SmallVector<Expr *, 8> &ad,
+                           const llvm::SmallVector<SourcePos, 8> &ap,
                            bool it, bool is, bool ec) 
-    : isTask(it), isExported(is), isExternC(ec), returnType(r), paramTypes(a), 
-      paramNames(an), paramDefaults(ad), paramPositions(ap) {
+    : Type(FUNCTION_TYPE), isTask(it), isExported(is), isExternC(ec), returnType(r), 
+      paramTypes(a), paramNames(an), paramDefaults(ad), paramPositions(ap) {
     Assert(paramTypes.size() == paramNames.size() && 
            paramNames.size() == paramDefaults.size() &&
            paramDefaults.size() == paramPositions.size());
@@ -2429,7 +2706,7 @@ FunctionType::ResolveUnboundVariability(Variability v) const {
     }
     const Type *rt = returnType->ResolveUnboundVariability(v);
 
-    std::vector<const Type *> pt;
+    llvm::SmallVector<const Type *, 8> pt;
     for (unsigned int i = 0; i < paramTypes.size(); ++i) {
         if (paramTypes[i] == NULL) {
             Assert(m->errorCount > 0);
@@ -2450,32 +2727,19 @@ FunctionType::ResolveUnboundVariability(Variability v) const {
 
 const Type *
 FunctionType::GetAsConstType() const {
-    FATAL("FunctionType::GetAsConstType shouldn't be called");
-    return NULL;
+    return this;
 }
 
 
 const Type *
 FunctionType::GetAsNonConstType() const {
-    FATAL("FunctionType::GetAsNonConstType shouldn't be called");
-    return NULL;
+    return this;
 }
 
 
 std::string
 FunctionType::GetString() const {
-    std::string ret;
-    if (isTask) ret += "task ";
-    if (isSafe) ret += "/*safe*/ ";
-    if (costOverride > 0) {
-        char buf[32];
-        sprintf(buf, "/*cost=%d*/ ", costOverride);
-        ret += buf;
-    }
-    if (returnType != NULL)
-        ret += returnType->GetString();
-    else
-        ret += "/* ERROR */";
+    std::string ret = GetReturnTypeString();
     ret += "(";
     for (unsigned int i = 0; i < paramTypes.size(); ++i) {
         if (paramTypes[i] == NULL)
@@ -2517,9 +2781,9 @@ FunctionType::GetCDeclaration(const std::string &fname) const {
         // Convert pointers to arrays to unsized arrays, which are more clear
         // to print out for multidimensional arrays (i.e. "float foo[][4] "
         // versus "float (foo *)[4]").
-        const PointerType *pt = dynamic_cast<const PointerType *>(type);
+        const PointerType *pt = CastType<PointerType>(type);
         if (pt != NULL && 
-            dynamic_cast<const ArrayType *>(pt->GetBaseType()) != NULL) {
+            CastType<ArrayType>(pt->GetBaseType()) != NULL) {
             type = new ArrayType(pt->GetBaseType(), 0);
         }
 
@@ -2535,7 +2799,7 @@ FunctionType::GetCDeclaration(const std::string &fname) const {
 }
 
 
-LLVM_TYPE_CONST llvm::Type *
+llvm::Type *
 FunctionType::LLVMType(llvm::LLVMContext *ctx) const {
     FATAL("FunctionType::LLVMType() shouldn't be called");
     return NULL;
@@ -2544,19 +2808,55 @@ FunctionType::LLVMType(llvm::LLVMContext *ctx) const {
 
 llvm::DIType
 FunctionType::GetDIType(llvm::DIDescriptor scope) const {
-    // @todo need to implement FunctionType::GetDIType()
-    FATAL("need to implement FunctionType::GetDIType()");
-    return llvm::DIType();
+    std::vector<llvm::Value *> retArgTypes;
+
+    retArgTypes.push_back(returnType->GetDIType(scope));
+    for (int i = 0; i < GetNumParameters(); ++i) {
+        const Type *t = GetParameterType(i);
+        if (t == NULL)
+            return llvm::DIType();
+        retArgTypes.push_back(t->GetDIType(scope));
+    }
+
+    llvm::DIArray retArgTypesArray = 
+        m->diBuilder->getOrCreateArray(llvm::ArrayRef<llvm::Value *>(retArgTypes));
+    llvm::DIType diType = 
+        // FIXME: DIFile 
+        m->diBuilder->createSubroutineType(llvm::DIFile(), retArgTypesArray);
+    return diType;
 }
 
 
-LLVM_TYPE_CONST llvm::FunctionType *
+const std::string
+FunctionType::GetReturnTypeString() const {
+    if (returnType == NULL)
+        return "/* ERROR */";
+
+    std::string ret;
+    if (isTask)
+        ret += "task ";
+    if (isExported)
+        ret += "export ";
+    if (isExternC)
+        ret += "extern \"C\" ";
+    if (isSafe) 
+        ret += "/*safe*/ ";
+    if (costOverride > 0) {
+        char buf[32];
+        sprintf(buf, "/*cost=%d*/ ", costOverride);
+        ret += buf;
+    }
+    return ret + returnType->GetString();
+}
+
+
+llvm::FunctionType *
 FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool includeMask) const {
     if (isTask == true) 
         Assert(includeMask == true);
 
     // Get the LLVM Type *s for the function arguments
-    std::vector<LLVM_TYPE_CONST llvm::Type *> llvmArgTypes;
+    std::vector<llvm::Type *> llvmArgTypes;
     for (unsigned int i = 0; i < paramTypes.size(); ++i) {
         if (paramTypes[i] == NULL) {
             Assert(m->errorCount > 0);
@@ -2564,7 +2864,7 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool includeMask) const {
         }
         Assert(Type::Equal(paramTypes[i], AtomicType::Void) == false);
 
-        LLVM_TYPE_CONST llvm::Type *t = paramTypes[i]->LLVMType(ctx);
+        llvm::Type *t = paramTypes[i]->LLVMType(ctx);
         if (t == NULL) {
             Assert(m->errorCount > 0);
             return NULL;
@@ -2576,7 +2876,7 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool includeMask) const {
     if (includeMask)
         llvmArgTypes.push_back(LLVMTypes::MaskType);
 
-    std::vector<LLVM_TYPE_CONST llvm::Type *> callTypes;
+    std::vector<llvm::Type *> callTypes;
     if (isTask) {
         // Tasks take three arguments: a pointer to a struct that holds the
         // actual task arguments, the thread index, and the total number of
@@ -2610,7 +2910,7 @@ FunctionType::GetParameterType(int i) const {
 }
 
 
-ConstExpr *
+Expr *
 FunctionType::GetParameterDefault(int i) const { 
     Assert(i < (int)paramDefaults.size());
     return paramDefaults[i]; 
@@ -2654,7 +2954,7 @@ Type::GetAsUnsignedType() const {
  */
 static const Type *
 lVectorConvert(const Type *type, SourcePos pos, const char *reason, int vecSize) {
-    const VectorType *vt = dynamic_cast<const VectorType *>(type);
+    const VectorType *vt = CastType<VectorType>(type);
     if (vt) {
         if (vt->GetElementCount() != vecSize) {
             Error(pos, "Implicit conversion between from vector type "
@@ -2665,7 +2965,7 @@ lVectorConvert(const Type *type, SourcePos pos, const char *reason, int vecSize)
         return vt;
     }
     else {
-        const AtomicType *at = dynamic_cast<const AtomicType *>(type);
+        const AtomicType *at = CastType<AtomicType>(type);
         if (!at) {
             Error(pos, "Non-atomic type \"%s\" can't be converted to vector type "
                   "for %s.", type->GetString().c_str(), reason);
@@ -2681,6 +2981,16 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
                       bool forceVarying, int vecSize) {
     Assert(reason != NULL);
 
+    // First, if one or both types are function types, convert them to
+    // pointer to function types and then try again.
+    if (CastType<FunctionType>(t0) || CastType<FunctionType>(t1)) {
+        if (CastType<FunctionType>(t0))
+            t0 = PointerType::GetUniform(t0);
+        if (CastType<FunctionType>(t1))
+            t1 = PointerType::GetUniform(t1);
+        return MoreGeneralType(t0, t1, pos, reason, forceVarying, vecSize);
+    }
+
     // First, if we need to go varying, promote both of the types to be
     // varying.
     if (t0->IsVaryingType() || t1->IsVaryingType() || forceVarying) {
@@ -2704,8 +3014,7 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
     // If they're function types, it's hopeless if they didn't match in the
     // Type::Equal() call above.  Fail here so that we don't get into
     // trouble calling GetAsConstType()...
-    if (dynamic_cast<const FunctionType *>(t0) ||
-        dynamic_cast<const FunctionType *>(t1)) {
+    if (CastType<FunctionType>(t0) || CastType<FunctionType>(t1)) {
         Error(pos, "Incompatible function types \"%s\" and \"%s\" in %s.",
               t0->GetString().c_str(), t1->GetString().c_str(), reason);
         return NULL;
@@ -2716,8 +3025,8 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
     if (Type::EqualIgnoringConst(t0, t1))
         return t0->GetAsNonConstType();
 
-    const PointerType *pt0 = dynamic_cast<const PointerType *>(t0);
-    const PointerType *pt1 = dynamic_cast<const PointerType *>(t1);
+    const PointerType *pt0 = CastType<PointerType>(t0);
+    const PointerType *pt1 = CastType<PointerType>(t1);
     if (pt0 != NULL && pt1 != NULL) {
         if (PointerType::IsVoidPointer(pt0))
             return pt1;
@@ -2731,8 +3040,8 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
         }
     }
 
-    const VectorType *vt0 = dynamic_cast<const VectorType *>(t0);
-    const VectorType *vt1 = dynamic_cast<const VectorType *>(t1);
+    const VectorType *vt0 = CastType<VectorType>(t0);
+    const VectorType *vt1 = CastType<VectorType>(t1);
     if (vt0 && vt1) {
         // both are vectors; convert their base types and make a new vector
         // type, as long as their lengths match
@@ -2749,7 +3058,7 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
 
         // The 'more general' version of the two vector element types must
         // be an AtomicType (that's all that vectors can hold...)
-        const AtomicType *at = dynamic_cast<const AtomicType *>(t);
+        const AtomicType *at = CastType<AtomicType>(t);
         Assert(at != NULL);
 
         return new VectorType(at, vt0->GetElementCount());
@@ -2764,7 +3073,7 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
         if (!t) 
             return NULL;
 
-        const AtomicType *at = dynamic_cast<const AtomicType *>(t);
+        const AtomicType *at = CastType<AtomicType>(t);
         Assert(at != NULL);
         return new VectorType(at, vt0->GetElementCount());
     }
@@ -2776,18 +3085,18 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
         if (!t) 
             return NULL;
 
-        const AtomicType *at = dynamic_cast<const AtomicType *>(t);
+        const AtomicType *at = CastType<AtomicType>(t);
         Assert(at != NULL);
         return new VectorType(at, vt1->GetElementCount());
     }
 
     // TODO: what do we need to do about references here, if anything??
 
-    const AtomicType *at0 = dynamic_cast<const AtomicType *>(t0->GetReferenceTarget());
-    const AtomicType *at1 = dynamic_cast<const AtomicType *>(t1->GetReferenceTarget());
+    const AtomicType *at0 = CastType<AtomicType>(t0->GetReferenceTarget());
+    const AtomicType *at1 = CastType<AtomicType>(t1->GetReferenceTarget());
 
-    const EnumType *et0 = dynamic_cast<const EnumType *>(t0->GetReferenceTarget());
-    const EnumType *et1 = dynamic_cast<const EnumType *>(t1->GetReferenceTarget());
+    const EnumType *et0 = CastType<EnumType>(t0->GetReferenceTarget());
+    const EnumType *et1 = CastType<EnumType>(t1->GetReferenceTarget());
     if (et0 != NULL && et1 != NULL) {
         // Two different enum types -> make them uint32s...
         Assert(et0->IsVaryingType() == et1->IsVaryingType());
@@ -2835,9 +3144,9 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
 
 bool
 Type::IsBasicType(const Type *type) {
-    return (dynamic_cast<const AtomicType *>(type) != NULL ||
-            dynamic_cast<const EnumType *>(type) != NULL ||
-            dynamic_cast<const PointerType *>(type) != NULL);
+    return (CastType<AtomicType>(type) != NULL ||
+            CastType<EnumType>(type) != NULL ||
+            CastType<PointerType>(type) != NULL);
 }
 
 
@@ -2846,17 +3155,12 @@ lCheckTypeEquality(const Type *a, const Type *b, bool ignoreConst) {
     if (a == NULL || b == NULL)
         return false;
 
-    if (ignoreConst == true) {
-        if (dynamic_cast<const FunctionType *>(a) == NULL)
-            a = a->GetAsNonConstType();
-        if (dynamic_cast<const FunctionType *>(b) == NULL)
-            b = b->GetAsNonConstType();
-    }
-    else if (a->IsConstType() != b->IsConstType())
+    if (ignoreConst == false &&
+        a->IsConstType() != b->IsConstType())
         return false;
 
-    const AtomicType *ata = dynamic_cast<const AtomicType *>(a);
-    const AtomicType *atb = dynamic_cast<const AtomicType *>(b);
+    const AtomicType *ata = CastType<AtomicType>(a);
+    const AtomicType *atb = CastType<AtomicType>(b);
     if (ata != NULL && atb != NULL) {
         return ((ata->basicType == atb->basicType) && 
                 (ata->GetVariability() == atb->GetVariability()));
@@ -2865,47 +3169,46 @@ lCheckTypeEquality(const Type *a, const Type *b, bool ignoreConst) {
     // For all of the other types, we need to see if we have the same two
     // general types.  If so, then we dig into the details of the type and
     // see if all of the relevant bits are equal...
-    const EnumType *eta = dynamic_cast<const EnumType *>(a);
-    const EnumType *etb = dynamic_cast<const EnumType *>(b);
+    const EnumType *eta = CastType<EnumType>(a);
+    const EnumType *etb = CastType<EnumType>(b);
     if (eta != NULL && etb != NULL)
         // Kind of goofy, but this sufficies to check
         return (eta->pos == etb->pos &&
                 eta->GetVariability() == etb->GetVariability());
 
-    const ArrayType *arta = dynamic_cast<const ArrayType *>(a);
-    const ArrayType *artb = dynamic_cast<const ArrayType *>(b);
+    const ArrayType *arta = CastType<ArrayType>(a);
+    const ArrayType *artb = CastType<ArrayType>(b);
     if (arta != NULL && artb != NULL)
         return (arta->GetElementCount() == artb->GetElementCount() && 
                 lCheckTypeEquality(arta->GetElementType(), artb->GetElementType(), 
                                    ignoreConst));
 
-    const VectorType *vta = dynamic_cast<const VectorType *>(a);
-    const VectorType *vtb = dynamic_cast<const VectorType *>(b);
+    const VectorType *vta = CastType<VectorType>(a);
+    const VectorType *vtb = CastType<VectorType>(b);
     if (vta != NULL && vtb != NULL)
         return (vta->GetElementCount() == vtb->GetElementCount() && 
                 lCheckTypeEquality(vta->GetElementType(), vtb->GetElementType(),
                                    ignoreConst));
 
-    const StructType *sta = dynamic_cast<const StructType *>(a);
-    const StructType *stb = dynamic_cast<const StructType *>(b);
-    if (sta != NULL && stb != NULL) {
-        if (sta->GetElementCount() != stb->GetElementCount())
+    const StructType *sta = CastType<StructType>(a);
+    const StructType *stb = CastType<StructType>(b);
+    const UndefinedStructType *usta = CastType<UndefinedStructType>(a);
+    const UndefinedStructType *ustb = CastType<UndefinedStructType>(b);
+    if ((sta != NULL || usta != NULL) && (stb != NULL || ustb != NULL)) {
+        // Report both defined and undefined structs as equal if their
+        // names are the same.
+        if (a->GetVariability() != b->GetVariability())
             return false;
-        if (sta->GetStructName() != stb->GetStructName())
-            return false;
-        if (sta->GetVariability() != stb->GetVariability())
-            return false;
-        for (int i = 0; i < sta->GetElementCount(); ++i)
-            // FIXME: is this redundant now?
-            if (!lCheckTypeEquality(sta->GetElementType(i), stb->GetElementType(i),
-                                    ignoreConst))
-                return false;
 
-        return true;
+        const std::string &namea = sta ? sta->GetStructName() : 
+            usta->GetStructName();
+        const std::string &nameb = stb ? stb->GetStructName() :
+            ustb->GetStructName();
+        return (namea == nameb);
     }
 
-    const PointerType *pta = dynamic_cast<const PointerType *>(a);
-    const PointerType *ptb = dynamic_cast<const PointerType *>(b);
+    const PointerType *pta = CastType<PointerType>(a);
+    const PointerType *ptb = CastType<PointerType>(b);
     if (pta != NULL && ptb != NULL)
         return (pta->IsUniformType() == ptb->IsUniformType() &&
                 pta->IsSlice() == ptb->IsSlice() &&
@@ -2913,14 +3216,14 @@ lCheckTypeEquality(const Type *a, const Type *b, bool ignoreConst) {
                 lCheckTypeEquality(pta->GetBaseType(), ptb->GetBaseType(), 
                                    ignoreConst));
 
-    const ReferenceType *rta = dynamic_cast<const ReferenceType *>(a);
-    const ReferenceType *rtb = dynamic_cast<const ReferenceType *>(b);
+    const ReferenceType *rta = CastType<ReferenceType>(a);
+    const ReferenceType *rtb = CastType<ReferenceType>(b);
     if (rta != NULL && rtb != NULL)
         return (lCheckTypeEquality(rta->GetReferenceTarget(),
                                    rtb->GetReferenceTarget(), ignoreConst));
 
-    const FunctionType *fta = dynamic_cast<const FunctionType *>(a);
-    const FunctionType *ftb = dynamic_cast<const FunctionType *>(b);
+    const FunctionType *fta = CastType<FunctionType>(a);
+    const FunctionType *ftb = CastType<FunctionType>(b);
     if (fta != NULL && ftb != NULL) {
         // Both the return types and all of the argument types must match
         // for function types to match
diff --git a/type.h b/type.h
index 94c28f0b..ebd69af9 100644
--- a/type.h
+++ b/type.h
@@ -42,6 +42,7 @@
 #include "util.h"
 #include <llvm/Type.h>
 #include <llvm/DerivedTypes.h>
+#include <llvm/ADT/SmallVector.h>
 
 class ConstExpr;
 class StructType;
@@ -72,6 +73,21 @@ struct Variability {
 };
 
 
+/** Enumeration that records each of the types that inherit from the Type
+    base class. */
+enum TypeId {
+    ATOMIC_TYPE,
+    ENUM_TYPE,
+    POINTER_TYPE,
+    ARRAY_TYPE,
+    VECTOR_TYPE,
+    STRUCT_TYPE,
+    UNDEFINED_STRUCT_TYPE,
+    REFERENCE_TYPE,
+    FUNCTION_TYPE
+};
+
+
 /** @brief Interface class that defines the type abstraction.
 
     Abstract base class that defines the interface that must be implemented
@@ -187,7 +203,7 @@ public:
     virtual std::string GetCDeclaration(const std::string &name) const = 0;
 
     /** Returns the LLVM type corresponding to this ispc type */
-    virtual LLVM_TYPE_CONST llvm::Type *LLVMType(llvm::LLVMContext *ctx) const = 0;
+    virtual llvm::Type *LLVMType(llvm::LLVMContext *ctx) const = 0;
 
     /** Returns the DIType (LLVM's debugging information structure),
         corresponding to this type. */
@@ -231,6 +247,14 @@ public:
         (i.e. not an aggregation of multiple instances of a type or
         types.) */
     static bool IsBasicType(const Type *type);
+
+    /** Indicates which Type implementation this type is.  This value can
+        be used to determine the actual type much more efficiently than
+        using dynamic_cast. */
+    const TypeId typeId;
+
+protected:
+    Type(TypeId id) : typeId(id) { }
 };
 
 
@@ -269,7 +293,7 @@ public:
     std::string Mangle() const;
     std::string GetCDeclaration(const std::string &name) const;
 
-    LLVM_TYPE_CONST llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
+    llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
     llvm::DIType GetDIType(llvm::DIDescriptor scope) const;
 
     /** This enumerator records the basic types that AtomicTypes can be 
@@ -309,6 +333,8 @@ private:
     const Variability variability;
     const bool isConst;
     AtomicType(BasicType basicType, Variability v, bool isConst);
+
+    mutable const AtomicType *asOtherConstType, *asUniformType, *asVaryingType;
 };
 
 
@@ -343,7 +369,7 @@ public:
     std::string Mangle() const;
     std::string GetCDeclaration(const std::string &name) const;
 
-    LLVM_TYPE_CONST llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
+    llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
     llvm::DIType GetDIType(llvm::DIDescriptor scope) const;
 
     /** Provides the enumerators defined in the enum definition. */
@@ -409,7 +435,6 @@ public:
     const PointerType *GetAsSlice() const;
     const PointerType *GetAsNonSlice() const;
     const PointerType *GetAsFrozenSlice() const;
-    const StructType *GetSliceStructType() const;
 
     const Type *GetBaseType() const;
     const PointerType *GetAsVaryingType() const;
@@ -425,7 +450,7 @@ public:
     std::string Mangle() const;
     std::string GetCDeclaration(const std::string &name) const;
 
-    LLVM_TYPE_CONST llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
+    llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
     llvm::DIType GetDIType(llvm::DIDescriptor scope) const;
 
     static PointerType *Void;
@@ -453,6 +478,9 @@ public:
         index must be between 0 and GetElementCount()-1.
      */
     virtual const Type *GetElementType(int index) const = 0;
+
+protected:
+    CollectionType(TypeId id) : Type(id) { }
 };
 
 
@@ -474,6 +502,9 @@ public:
         the same type.
      */
     const Type *GetElementType(int index) const;
+
+protected:
+    SequentialType(TypeId id) : CollectionType(id) { }
 };
 
 
@@ -523,7 +554,7 @@ public:
     std::string GetCDeclaration(const std::string &name) const;
 
     llvm::DIType GetDIType(llvm::DIDescriptor scope) const;
-    LLVM_TYPE_CONST llvm::ArrayType *LLVMType(llvm::LLVMContext *ctx) const;
+    llvm::ArrayType *LLVMType(llvm::LLVMContext *ctx) const;
 
     /** This method returns the total number of elements in the array,
         including all dimensions if this is a multidimensional array. */
@@ -589,7 +620,7 @@ public:
     std::string Mangle() const;
     std::string GetCDeclaration(const std::string &name) const;
 
-    LLVM_TYPE_CONST llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
+    llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
     llvm::DIType GetDIType(llvm::DIDescriptor scope) const;
 
     int GetElementCount() const;
@@ -612,9 +643,9 @@ private:
  */
 class StructType : public CollectionType {
 public:
-    StructType(const std::string &name, const std::vector<const Type *> &elts, 
-               const std::vector<std::string> &eltNames, 
-               const std::vector<SourcePos> &eltPositions, bool isConst, 
+    StructType(const std::string &name, const llvm::SmallVector<const Type *, 8> &elts, 
+               const llvm::SmallVector<std::string, 8> &eltNames, 
+               const llvm::SmallVector<SourcePos, 8> &eltPositions, bool isConst, 
                Variability variability, SourcePos pos);
 
     Variability GetVariability() const;
@@ -639,7 +670,7 @@ public:
     std::string Mangle() const;
     std::string GetCDeclaration(const std::string &name) const;
 
-    LLVM_TYPE_CONST llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
+    llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
     llvm::DIType GetDIType(llvm::DIDescriptor scope) const;
 
     /** Returns the type of the structure element with the given name (if any).
@@ -655,12 +686,12 @@ public:
     int GetElementNumber(const std::string &name) const;
 
     /** Returns the name of the i'th element of the structure. */
-    const std::string GetElementName(int i) const { return elementNames[i]; }
+    const std::string &GetElementName(int i) const { return elementNames[i]; }
     
     /** Returns the total number of elements in the structure. */
     int GetElementCount() const { return int(elementTypes.size()); }
 
-    SourcePos GetElementPosition(int i) const { return elementPositions[i]; }
+    const SourcePos &GetElementPosition(int i) const { return elementPositions[i]; }
 
     /** Returns the name of the structure type.  (e.g. struct Foo -> "Foo".) */
     const std::string &GetStructName() const { return name; }
@@ -668,7 +699,7 @@ public:
 private:
     static bool checkIfCanBeSOA(const StructType *st);
 
-    const std::string name;
+    /*const*/ std::string name;
     /** The types of the struct elements.  Note that we store these with
         uniform/varying exactly as they were declared in the source file.
         (In other words, even if this struct has a varying qualifier and
@@ -679,11 +710,61 @@ private:
         make a uniform version of the struct, we've maintained the original
         information about the member types.
      */
-    const std::vector<const Type *> elementTypes;
-    const std::vector<std::string> elementNames;
+    const llvm::SmallVector<const Type *, 8> elementTypes;
+    const llvm::SmallVector<std::string, 8> elementNames;
     /** Source file position at which each structure element declaration
         appeared. */
-    const std::vector<SourcePos> elementPositions;
+    const llvm::SmallVector<SourcePos, 8> elementPositions;
+    const Variability variability;
+    const bool isConst;
+    const SourcePos pos;
+
+    mutable llvm::SmallVector<const Type *, 8> finalElementTypes;
+
+    mutable const StructType *oppositeConstStructType;
+};
+
+
+/** Type implementation representing a struct name that has been declared
+    but where the struct members haven't been defined (i.e. "struct Foo;").
+    This class doesn't do much besides serve as a placeholder that other
+    code can use to detect the presence of such a struct.
+ */
+class UndefinedStructType : public Type {
+public:
+    UndefinedStructType(const std::string &name, const Variability variability,
+                        bool isConst, SourcePos pos);
+
+    Variability GetVariability() const;
+
+    bool IsBoolType() const;
+    bool IsFloatType() const;
+    bool IsIntType() const;
+    bool IsUnsignedType() const;
+    bool IsConstType() const;
+
+    const Type *GetBaseType() const;
+    const UndefinedStructType *GetAsVaryingType() const;
+    const UndefinedStructType *GetAsUniformType() const;
+    const UndefinedStructType *GetAsUnboundVariabilityType() const;
+    const UndefinedStructType *GetAsSOAType(int width) const;
+    const UndefinedStructType *ResolveUnboundVariability(Variability v) const;
+
+    const UndefinedStructType *GetAsConstType() const;
+    const UndefinedStructType *GetAsNonConstType() const;
+
+    std::string GetString() const;
+    std::string Mangle() const;
+    std::string GetCDeclaration(const std::string &name) const;
+
+    llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
+    llvm::DIType GetDIType(llvm::DIDescriptor scope) const;
+
+    /** Returns the name of the structure type.  (e.g. struct Foo -> "Foo".) */
+    const std::string &GetStructName() const { return name; }
+
+private:
+    const std::string name;
     const Variability variability;
     const bool isConst;
     const SourcePos pos;
@@ -719,11 +800,12 @@ public:
     std::string Mangle() const;
     std::string GetCDeclaration(const std::string &name) const;
 
-    LLVM_TYPE_CONST llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
+    llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
     llvm::DIType GetDIType(llvm::DIDescriptor scope) const;
 
 private:
     const Type * const targetType;
+    mutable const ReferenceType *asOtherConstType;
 };
 
 
@@ -741,12 +823,12 @@ private:
 class FunctionType : public Type {
 public:
     FunctionType(const Type *returnType, 
-                 const std::vector<const Type *> &argTypes, SourcePos pos);
+                 const llvm::SmallVector<const Type *, 8> &argTypes, SourcePos pos);
     FunctionType(const Type *returnType, 
-                 const std::vector<const Type *> &argTypes,
-                 const std::vector<std::string> &argNames,
-                 const std::vector<ConstExpr *> &argDefaults,
-                 const std::vector<SourcePos> &argPos,
+                 const llvm::SmallVector<const Type *, 8> &argTypes,
+                 const llvm::SmallVector<std::string, 8> &argNames,
+                 const llvm::SmallVector<Expr *, 8> &argDefaults,
+                 const llvm::SmallVector<SourcePos, 8> &argPos,
                  bool isTask, bool isExported, bool isExternC);
 
     Variability GetVariability() const;
@@ -771,21 +853,23 @@ public:
     std::string Mangle() const;
     std::string GetCDeclaration(const std::string &fname) const;
 
-    LLVM_TYPE_CONST llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
+    llvm::Type *LLVMType(llvm::LLVMContext *ctx) const;
     llvm::DIType GetDIType(llvm::DIDescriptor scope) const;
 
     const Type *GetReturnType() const { return returnType; }
 
+    const std::string GetReturnTypeString() const;
+
     /** This method returns the LLVM FunctionType that corresponds to this
         function type.  The \c includeMask parameter indicates whether the
         llvm::FunctionType should have a mask as the last argument in its
         function signature. */
-    LLVM_TYPE_CONST llvm::FunctionType *LLVMFunctionType(llvm::LLVMContext *ctx, 
+    llvm::FunctionType *LLVMFunctionType(llvm::LLVMContext *ctx, 
                                                          bool includeMask = false) const;
 
     int GetNumParameters() const { return (int)paramTypes.size(); }
     const Type *GetParameterType(int i) const;
-    ConstExpr * GetParameterDefault(int i) const;
+    Expr * GetParameterDefault(int i) const;
     const SourcePos &GetParameterSourcePos(int i) const;
     const std::string &GetParameterName(int i) const;
 
@@ -814,16 +898,131 @@ private:
 
     // The following four vectors should all have the same length (which is
     // in turn the length returned by GetNumParameters()).
-    const std::vector<const Type *> paramTypes;
-    const std::vector<std::string> paramNames;
+    const llvm::SmallVector<const Type *, 8> paramTypes;
+    const llvm::SmallVector<std::string, 8> paramNames;
     /** Default values of the function's arguments.  For arguments without
         default values provided, NULL is stored. */
-    mutable std::vector<ConstExpr *> paramDefaults;
+    mutable llvm::SmallVector<Expr *, 8> paramDefaults;
     /** The names provided (if any) with the function arguments in the
         function's signature.  These should only be used for error messages
         and the like and so not affect testing function types for equality,
         etc. */
-    const std::vector<SourcePos> paramPositions;
+    const llvm::SmallVector<SourcePos, 8> paramPositions;
 };
 
+
+/* Efficient dynamic casting of Types, driven by Type::typeId instead of
+   dynamic_cast.  This primary template is the fallback for any T that has
+   no specialization: it always returns NULL, indicating a failed cast. */
+template <typename T> inline const T *
+CastType(const Type *type) {
+    return NULL;
+}
+
+
+/* Now we have template specializations for the Types implemented in this
+   file.  Each one checks the Type::typeId member and then performs the
+   corresponding static cast if it's safe as per the typeId.
+ */
+template <> inline const AtomicType *
+CastType(const Type *type) {  // checked downcast to AtomicType; NULL on mismatch
+    if (type != NULL && type->typeId == ATOMIC_TYPE)
+        return static_cast<const AtomicType *>(type);  // tag verified above
+    else
+        return NULL;
+}
+
+template <> inline const EnumType *
+CastType(const Type *type) {  // checked downcast to EnumType; NULL on mismatch
+    if (type != NULL && type->typeId == ENUM_TYPE)
+        return static_cast<const EnumType *>(type);  // tag verified above
+    else
+        return NULL;
+}
+
+template <> inline const PointerType *
+CastType(const Type *type) {  // checked downcast to PointerType; NULL on mismatch
+    if (type != NULL && type->typeId == POINTER_TYPE)
+        return static_cast<const PointerType *>(type);  // tag verified above
+    else
+        return NULL;
+}
+
+template <> inline const ArrayType *
+CastType(const Type *type) {  // checked downcast to ArrayType; NULL on mismatch
+    if (type != NULL && type->typeId == ARRAY_TYPE)
+        return static_cast<const ArrayType *>(type);  // tag verified above
+    else
+        return NULL;
+}
+
+template <> inline const VectorType *
+CastType(const Type *type) {  // checked downcast to VectorType; NULL on mismatch
+    if (type != NULL && type->typeId == VECTOR_TYPE)
+        return static_cast<const VectorType *>(type);  // tag verified above
+    else
+        return NULL;
+}
+
+template <> inline const SequentialType *
+CastType(const Type *type) {
+    // No single typeId identifies a SequentialType here: each sequential
+    // implementation is listed, so update this test when adding a new one.
+    if (type != NULL &&
+        (type->typeId == ARRAY_TYPE || type->typeId == VECTOR_TYPE))
+        return static_cast<const SequentialType *>(type);  // tag verified above
+    else
+        return NULL;
+}
+
+template <> inline const CollectionType *
+CastType(const Type *type) {
+    // Likewise, each collection implementation is listed explicitly; a new
+    // collection type implementation requires updating this function.
+    if (type != NULL &&
+        (type->typeId == ARRAY_TYPE || type->typeId == VECTOR_TYPE ||
+         type->typeId == STRUCT_TYPE))
+        return static_cast<const CollectionType *>(type);  // tag verified above
+    else
+        return NULL;
+}
+
+template <> inline const StructType *
+CastType(const Type *type) {  // checked downcast to StructType; NULL on mismatch
+    if (type != NULL && type->typeId == STRUCT_TYPE)
+        return static_cast<const StructType *>(type);  // tag verified above
+    else
+        return NULL;
+}
+
+template <> inline const UndefinedStructType *
+CastType(const Type *type) {  // checked downcast to UndefinedStructType; NULL on mismatch
+    if (type != NULL && type->typeId == UNDEFINED_STRUCT_TYPE)
+        return static_cast<const UndefinedStructType *>(type);  // tag verified above
+    else
+        return NULL;
+}
+
+template <> inline const ReferenceType *
+CastType(const Type *type) {  // checked downcast to ReferenceType; NULL on mismatch
+    if (type != NULL && type->typeId == REFERENCE_TYPE)
+        return static_cast<const ReferenceType *>(type);  // tag verified above
+    else
+        return NULL;
+}
+
+template <> inline const FunctionType *
+CastType(const Type *type) {  // checked downcast to FunctionType; NULL on mismatch
+    if (type != NULL && type->typeId == FUNCTION_TYPE)
+        return static_cast<const FunctionType *>(type);  // tag verified above
+    else
+        return NULL;
+}
+
+
+inline bool IsReferenceType(const Type *t) {  // true iff t is non-NULL and a ReferenceType
+    return CastType<ReferenceType>(t) != NULL;
+}
+
+
 #endif // ISPC_TYPE_H
diff --git a/util.cpp b/util.cpp
index 7057755b..3c0de598 100644
--- a/util.cpp
+++ b/util.cpp
@@ -90,6 +90,49 @@ lTerminalWidth() {
 }
 
 
+static bool
+lHaveANSIColors() {  // true when $TERM is set and is not "dumb"
+    static bool r = (getenv("TERM") != NULL &&  // computed on first call, then cached
+                     strcmp(getenv("TERM"), "dumb") != 0);
+    return r;  // NOTE(review): no isatty() check, so redirected output may also get escapes
+}
+
+
+static const char *
+lStartBold() {  // ANSI escape that turns on bold text, or "" without ANSI support
+    if (lHaveANSIColors())
+        return "\033[1m";
+    else
+        return "";
+}
+
+
+static const char *
+lStartRed() {  // ANSI escape that selects red text, or "" without ANSI support
+    if (lHaveANSIColors())
+        return "\033[31m";
+    else
+        return "";
+}
+
+
+static const char *
+lStartBlue() {  // ANSI escape that selects blue text, or "" without ANSI support
+    if (lHaveANSIColors())
+        return "\033[34m";
+    else
+        return "";
+}
+
+
+static const char *
+lResetColor() {  // ANSI escape that resets text attributes, or "" without ANSI support
+    if (lHaveANSIColors())
+        return "\033[0m";
+    else
+        return "";
+}
+
 /** Given a pointer into a string, find the end of the current word and
     return a pointer to its last character. 
 */
@@ -140,17 +183,43 @@ lPrintFileLineContext(SourcePos p) {
     fclose(f);
 }
 
+
+/** Returns the number of printable characters that precede the
+    numColons-th ':' in buf, plus 2 (to step past the ": " separator).
+    Characters of ANSI escape sequences ("\033" up to 'm') aren't counted.
+    If buf has fewer colons, the result is its printable length plus 2. */
+static int
+lFindIndent(int numColons, const char *buf) {
+    int indent = 0;
+    while (*buf != '\0') {
+        if (*buf == '\033') {
+            while (*buf != '\0' && *buf != 'm')  // skip the escape body...
+                ++buf;
+            if (*buf == 'm')  // ...and its terminating 'm', if present
+                ++buf;
+        }
+        else {
+            if (*buf == ':') {
+                if (--numColons == 0)  // requested colon reached; it is not counted
+                    break;
+            }
+            ++indent;
+            ++buf;
+        }
+    }
+    return indent + 2;  // the colon itself plus the character after it
+}
+
+
 /** Print the given string to the given FILE, assuming the given output
     column width.  Break words as needed to avoid words spilling past the
     last column.  */
 static void
-lPrintWithWordBreaks(const char *buf, int columnWidth, FILE *out) {
+lPrintWithWordBreaks(const char *buf, int indent, int columnWidth, FILE *out) {
 #ifdef ISPC_IS_WINDOWS
     fputs(buf, out);
 #else
     int column = 0;
-    Assert(strchr(buf, ':') != NULL);
-    int indent = strchr(buf, ':') - buf + 2;
     int width = std::max(40, columnWidth - 2);
 
     // Collect everything into a string and print it all at once at the end
@@ -160,6 +229,29 @@ lPrintWithWordBreaks(const char *buf, int columnWidth, FILE *out) {
 
     const char *msgPos = buf;
     while (true) {
+        if (*msgPos == '\033') {
+            // handle ANSI color escape: copy it to the output buffer
+            // without charging for the characters it uses
+            do {
+                outStr.push_back(*msgPos++);
+            } while (*msgPos != '\0' && *msgPos != 'm');
+            continue;
+        }
+        else if (*msgPos == '\n') {
+            // Handle newlines cleanly
+            column = indent;
+            outStr.push_back('\n');
+            for (int i = 0; i < indent; ++i)
+                outStr.push_back(' ');
+            // Respect spaces after newlines
+            ++msgPos;
+            while (*msgPos == ' ') {
+                outStr.push_back(' ');
+                ++msgPos;
+            }
+            continue;
+        }
+
         while (*msgPos != '\0' && isspace(*msgPos))
             ++msgPos;
         if (*msgPos == '\0')
@@ -171,8 +263,8 @@ lPrintWithWordBreaks(const char *buf, int columnWidth, FILE *out) {
             column = indent;
             outStr.push_back('\n');
             // Indent to the same column as the ":" at the start of the
-            // message, unless doing so would be too far in.
-            for (int i = 0; i < std::min(16, indent); ++i)
+            // message.
+            for (int i = 0; i < indent; ++i)
                 outStr.push_back(' ');
         }
 
@@ -225,26 +317,37 @@ asprintf(char **sptr, const char *fmt, ...)
     @param args   Arguments with values for format string % entries
 */
 static void
-lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
+lPrint(const char *type, bool isError, SourcePos p, const char *fmt, 
+       va_list args) {
     char *errorBuf, *formattedBuf;
     if (vasprintf(&errorBuf, fmt, args) == -1) {
         fprintf(stderr, "vasprintf() unable to allocate memory!\n");
         abort();
     }
+
+    int indent = 0;
     if (p.first_line == 0) {
         // We don't have a valid SourcePos, so create a message without it
-        if (asprintf(&formattedBuf, "%s: %s\n", type, errorBuf) == -1) {
+        if (asprintf(&formattedBuf, "%s%s%s%s%s: %s%s", lStartBold(),
+                     isError ? lStartRed() : lStartBlue(), type,
+                     lResetColor(), lStartBold(), errorBuf, 
+                     lResetColor()) == -1) {
             fprintf(stderr, "asprintf() unable to allocate memory!\n");
             exit(1);
         }
+        indent = lFindIndent(1, formattedBuf);
     }
     else {
         // Create an error message that includes the file and line number
-        if (asprintf(&formattedBuf, "%s:%d:%d: %s: %s\n", p.name, 
-                     p.first_line, p.first_column, type, errorBuf) == -1) {
+        if (asprintf(&formattedBuf, "%s%s:%d:%d: %s%s%s%s: %s%s", 
+                     lStartBold(), p.name, p.first_line, p.first_column, 
+                     isError ? lStartRed() : lStartBlue(), type, 
+                     lResetColor(), lStartBold(), errorBuf, 
+                     lResetColor()) == -1) {
             fprintf(stderr, "asprintf() unable to allocate memory!\n");
             exit(1);
         }
+        indent = lFindIndent(3, formattedBuf);
     }
 
     // Now that we've done all that work, see if we've already printed the
@@ -255,7 +358,7 @@ lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
         return;
     printed.insert(formattedBuf);
 
-    lPrintWithWordBreaks(formattedBuf, lTerminalWidth(), stderr);
+    lPrintWithWordBreaks(formattedBuf, indent, lTerminalWidth(), stderr);
     lPrintFileLineContext(p);
 
     free(errorBuf);
@@ -271,7 +374,7 @@ Error(SourcePos p, const char *fmt, ...) {
 
     va_list args;
     va_start(args, fmt);
-    lPrint("Error", p, fmt, args);
+    lPrint("Error", true, p, fmt, args);
     va_end(args);
 }
 
@@ -283,7 +386,7 @@ Debug(SourcePos p, const char *fmt, ...) {
 
     va_list args;
     va_start(args, fmt);
-    lPrint("Debug", p, fmt, args);
+    lPrint("Debug", false, p, fmt, args);
     va_end(args);
 }
 
@@ -298,7 +401,8 @@ Warning(SourcePos p, const char *fmt, ...) {
 
     va_list args;
     va_start(args, fmt);
-    lPrint(g->warningsAsErrors ? "Error" : "Warning", p, fmt, args);
+    lPrint(g->warningsAsErrors ? "Error" : "Warning", g->warningsAsErrors,
+           p, fmt, args);
     va_end(args);
 }
 
@@ -311,23 +415,46 @@ PerformanceWarning(SourcePos p, const char *fmt, ...) {
 
     va_list args;
     va_start(args, fmt);
-    lPrint("Performance Warning", p, fmt, args);
+    lPrint("Performance Warning", false, p, fmt, args);
     va_end(args);
 }
 
 
-void
-FatalError(const char *file, int line, const char *message) {
-    fprintf(stderr, "%s(%d): FATAL ERROR: %s\n", file, line, message);
+static void
+lPrintBugText() {  // writes the fixed "please file a bug report" banner to stderr
     fprintf(stderr, "***\n"
             "*** Please file a bug report at https://github.com/ispc/ispc/issues\n"
             "*** (Including as much information as you can about how to "
             "reproduce this error).\n"
             "*** You have apparently encountered a bug in the compiler that we'd "
             "like to fix!\n***\n");
+}
+
+
+void
+FatalError(const char *file, int line, const char *message) {  // never returns: ends in abort()
+    fprintf(stderr, "%s(%d): FATAL ERROR: %s\n", file, line, message);  // "file(line): ..." to stderr
+    lPrintBugText();  // then the bug-report banner
     abort();
 }
 
+
+void
+DoAssert(const char *file, int line, const char *expr) {  // report a failed assertion; never returns
+    fprintf(stderr, "%s:%d: Assertion failed: \"%s\".\n", file, line, expr);  // line is an int: %d
+    lPrintBugText();  // then the bug-report banner
+    abort();
+}
+
+
+void
+DoAssertPos(SourcePos pos, const char *file, int line, const char *expr) {  // like DoAssert, reported via Error() at pos
+    Error(pos, "Assertion failed (%s:%d): \"%s\".", file, line, expr);  // line is an int: %d
+    lPrintBugText();  // then the bug-report banner
+    abort();
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 
 // http://en.wikipedia.org/wiki/Levenshtein_distance