Merged Upstream

2012-06-01 11:13:16 -07:00
parent 512f8d8b60 6df7d31a5b
commit fa1fd8a576
253 changed files with 10866 additions and 4233 deletions
--- a/34
+++ b/34
@@ -2,6 +2,15 @@
 # ispc Makefile
 #

+# If you have your own special version of llvm and/or clang, change
+# these variables to match.
+LLVM_CONFIG=$(shell which llvm-config)
+CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
+
+# Add llvm bin to the path so any scripts run will go to the right llvm-config
+LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
+export PATH:=$(LLVM_BIN):$(PATH)
+
 ARCH_OS = $(shell uname)
 ifeq ($(ARCH_OS), Darwin)
 	ARCH_OS2 = "OSX"
@@ -10,10 +19,12 @@ else
 endif
 ARCH_TYPE = $(shell arch)

-ifeq ($(shell llvm-config --version), 3.1svn)
+ifeq ($(shell $(LLVM_CONFIG) --version), 3.0)
+  LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs)
+else
  LLVM_LIBS=-lLLVMAsmParser -lLLVMInstrumentation -lLLVMLinker			\
 	-lLLVMArchive -lLLVMBitReader -lLLVMDebugInfo -lLLVMJIT -lLLVMipo	\
-	-lLLVMBitWriter -lLLVMTableGen -lLLVMCBackendInfo			\
+	-lLLVMBitWriter -lLLVMTableGen 			                        \
 	-lLLVMX86Disassembler -lLLVMX86CodeGen -lLLVMSelectionDAG		\
 	-lLLVMAsmPrinter -lLLVMX86AsmParser -lLLVMX86Desc -lLLVMX86Info		\
 	-lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCDisassembler	-lLLVMMCParser	\
@@ -21,19 +32,17 @@ ifeq ($(shell llvm-config --version), 3.1svn)
 	-lLLVMipa -lLLVMAnalysis -lLLVMMCJIT -lLLVMRuntimeDyld			\
 	-lLLVMExecutionEngine -lLLVMTarget -lLLVMMC -lLLVMObject -lLLVMCore 	\
 	-lLLVMSupport
-else
-  LLVM_LIBS=$(shell llvm-config --libs)
 endif

 CLANG=clang
 CLANG_LIBS = -lclangFrontend -lclangDriver \
             -lclangSerialization -lclangParse -lclangSema \
             -lclangAnalysis -lclangAST -lclangLex -lclangBasic
-ifeq ($(shell llvm-config --version), 3.1svn)
+ifneq ($(shell $(LLVM_CONFIG) --version), 3.0)
  CLANG_LIBS += -lclangEdit
 endif

-ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
+ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
 	-lpthread

 ifeq ($(ARCH_OS),Linux)
@@ -44,8 +53,8 @@ ifeq ($(ARCH_OS2),Msys)
 	ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
 endif

-LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
-LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
+LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
+LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
 LLVM_VERSION_DEF=-D$(LLVM_VERSION)

 BUILD_DATE=$(shell date +%Y%m%d)
@@ -53,8 +62,9 @@ BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)

 CXX=g++
 CPP=cpp
-OPT=-g3
-CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
+OPT=-O2
+CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE)  \
+	-Wall $(LLVM_VERSION_DEF) \
 	-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""

 LDFLAGS=
@@ -75,7 +85,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
 TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
-	generic-16 generic-1
+	generic-16 generic-32 generic-64 generic-1
 BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
 	builtins/dispatch.ll
 BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
@@ -114,7 +124,7 @@ doxygen:

 ispc: print_llvm_src dirs $(OBJS)
 	@echo Creating ispc executable
-	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
+	@$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)

 objs/%.o: %.cpp
 	@echo Compiling $<
--- a/ast.cpp
+++ b/ast.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -32,8 +32,10 @@
 */

 /** @file ast.cpp
-    @brief 
-*/
+
+    @brief General functionality related to abstract syntax trees and
+    traversal of them.
+ */

 #include "ast.h"
 #include "expr.h"
@@ -53,10 +55,10 @@ ASTNode::~ASTNode() {
 // AST

 void
-AST::AddFunction(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code) {
+AST::AddFunction(Symbol *sym, Stmt *code) {
    if (sym == NULL)
        return;
-    functions.push_back(new Function(sym, args, code));
+    functions.push_back(new Function(sym, code));
 }


@@ -151,7 +153,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
        else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
            ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
        else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
-            rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data);
+            rs->expr = (Expr *)WalkAST(rs->expr, preFunc, postFunc, data);
        else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
            std::vector<Stmt *> &sls = sl->stmts;
            for (unsigned int i = 0; i < sls.size(); ++i)
@@ -305,19 +307,39 @@ TypeCheck(Stmt *stmt) {
 }


+struct CostData {
+    CostData() { cost = foreachDepth = 0; }
+
+    int cost;
+    int foreachDepth;
+};
+
+
 static bool
-lCostCallback(ASTNode *node, void *c) {
-    int *cost = (int *)c;
-    *cost += node->EstimateCost();
+lCostCallbackPre(ASTNode *node, void *d) {
+    CostData *data = (CostData *)d;
+    if (dynamic_cast<ForeachStmt *>(node) != NULL)
+        ++data->foreachDepth;
+    if (data->foreachDepth == 0)
+        data->cost += node->EstimateCost();
    return true;
 }


+static ASTNode *
+lCostCallbackPost(ASTNode *node, void *d) {
+    CostData *data = (CostData *)d;
+    if (dynamic_cast<ForeachStmt *>(node) != NULL)
+        --data->foreachDepth;
+    return node;
+}
+
+
 int
 EstimateCost(ASTNode *root) {
-    int cost = 0;
-    WalkAST(root, lCostCallback, NULL, &cost);
-    return cost;
+    CostData data;
+    WalkAST(root, lCostCallbackPre, lCostCallbackPost, &data);
+    return data.cost;
 }


@@ -334,10 +356,10 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
            return false;

        const Type *type = fce->func->GetType();
-        const PointerType *pt = dynamic_cast<const PointerType *>(type);
+        const PointerType *pt = CastType<PointerType>(type);
        if (pt != NULL)
            type = pt->GetBaseType();
-        const FunctionType *ftype = dynamic_cast<const FunctionType *>(type);
+        const FunctionType *ftype = CastType<FunctionType>(type);
        Assert(ftype != NULL);

        if (ftype->isSafe == false) {
@@ -363,17 +385,22 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
        return false;
    }

-    if (g->target.allOffMaskIsSafe == true)
-        // Don't worry about memory accesses if we have a target that can
-        // safely run them with the mask all off
-        return true;
+    if (dynamic_cast<ForeachStmt *>(node) != NULL) {
+        // foreach() statements also shouldn't be run with an all-off mask.
+        // Since they re-establish an 'all on' mask, this would be pretty
+        // unintuitive.  (More generally, it's possibly a little strange to
+        // allow foreach() in the presence of any non-uniform control
+        // flow...)
+        *okPtr = false;
+        return false;
+    }

    IndexExpr *ie;
    if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
        const Type *type = ie->baseExpr->GetType();
        if (type == NULL)
            return true;
-        if (dynamic_cast<const ReferenceType *>(type) != NULL)
+        if (CastType<ReferenceType>(type) != NULL)
            type = type->GetReferenceTarget();

        ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
@@ -383,16 +410,14 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
            return false;
        }

-        const PointerType *pointerType = 
-            dynamic_cast<const PointerType *>(type);
+        const PointerType *pointerType = CastType<PointerType>(type);
        if (pointerType != NULL) {
            // pointer[index] -> can't be sure -> not safe
            *okPtr = false;
            return false;
        }

-        const SequentialType *seqType = 
-            dynamic_cast<const SequentialType *>(type);
+        const SequentialType *seqType = CastType<SequentialType>(type);
        Assert(seqType != NULL);
        int nElements = seqType->GetElementCount();
        if (nElements == 0) {
--- a/ast.h
+++ b/ast.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -84,8 +84,7 @@ class AST {
 public:
    /** Add the AST for a function described by the given declaration
        information and source code. */
-    void AddFunction(Symbol *sym, const std::vector<Symbol *> &args, 
-                     Stmt *code);
+    void AddFunction(Symbol *sym, Stmt *code);

    /** Generate LLVM IR for all of the functions into the current
        module. */
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -157,7 +157,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {

 static void
 lCreateSymbol(const std::string &name, const Type *returnType, 
-              const std::vector<const Type *> &argTypes, 
+              llvm::SmallVector<const Type *, 8> &argTypes, 
              const llvm::FunctionType *ftype, llvm::Function *func, 
              SymbolTable *symbolTable) {
    SourcePos noPos;
@@ -199,7 +199,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
    // bool, so just have a one-off override for that one...
    if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") {
        const Type *returnType = AtomicType::VaryingInt32;
-        std::vector<const Type *> argTypes;
+        llvm::SmallVector<const Type *, 8> argTypes;
        argTypes.push_back(AtomicType::VaryingBool);

        FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
@@ -229,7 +229,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
        // Iterate over the arguments and try to find their equivalent ispc
        // types.  Track if any of the arguments has an integer type.
        bool anyIntArgs = false;
-        std::vector<const Type *> argTypes;
+        llvm::SmallVector<const Type *, 8> argTypes;
        for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
            const llvm::Type *llvmArgType = ftype->getParamType(j);
            const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
@@ -291,7 +291,7 @@ lCheckModuleIntrinsics(llvm::Module *module) {
        if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
            llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
            Assert(id != 0);
-            LLVM_TYPE_CONST llvm::Type *intrinsicType = 
+            llvm::Type *intrinsicType = 
                llvm::Intrinsic::getType(*g->ctx, id);
            intrinsicType = llvm::PointerType::get(intrinsicType, 0);
            Assert(func->getType() == intrinsicType);
@@ -411,12 +411,16 @@ lSetInternalFunctions(llvm::Module *module) {
        "__extract_int64",
        "__extract_int8",
        "__fastmath",
+        "__float_to_half_uniform",
+        "__float_to_half_varying",
        "__floatbits_uniform_int32",
        "__floatbits_varying_int32",
        "__floor_uniform_double",
        "__floor_uniform_float",
        "__floor_varying_double",
        "__floor_varying_float",
+        "__half_to_float_uniform",
+        "__half_to_float_varying",
        "__insert_int16",
        "__insert_int32",
        "__insert_int64",
@@ -616,9 +620,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,

        std::string(linkError);
        if (llvm::Linker::LinkModules(module, bcModule, 
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
                                      llvm::Linker::DestroySource,
-#endif // LLVM_3_0
                                      &linkError))
            Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
        lSetInternalFunctions(module);
@@ -635,16 +637,36 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
 static void
 lDefineConstantInt(const char *name, int val, llvm::Module *module,
                   SymbolTable *symbolTable) {
-    Symbol *pw = 
+    Symbol *sym = 
        new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType(),
                   SC_STATIC);
-    pw->constValue = new ConstExpr(pw->type, val, SourcePos());
-    LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
+    sym->constValue = new ConstExpr(sym->type, val, SourcePos());
+    llvm::Type *ltype = LLVMTypes::Int32Type;
    llvm::Constant *linit = LLVMInt32(val);
-    pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
-                                              llvm::GlobalValue::InternalLinkage,
-                                              linit, pw->name.c_str());
-    symbolTable->AddVariable(pw);
+    // Use WeakODRLinkage rather than InternalLinkage so that the definition
+    // survives even if it's unused in the module; this keeps the symbol
+    // visible in the debugger.
+    sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
+                                               llvm::GlobalValue::WeakODRLinkage,
+                                               linit, name);
+    symbolTable->AddVariable(sym);
+
+    if (m->diBuilder != NULL) {
+        llvm::DIFile file;
+        llvm::DIType diType = sym->type->GetDIType(file);
+        Assert(diType.Verify());
+        // FIXME? DWARF says that this (and programIndex below) should
+        // have the DW_AT_artificial attribute.  It's not clear if this
+        // matters for anything though.
+        llvm::DIGlobalVariable var = 
+            m->diBuilder->createGlobalVariable(name, 
+                                               file,
+                                               0 /* line */,
+                                               diType,
+                                               true /* static */,
+                                               sym->storagePtr);
+        Assert(var.Verify());
+    }
 }


@@ -652,7 +674,7 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
 static void
 lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
                       SymbolTable *symbolTable) {
-    std::vector<const Type *> args;
+    llvm::SmallVector<const Type *, 8> args;
    FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
    Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);

@@ -670,21 +692,37 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,

 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
-    Symbol *pidx = 
+    Symbol *sym = 
        new Symbol("programIndex", SourcePos(), 
                   AtomicType::VaryingInt32->GetAsConstType(), SC_STATIC);

    int pi[ISPC_MAX_NVEC];
    for (int i = 0; i < g->target.vectorWidth; ++i)
        pi[i] = i;
-    pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos());
+    sym->constValue = new ConstExpr(sym->type, pi, SourcePos());

-    LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32VectorType;
+    llvm::Type *ltype = LLVMTypes::Int32VectorType;
    llvm::Constant *linit = LLVMInt32Vector(pi);
-    pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
-                                                llvm::GlobalValue::InternalLinkage, linit, 
-                                                pidx->name.c_str());
-    symbolTable->AddVariable(pidx);
+    // See comment in lDefineConstantInt() for why WeakODRLinkage is used here
+    sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
+                                               llvm::GlobalValue::WeakODRLinkage,
+                                               linit, 
+                                               sym->name.c_str());
+    symbolTable->AddVariable(sym);
+
+    if (m->diBuilder != NULL) {
+        llvm::DIFile file;
+        llvm::DIType diType = sym->type->GetDIType(file);
+        Assert(diType.Verify());
+        llvm::DIGlobalVariable var =
+            m->diBuilder->createGlobalVariable(sym->name.c_str(), 
+                                               file,
+                                               0 /* line */,
+                                               diType,
+                                               false /* static */,
+                                               sym->storagePtr);
+        Assert(var.Verify());
+    }
 }


@@ -809,6 +847,20 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                               builtins_bitcode_generic_16_length, 
                               module, symbolTable);
            break;
+        case 32:
+            extern unsigned char builtins_bitcode_generic_32[];
+            extern int builtins_bitcode_generic_32_length;
+            AddBitcodeToModule(builtins_bitcode_generic_32, 
+                               builtins_bitcode_generic_32_length, 
+                               module, symbolTable);
+            break;
+        case 64:
+            extern unsigned char builtins_bitcode_generic_64[];
+            extern int builtins_bitcode_generic_64_length;
+            AddBitcodeToModule(builtins_bitcode_generic_64, 
+                               builtins_bitcode_generic_64_length, 
+                               module, symbolTable);
+            break;
 	case 1:
            extern unsigned char builtins_bitcode_generic_1[];
            extern int builtins_bitcode_generic_1_length;
@@ -841,10 +893,12 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                       symbolTable);
    lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
                       symbolTable);
-    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
-                           symbolTable);
+    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
+                           module, symbolTable);

-    lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
+    lDefineConstantInt("__have_native_half", g->target.hasHalf, module, 
+                       symbolTable);
+    lDefineConstantInt("__have_native_transcendentals", g->target.hasTranscendentals,
                       module, symbolTable);

    if (includeStdlibISPC) {
--- a/builtins/builtins.c
+++ b/builtins/builtins.c
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -70,7 +70,7 @@ typedef int Bool;
    putchar('[');                                                       \
    for (int i = 0; i < width; ++i) {                                   \
        /* only print the value if the current lane is executing */     \
-        if (mask & (1<<i))                                              \
+        if (mask & (1ull<<i))                                           \
            printf(fmt, ((type *)ptr)[i]);                              \
        else                                                            \
            printf("((" fmt "))", ((type *)ptr)[i]);                    \
@@ -89,7 +89,7 @@ typedef int Bool;
    @param mask    Current lane mask when the print statemnt is called
    @param args    Array of pointers to the values to be printed
 */
-void __do_print(const char *format, const char *types, int width, int mask, 
+void __do_print(const char *format, const char *types, int width, uint64_t mask, 
                void **args) {
    if (mask == 0) 
        return;
@@ -113,7 +113,7 @@ void __do_print(const char *format, const char *types, int width, int mask,
                case 'B': {
                    putchar('[');
                    for (int i = 0; i < width; ++i) {
-                        if (mask & (1<<i))
+                        if (mask & (1ull << i))
                            printf("%s", ((Bool *)ptr)[i] ? "true" : "false");
                        else
                            printf("_________");
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -175,7 +175,7 @@ define <16 x float> @__min_varying_float(<16 x float>,

 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

-define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
  %floatmask = bitcast <16 x i32> %0 to <16 x float>
  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -186,7 +186,8 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {

  %v1shift = shl i32 %v1, 8
  %v = or i32 %v1shift, %v0
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -175,10 +175,11 @@ define <8 x float> @__min_varying_float(<8 x float>,

 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

-define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  %floatmask = bitcast <8 x i32> %0 to <8 x float>
  %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -186,14 +186,14 @@ define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
  ret void
 }

-define  i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
+define  i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
  %item = extractelement <1 x i32> %0, i32 0
  %v = lshr i32 %item, 31
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }


-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
 ;;
--- a/builtins/target-generic-32.ll
+++ b/builtins/target-generic-32.ll
@@ -0,0 +1,33 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+define(`WIDTH',`32')
+include(`target-generic-common.ll')
--- a/builtins/target-generic-64.ll
+++ b/builtins/target-generic-64.ll
@@ -0,0 +1,33 @@
+;;  Copyright (c) 2010-2012, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+define(`WIDTH',`64')
+include(`target-generic-common.ll')
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -39,12 +39,12 @@ reduce_equal(WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; broadcast/rotate/shuffle

-declare <WIDTH x float> @__smear_float(float) nounwind readnone
-declare <WIDTH x double> @__smear_double(double) nounwind readnone
-declare <WIDTH x i8> @__smear_i8(i8) nounwind readnone
-declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
-declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
-declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
+declare <WIDTH x float> @__smear_float(<WIDTH x float>, float) nounwind readnone
+declare <WIDTH x double> @__smear_double(<WIDTH x double>, double) nounwind readnone
+declare <WIDTH x i8> @__smear_i8(<WIDTH x i8>, i8) nounwind readnone
+declare <WIDTH x i16> @__smear_i16(<WIDTH x i16>, i16) nounwind readnone
+declare <WIDTH x i32> @__smear_i32(<WIDTH x i32>, i32) nounwind readnone
+declare <WIDTH x i64> @__smear_i64(<WIDTH x i64>, i64) nounwind readnone

 declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
 declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
@@ -201,7 +201,7 @@ declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions

-declare i32 @__movmsk(<WIDTH x i1>) nounwind readnone 
+declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone 

 declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
 declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone 
@@ -249,7 +249,16 @@ declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
 declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
                                <WIDTH x i1> %mask) nounwind 

-ifelse(LLVM_VERSION, `LLVM_3_1svn',`
+ifelse(LLVM_VERSION, `LLVM_3_0', `
+declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
+                                     <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
+                                      <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
+                                      <WIDTH x i1>) nounwind 
+declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
+                                      <WIDTH x i1> %mask) nounwind 
+', `
 define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
                                     <WIDTH x i1>) nounwind alwaysinline {
  %v = load <WIDTH x i8> * %0
@@ -281,15 +290,6 @@ define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
  store <WIDTH x i64> %v1, <WIDTH x i64> * %0
  ret void
 }
-',`
-declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>, 
-                                     <WIDTH x i1>) nounwind 
-declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>, 
-                                      <WIDTH x i1>) nounwind 
-declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>, 
-                                      <WIDTH x i1>) nounwind 
-declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
-                                      <WIDTH x i1> %mask) nounwind 
 ')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -295,7 +295,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {

 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

-define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  ; first do two 4-wide movmsk calls
  %floatmask = bitcast <8 x i32> %0 to <8 x float>
  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -309,7 +309,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  ; of the second one
  %v1s = shl i32 %v1, 4
  %v = or i32 %v0, %v1s
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }

 define <4 x float> @__vec4_add_float(<4 x float> %v0,
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -239,10 +239,11 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {

 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

-define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
   %floatmask = bitcast <4 x i32> %0 to <4 x float>
   %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
-  ret i32 %v
+  %v64 = zext i32 %v to i64  ; widen the i32 movmsk result to the i64 return type
+  ret i64 %v64
 }

 define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -237,7 +237,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>,

 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

-define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  ; first do two 4-wide movmsk calls
  %floatmask = bitcast <8 x i32> %0 to <8 x float>
  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -251,7 +251,8 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  ; of the second one
  %v1s = shl i32 %v1, 4
  %v = or i32 %v0, %v1s
-  ret i32 %v
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
 }

 define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -271,10 +271,11 @@ define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alway

 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

-define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
+define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
   %floatmask = bitcast <4 x i32> %0 to <4 x float>
   %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
-  ret i32 %v
+  %v64 = zext i32 %v to i64  ; widen the i32 movmsk result to the i64 return type
+  ret i64 %v64
 }

 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2010-2012, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,18 @@ declare i1 @__is_compile_time_constant_uniform_int32(i32)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+;; i64 lane-mask constants keyed on WIDTH; the 32- and 64-wide values are literals because m4 eval() is 32-bit signed.
+define(`ALL_ON_MASK',
+`ifelse(WIDTH, `64', `-1', 
+        WIDTH, `32', `4294967295',
+                     `eval((1<<WIDTH)-1)')')
+
+define(`MASK_HIGH_BIT_ON',
+`ifelse(WIDTH, `64', `-9223372036854775808',
+        WIDTH, `32', `2147483648',
+                     `eval(1<<(WIDTH-1))')')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; Helper macro for calling various SSE instructions for scalar values
 ;; but where the instruction takes a vector parameter.
@@ -1529,7 +1541,7 @@ declare i32 @__fast_masked_vload()
 declare i8* @ISPCAlloc(i8**, i64, i32) nounwind
 declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind
 declare void @ISPCSync(i8*) nounwind
-declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind
+declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind

 declare i1 @__is_compile_time_constant_mask(<WIDTH x MASK> %mask)
 declare i1 @__is_compile_time_constant_varying_int32(<WIDTH x i32>)
@@ -1654,6 +1666,265 @@ declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>,
 declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                 <WIDTH x i64>, <WIDTH x MASK>) nounwind

+declare float @__log_uniform_float(float) nounwind readnone
+declare <WIDTH x float> @__log_varying_float(<WIDTH x float>) nounwind readnone
+declare float @__exp_uniform_float(float) nounwind readnone
+declare <WIDTH x float> @__exp_varying_float(<WIDTH x float>) nounwind readnone
+declare float @__pow_uniform_float(float, float) nounwind readnone
+declare <WIDTH x float> @__pow_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+declare void @__use8(<WIDTH x i8>)
+declare void @__use16(<WIDTH x i16>)
+declare void @__use32(<WIDTH x i32>)
+declare void @__use64(<WIDTH x i64>)
+
+;; __keep_funcs_live is a temporary function that is removed at the end of
+;; compilation.  It calls every masked load/store, gather, and scatter
+;; function (and pseudo-function declaration) listed below once, so that
+;; each has a use and stays available to the later optimization passes
+;; instead of being removed as dead code by early DCE.  Loaded and gathered
+;; values are handed to the external __use8/16/32/64 declarations.
+
+define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
+                               <WIDTH x i32> %v32, <WIDTH x i64> %v64,
+                               <WIDTH x MASK> %mask) {
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; loads: __masked_load_* and __load_and_broadcast_* for 8/16/32/64-bit elements
+  %ml8  = call <WIDTH x i8>  @__masked_load_8(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %ml8)
+  %ml16 = call <WIDTH x i16> @__masked_load_16(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %ml16)
+  %ml32 = call <WIDTH x i32> @__masked_load_32(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %ml32)
+  %ml64 = call <WIDTH x i64> @__masked_load_64(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %ml64)
+
+  %lb8   = call <WIDTH x i8>  @__load_and_broadcast_8(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %lb8)
+  %lb16  = call <WIDTH x i16> @__load_and_broadcast_16(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %lb16)
+  %lb32  = call <WIDTH x i32> @__load_and_broadcast_32(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %lb32)
+  %lb64  = call <WIDTH x i64> @__load_and_broadcast_64(i8 * %ptr, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %lb64)
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; stores: __pseudo_masked_store_*, __masked_store_*, and __masked_store_blend_*
+  %pv8 = bitcast i8 * %ptr to <WIDTH x i8> *
+  call void @__pseudo_masked_store_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
+                                     <WIDTH x MASK> %mask)
+  %pv16 = bitcast i8 * %ptr to <WIDTH x i16> *
+  call void @__pseudo_masked_store_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
+                                      <WIDTH x MASK> %mask)
+  %pv32 = bitcast i8 * %ptr to <WIDTH x i32> *
+  call void @__pseudo_masked_store_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
+                                      <WIDTH x MASK> %mask)
+  %pv64 = bitcast i8 * %ptr to <WIDTH x i64> *
+  call void @__pseudo_masked_store_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
+                                      <WIDTH x MASK> %mask)
+
+  call void @__masked_store_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__masked_store_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__masked_store_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__masked_store_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__masked_store_blend_8(<WIDTH x i8> * %pv8, <WIDTH x i8> %v8,
+                                    <WIDTH x MASK> %mask)
+  call void @__masked_store_blend_16(<WIDTH x i16> * %pv16, <WIDTH x i16> %v16,
+                                     <WIDTH x MASK> %mask)
+  call void @__masked_store_blend_32(<WIDTH x i32> * %pv32, <WIDTH x i32> %v32,
+                                     <WIDTH x MASK> %mask)
+  call void @__masked_store_blend_64(<WIDTH x i64> * %pv64, <WIDTH x i64> %v64,
+                                     <WIDTH x MASK> %mask)
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; gathers: pseudo and non-pseudo variants, with i32 and i64 vectors, plain and base+offsets
+
+  %pg32_8 = call <WIDTH x i8>  @__pseudo_gather32_8(<WIDTH x i32> %v32,
+                                                    <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pg32_8)
+  %pg32_16 = call <WIDTH x i16>  @__pseudo_gather32_16(<WIDTH x i32> %v32,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pg32_16)
+  %pg32_32 = call <WIDTH x i32>  @__pseudo_gather32_32(<WIDTH x i32> %v32,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pg32_32)
+  %pg32_64 = call <WIDTH x i64>  @__pseudo_gather32_64(<WIDTH x i32> %v32,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pg32_64)
+
+  %pg64_8 = call <WIDTH x i8>  @__pseudo_gather64_8(<WIDTH x i64> %v64,
+                                                    <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pg64_8)
+  %pg64_16 = call <WIDTH x i16>  @__pseudo_gather64_16(<WIDTH x i64> %v64,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pg64_16)
+  %pg64_32 = call <WIDTH x i32>  @__pseudo_gather64_32(<WIDTH x i64> %v64,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pg64_32)
+  %pg64_64 = call <WIDTH x i64>  @__pseudo_gather64_64(<WIDTH x i64> %v64,
+                                                       <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pg64_64)
+
+  %g32_8 = call <WIDTH x i8>  @__gather32_i8(<WIDTH x i32> %v32,
+                                            <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %g32_8)
+  %g32_16 = call <WIDTH x i16>  @__gather32_i16(<WIDTH x i32> %v32,
+                                               <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %g32_16)
+  %g32_32 = call <WIDTH x i32>  @__gather32_i32(<WIDTH x i32> %v32,
+                                               <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %g32_32)
+  %g32_64 = call <WIDTH x i64>  @__gather32_i64(<WIDTH x i32> %v32,
+                                               <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %g32_64)
+
+  %g64_8 = call <WIDTH x i8>  @__gather64_i8(<WIDTH x i64> %v64,
+                                            <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %g64_8)
+  %g64_16 = call <WIDTH x i16>  @__gather64_i16(<WIDTH x i64> %v64,
+                                               <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %g64_16)
+  %g64_32 = call <WIDTH x i32>  @__gather64_i32(<WIDTH x i64> %v64,
+                                               <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %g64_32)
+  %g64_64 = call <WIDTH x i64>  @__gather64_i64(<WIDTH x i64> %v64,
+                                                <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %g64_64)
+
+  %pgbo32_8 = call <WIDTH x i8>
+       @__pseudo_gather_base_offsets32_8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                         <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pgbo32_8)
+  %pgbo32_16 = call <WIDTH x i16>
+       @__pseudo_gather_base_offsets32_16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pgbo32_16)
+  %pgbo32_32 = call <WIDTH x i32>
+       @__pseudo_gather_base_offsets32_32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pgbo32_32)
+  %pgbo32_64 = call <WIDTH x i64>
+       @__pseudo_gather_base_offsets32_64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                          <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pgbo32_64)
+
+  %gbo32_8 = call <WIDTH x i8>
+       @__gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                  <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %gbo32_8)
+  %gbo32_16 = call <WIDTH x i16>
+       @__gather_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                   <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %gbo32_16)
+  %gbo32_32 = call <WIDTH x i32>
+       @__gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                   <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %gbo32_32)
+  %gbo32_64 = call <WIDTH x i64>
+       @__gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
+                                   <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %gbo32_64)
+
+
+  %pgbo64_8 = call <WIDTH x i8>
+       @__pseudo_gather_base_offsets64_8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                         <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %pgbo64_8)
+  %pgbo64_16 = call <WIDTH x i16>
+       @__pseudo_gather_base_offsets64_16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %pgbo64_16)
+  %pgbo64_32 = call <WIDTH x i32>
+       @__pseudo_gather_base_offsets64_32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %pgbo64_32)
+  %pgbo64_64 = call <WIDTH x i64>
+       @__pseudo_gather_base_offsets64_64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                          <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %pgbo64_64)
+
+  %gbo64_8 = call <WIDTH x i8>
+       @__gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                  <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use8(<WIDTH x i8> %gbo64_8)
+  %gbo64_16 = call <WIDTH x i16>
+       @__gather_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                   <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use16(<WIDTH x i16> %gbo64_16)
+  %gbo64_32 = call <WIDTH x i32>
+       @__gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                   <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use32(<WIDTH x i32> %gbo64_32)
+  %gbo64_64 = call <WIDTH x i64>
+       @__gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
+                                   <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+  call void @__use64(<WIDTH x i64> %gbo64_64)
+
+  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+  ;; scatters: pseudo and non-pseudo variants, with i32 and i64 vectors, plain and base+offsets
+
+  call void @__pseudo_scatter32_8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter32_16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter32_32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter32_64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_scatter64_8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter64_16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter64_32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter64_64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter32_i8(<WIDTH x i32> %v32, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter32_i16(<WIDTH x i32> %v32, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter32_i32(<WIDTH x i32> %v32, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter32_i64(<WIDTH x i32> %v32, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter64_i8(<WIDTH x i64> %v64, <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter64_i16(<WIDTH x i64> %v64, <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter64_i32(<WIDTH x i64> %v64, <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_scatter_base_offsets32_8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets32_16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets32_32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets32_64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__pseudo_scatter_base_offsets64_8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets64_16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets64_32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__pseudo_scatter_base_offsets64_64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
+                                                 <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  call void @__scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                <WIDTH x i8> %v8, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i16> %v16, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i32> %v32, <WIDTH x MASK> %mask)
+  call void @__scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
+                                                 <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
+
+  ret void
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops

@@ -1837,12 +2108,12 @@ ok:


 define void @__do_assert_varying(i8 *%str, <WIDTH x MASK> %test,
-                                          <WIDTH x MASK> %mask) {
+                                 <WIDTH x MASK> %mask) {
  %nottest = xor <WIDTH x MASK> %test,
                 < forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 >
  %nottest_and_mask = and <WIDTH x MASK> %nottest, %mask
-  %mm = call i32 @__movmsk(<WIDTH x MASK> %nottest_and_mask)
-  %all_ok = icmp eq i32 %mm, 0
+  %mm = call i64 @__movmsk(<WIDTH x MASK> %nottest_and_mask)
+  %all_ok = icmp eq i64 %mm, 0
  br i1 %all_ok, label %ok, label %fail

 fail:
@@ -2244,14 +2515,18 @@ define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x MASK> %mask) nounwind alwa
 ;; $4: alignment for elements of type $2 (4, 8, ...)

 define(`masked_load', `
-define <$1 x $2> @__masked_load_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
+define <$1 x $2> @__masked_load_$3(i8 *, <$1 x MASK> %mask) nounwind alwaysinline {
 entry:
-  %mm = call i32 @__movmsk(<$1 x i32> %mask)
+  %mm = call i64 @__movmsk(<$1 x MASK> %mask)
  
  ; if the first lane and the last lane are on, then it is safe to do a vector load
  ; of the whole thing--what the lanes in the middle want turns out to not matter...
-  %mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
-  %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
+  %mm_and_low = and i64 %mm, 1
+  %mm_and_high = and i64 %mm, MASK_HIGH_BIT_ON
+  %mm_and_high_shift = lshr i64 %mm_and_high, eval(WIDTH-1)
+  %mm_and_low_i1 = trunc i64 %mm_and_low to i1
+  %mm_and_high_shift_i1 = trunc i64 %mm_and_high_shift to i1
+  %can_vload = and i1 %mm_and_low_i1, %mm_and_high_shift_i1

  %fast32 = call i32 @__fast_masked_vload()
  %fast_i1 = trunc i32 %fast32 to i1
@@ -2270,9 +2545,10 @@ load:
 loop:
  ; loop over the lanes and see if each one is on...
  %lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ]
-  %lanemask = shl i32 1, %lane
-  %mask_and = and i32 %mm, %lanemask
-  %do_lane = icmp ne i32 %mask_and, 0
+  %lane64 = zext i32 %lane to i64
+  %lanemask = shl i64 1, %lane64
+  %mask_and = and i64 %mm, %lanemask
+  %do_lane = icmp ne i64 %mask_and, 0
  br i1 %do_lane, label %load_lane, label %lane_done

 load_lane:
@@ -2484,12 +2760,12 @@ define(`packed_load_and_store', `
 define i32 @__packed_load_active(i32 * %startptr, <WIDTH x i32> * %val_ptr,
                                 <WIDTH x i32> %full_mask) nounwind alwaysinline {
 entry:
-  %mask = call i32 @__movmsk(<WIDTH x i32> %full_mask)
+  %mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
  %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
  br i1 %mask_known, label %known_mask, label %unknown_mask

 known_mask:
-  %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1)
+  %allon = icmp eq i64 %mask, ALL_ON_MASK
  br i1 %allon, label %all_on, label %unknown_mask

 all_on:
@@ -2505,12 +2781,12 @@ unknown_mask:

 loop:
  %lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
-  %lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
+  %lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
  %offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]

  ; is the current lane on?
-  %and = and i32 %mask, %lanemask
-  %do_load = icmp eq i32 %and, %lanemask
+  %and = and i64 %mask, %lanemask
+  %do_load = icmp eq i64 %and, %lanemask
  br i1 %do_load, label %load, label %loopend 

 load:
@@ -2525,7 +2801,7 @@ load:
 loopend:
  %nextoffset = phi i32 [ %offset1, %load ], [ %offset, %loop ]
  %nextlane = add i32 %lane, 1
-  %nextlanemask = mul i32 %lanemask, 2
+  %nextlanemask = mul i64 %lanemask, 2

  ; are we done yet?
  %test = icmp ne i32 %nextlane, WIDTH
@@ -2536,14 +2812,14 @@ done:
 }

 define i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
-                                  <WIDTH x i32> %full_mask) nounwind alwaysinline {
+                                   <WIDTH x i32> %full_mask) nounwind alwaysinline {
 entry:
-  %mask = call i32 @__movmsk(<WIDTH x i32> %full_mask)
+  %mask = call i64 @__movmsk(<WIDTH x i32> %full_mask)
  %mask_known = call i1 @__is_compile_time_constant_mask(<WIDTH x i32> %full_mask)
  br i1 %mask_known, label %known_mask, label %unknown_mask

 known_mask:
-  %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1)
+  %allon = icmp eq i64 %mask, ALL_ON_MASK
  br i1 %allon, label %all_on, label %unknown_mask

 all_on:
@@ -2556,12 +2832,12 @@ unknown_mask:

 loop:
  %lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
-  %lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
+  %lanemask = phi i64 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
  %offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]

  ; is the current lane on?
-  %and = and i32 %mask, %lanemask
-  %do_store = icmp eq i32 %and, %lanemask
+  %and = and i64 %mask, %lanemask
+  %do_store = icmp eq i64 %and, %lanemask
  br i1 %do_store, label %store, label %loopend 

 store:
@@ -2574,7 +2850,7 @@ store:
 loopend:
  %nextoffset = phi i32 [ %offset1, %store ], [ %offset, %loop ]
  %nextlane = add i32 %lane, 1
-  %nextlanemask = mul i32 %lanemask, 2
+  %nextlanemask = mul i64 %lanemask, 2

  ; are we done yet?
  %test = icmp ne i32 %nextlane, WIDTH
@@ -2598,14 +2874,15 @@ define(`reduce_equal_aux', `
 define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue,
                             <$1 x MASK> %mask) nounwind alwaysinline {
 entry:
-   %mm = call i32 @__movmsk(<$1 x MASK> %mask)
-   %allon = icmp eq i32 %mm, eval((1<<$1)-1)
+   %mm = call i64 @__movmsk(<$1 x MASK> %mask)
+   %allon = icmp eq i64 %mm, ALL_ON_MASK
   br i1 %allon, label %check_neighbors, label %domixed

 domixed:
  ; First, figure out which lane is the first active one
-  %first = call i32 @llvm.cttz.i32(i32 %mm)
-  %baseval = extractelement <$1 x $2> %v, i32 %first
+  %first = call i64 @llvm.cttz.i64(i64 %mm)
+  %first32 = trunc i64 %first to i32
+  %baseval = extractelement <$1 x $2> %v, i32 %first32
  %basev1 = bitcast $2 %baseval to <1 x $2>
  ; get a vector that is that value smeared across all elements
  %basesmear = shufflevector <1 x $2> %basev1, <1 x $2> undef,
@@ -2636,9 +2913,9 @@ check_neighbors:
  %eq = $5 eq <$1 x $2> %vec, %vr
  ifelse(MASK,i32, `
    %eq32 = sext <$1 x i1> %eq to <$1 x i32>
-    %eqmm = call i32 @__movmsk(<$1 x i32> %eq32)', `
-    %eqmm = call i32 @__movmsk(<$1 x MASK> %eq)')
-  %alleq = icmp eq i32 %eqmm, eval((1<<$1)-1)
+    %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', `
+    %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)')
+  %alleq = icmp eq i64 %eqmm, ALL_ON_MASK
  br i1 %alleq, label %all_equal, label %not_all_equal
  ', `
  ; But for 64-bit elements, it turns out to be more efficient to just
@@ -2751,14 +3028,14 @@ define(`per_lane', `
  br label %pl_entry

 pl_entry:
-  %pl_mask = call i32 @__movmsk($2)
+  %pl_mask = call i64 @__movmsk($2)
  %pl_mask_known = call i1 @__is_compile_time_constant_mask($2)
  br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask

 pl_known_mask:
  ;; the mask is known at compile time; see if it is something we can
  ;; handle more efficiently
-  %pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
+  %pl_is_allon = icmp eq i64 %pl_mask, ALL_ON_MASK
  br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask

 pl_all_on:
@@ -2780,11 +3057,11 @@ pl_unknown_mask:
 pl_loop:
  ;; Loop over each lane and see if we want to do the work for this lane
  %pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ]
-  %pl_lanemask = phi i32 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ]
+  %pl_lanemask = phi i64 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ]

  ; is the current lane on?  if so, goto do work, otherwise to end of loop
-  %pl_and = and i32 %pl_mask, %pl_lanemask
-  %pl_doit = icmp eq i32 %pl_and, %pl_lanemask
+  %pl_and = and i64 %pl_mask, %pl_lanemask
+  %pl_doit = icmp eq i64 %pl_and, %pl_lanemask
  br i1 %pl_doit, label %pl_dolane, label %pl_loopend 

 pl_dolane:
@@ -2795,7 +3072,7 @@ pl_dolane:

 pl_loopend:
  %pl_nextlane = add i32 %pl_lane, 1
-  %pl_nextlanemask = mul i32 %pl_lanemask, 2
+  %pl_nextlanemask = mul i64 %pl_lanemask, 2

  ; are we done yet?
  %pl_test = icmp ne i32 %pl_nextlane, $1
@@ -2880,11 +3157,11 @@ define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32
  %newDelta = load <$1 x i32> * %deltaPtr

  %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
-                                            i32 %offset_scale, <$1 x i32> %offset_delta,
+                                            i32 %offset_scale, <$1 x i32> %newDelta,
                                            <$1 x $2> undef, i32 0)
  forloop(lane, 1, eval($1-1), 
          `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, 
-                                <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %offset_delta,
+                                <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %newDelta,
                                <$1 x $2> %retPREV, i32 LANE)
                    ', `LANE', lane), `PREV', eval(lane-1))')
  ret <$1 x $2> %ret`'eval($1-1)
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -12,9 +12,7 @@
 //
 //===----------------------------------------------------------------------===//

-#ifdef LLVM_2_9
-#warning "The C++ backend isn't supported when building with LLVM 2.9"
-#else
+#include <stdio.h>

 #ifndef _MSC_VER
 #include <inttypes.h>
@@ -339,8 +337,6 @@ namespace {
                           bool IsVolatile, unsigned Alignment);

  private :
-    std::string InterpretASMConstraint(InlineAsm::ConstraintInfo& c);
-
    void lowerIntrinsics(Function &F);
    /// Prints the definition of the intrinsic function F. Supports the 
    /// intrinsics which need to be explicitly defined in the CBackend.
@@ -363,7 +359,7 @@ namespace {
    bool printConstExprCast(const ConstantExpr *CE, bool Static);
    void printConstantArray(ConstantArray *CPA, bool Static);
    void printConstantVector(ConstantVector *CV, bool Static);
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
    void printConstantDataSequential(ConstantDataSequential *CDS, bool Static);
 #endif

@@ -440,11 +436,11 @@ namespace {
    void visitInvokeInst(InvokeInst &I) {
      llvm_unreachable("Lowerinvoke pass didn't work!");
    }
-#if !defined(LLVM_3_1) && !defined(LLVM_3_1svn)
+#ifdef LLVM_3_0
    void visitUnwindInst(UnwindInst &I) {
      llvm_unreachable("Lowerinvoke pass didn't work!");
    }
-#endif // !LLVM_3_1svn
+#endif // LLVM_3_0
    void visitResumeInst(ResumeInst &I) {
      llvm_unreachable("DwarfEHPrepare pass didn't work!");
    }
@@ -804,7 +800,7 @@ raw_ostream &CWriter::printType(raw_ostream &Out, Type *Ty,
 }

 void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
-#ifndef LLVM_3_1svn
+#ifdef LLVM_3_0
  Type *ETy = CPA->getType()->getElementType();
  // MMP: this looks like a bug: both sides of the || are the same
  bool isString = ETy == Type::getInt8Ty(CPA->getContext());
@@ -857,7 +853,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
    Out << "\"";
    return;
  }
-#endif // !LLVM_3_1
+#endif // LLVM_3_0

  printConstant(cast<Constant>(CPA->getOperand(0)), Static);
  for (unsigned i = 1, e = CPA->getNumOperands(); i != e; ++i) {
@@ -874,7 +870,7 @@ void CWriter::printConstantVector(ConstantVector *CP, bool Static) {
  }
 }

-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
 void CWriter::printConstantDataSequential(ConstantDataSequential *CDS,
                                          bool Static) {
  // As a special case, print the array as a string if it is an array of
@@ -931,7 +927,21 @@ void CWriter::printConstantDataSequential(ConstantDataSequential *CDS,
    }
  }
 }
-#endif // LLVM_3_1svn
+#endif // !LLVM_3_0
+
+#ifndef LLVM_3_0
+// Formats an IEEE single/double APFloat as text; other formats yield an error string.
+static inline std::string ftostr(const APFloat& V) {
+  const bool isDouble = (&V.getSemantics() == &APFloat::IEEEdouble);
+  if (!isDouble && &V.getSemantics() != &APFloat::IEEEsingle)
+    return "<unknown format in ftostr>"; // error
+  // Single-precision values are widened so that both cases print a double.
+  const double Val = isDouble ? V.convertToDouble() : (double)V.convertToFloat();
+  std::string Text;
+  raw_string_ostream(Text) << Val;  // the temporary stream is gone before Text is returned
+  return Text;
+}
+#endif // !LLVM_3_0

 // isFPCSafeToPrint - Returns true if we may assume that CFP may be written out
 // textually as a double (rather than as a reference to a stack-allocated
@@ -1084,6 +1094,26 @@ bool CWriter::printCast(unsigned opc, Type *SrcTy, Type *DstTy) {
  return false;
 }

+
+// Returns the name of the __smear_* function for the given type, or NULL if there is none.
+// FIXME: generalize this/make it not so hard-coded?
+static const char *lGetSmearFunc(Type *matchType) {
+    if (matchType->isFloatTy())  return "__smear_float";
+    if (matchType->isDoubleTy()) return "__smear_double";
+    if (!matchType->isIntegerTy())
+        return NULL;
+
+    switch (cast<IntegerType>(matchType)->getBitWidth()) {
+    case 1:  return "__smear_i1";
+    case 8:  return "__smear_i8";
+    case 16: return "__smear_i16";
+    case 32: return "__smear_i32";
+    case 64: return "__smear_i64";
+    default: return NULL;  // no smear variant for other integer widths
+    }
+}
+
+
 // printConstant - The LLVM Constant to C Constant converter.
 void CWriter::printConstant(Constant *CPV, bool Static) {
  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) {
@@ -1400,11 +1430,11 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
    }
    if (ConstantArray *CA = dyn_cast<ConstantArray>(CPV)) {
      printConstantArray(CA, Static);
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
    } else if (ConstantDataSequential *CDS = 
               dyn_cast<ConstantDataSequential>(CPV)) {
      printConstantDataSequential(CDS, Static);
-#endif // LLVM_3_1svn
+#endif // !LLVM_3_0
    } else {
      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
      if (AT->getNumElements()) {
@@ -1423,30 +1453,68 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
        Out << ")";
    break;
  }
-  case Type::VectorTyID:
-    printType(Out, CPV->getType());
-    Out << "(";
+  case Type::VectorTyID: {
+    VectorType *VT = dyn_cast<VectorType>(CPV->getType());
+    const char *smearFunc = lGetSmearFunc(VT->getElementType());

-    if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) {
-      printConstantVector(CV, Static);
-#ifdef LLVM_3_1svn
-    } else if (ConstantDataSequential *CDS = 
-               dyn_cast<ConstantDataSequential>(CPV)) {
-      printConstantDataSequential(CDS, Static);
-#endif
-    } else {
-      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
-      VectorType *VT = cast<VectorType>(CPV->getType());
+    if (isa<ConstantAggregateZero>(CPV)) {
+        assert(smearFunc != NULL);
+
+        Constant *CZ = Constant::getNullValue(VT->getElementType());
+        Out << smearFunc << "(";
+        printType(Out, VT);
+        Out << "(), ";
+        printConstant(CZ, Static);
+        Out << ")";
+    }
+    else if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) {
+      llvm::Constant *splatValue = CV->getSplatValue();
+      if (splatValue != NULL && smearFunc != NULL) {
+        Out << smearFunc << "(";
+        printType(Out, VT);
+        Out << "(), ";
+        printConstant(splatValue, Static);
+        Out << ")";
+      }
+      else {
+        printType(Out, CPV->getType());
+        Out << "(";
+        printConstantVector(CV, Static);
+        Out << ")";
+      }
+    }
+#ifndef LLVM_3_0
+    else if (ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(CPV)) {
+      llvm::Constant *splatValue = CDV->getSplatValue();
+      if (splatValue != NULL && smearFunc != NULL) {
+        Out << smearFunc << "(";
+        printType(Out, VT);
+        Out << "(), ";
+        printConstant(splatValue, Static);
+        Out << ")";
+      }
+      else {
+        printType(Out, CPV->getType());
+        Out << "(";
+        printConstantDataSequential(CDV, Static);
+        Out << ")";
+      }
+    }
+#endif // !LLVM_3_0
+    else {
+      assert(isa<UndefValue>(CPV));
      Constant *CZ = Constant::getNullValue(VT->getElementType());
+      printType(Out, CPV->getType());
+      Out << "(";
      printConstant(CZ, Static);
      for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) {
        Out << ", ";
        printConstant(CZ, Static);
      }
+      Out << ")";
    }
-    Out << ")";
    break;
-
+  }
  case Type::StructTyID:
    if (!Static) {
      // call init func...
@@ -1639,7 +1707,12 @@ std::string CWriter::GetValueName(const Value *Operand) {
      VarName += ch;
  }

-  return VarName + "_llvm_cbe";
+  if (isa<BasicBlock>(Operand))
+    VarName += "_label";
+  else
+    VarName += "_";
+
+  return VarName;
 }

 /// writeInstComputationInline - Emit the computation for the specified
@@ -2071,69 +2144,18 @@ bool CWriter::doInitialization(Module &M) {

  Out << "#include \"" << includeName << "\"\n";

-  generateCompilerSpecificCode(Out, TD);
-
-  // Function declarations
-  Out << "\n/* Function Declarations */\n";
+  Out << "\n/* Basic Library Function Declarations */\n";
  Out << "extern \"C\" {\n";
  Out << "int puts(unsigned char *);\n";
  Out << "unsigned int putchar(unsigned int);\n";
  Out << "int fflush(void *);\n";
  Out << "int printf(const unsigned char *, ...);\n";
  Out << "uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t );\n";
+  Out << "uint8_t *memset(uint8_t *, uint8_t, uint64_t );\n";
+  Out << "void memset_pattern16(void *, const void *, uint64_t );\n";
+  Out << "}\n\n";

-  // Store the intrinsics which will be declared/defined below.
-  SmallVector<const Function*, 8> intrinsicsToDefine;
-
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
-    // Don't print declarations for intrinsic functions.
-    // Store the used intrinsics, which need to be explicitly defined.
-    if (I->isIntrinsic()) {
-      switch (I->getIntrinsicID()) {
-        default:
-          break;
-        case Intrinsic::uadd_with_overflow:
-        case Intrinsic::sadd_with_overflow:
-          intrinsicsToDefine.push_back(I);
-          break;
-      }
-      continue;
-    }
-
-    if (I->getName() == "setjmp" || I->getName() == "abort" ||
-        I->getName() == "longjmp" || I->getName() == "_setjmp" ||
-        I->getName() == "memset" || I->getName() == "memset_pattern16" ||
-        I->getName() == "puts" ||
-        I->getName() == "printf" || I->getName() == "putchar" ||
-        I->getName() == "fflush" || I->getName() == "malloc" ||
-        I->getName() == "free")
-      continue;
-
-    // Don't redeclare ispc's own intrinsics
-    std::string name = I->getName();
-    if (name.size() > 2 && name[0] == '_' && name[1] == '_')
-        continue;
-
-    if (I->hasExternalWeakLinkage())
-      Out << "extern ";
-    printFunctionSignature(I, true);
-    if (I->hasWeakLinkage() || I->hasLinkOnceLinkage())
-      Out << " __ATTRIBUTE_WEAK__";
-    if (I->hasExternalWeakLinkage())
-      Out << " __EXTERNAL_WEAK__";
-    if (StaticCtors.count(I))
-      Out << " __ATTRIBUTE_CTOR__";
-    if (StaticDtors.count(I))
-      Out << " __ATTRIBUTE_DTOR__";
-    if (I->hasHiddenVisibility())
-      Out << " __HIDDEN__";
-
-    if (I->hasName() && I->getName()[0] == 1)
-      Out << " LLVM_ASM(\"" << I->getName().substr(1) << "\")";
-
-    Out << ";\n";
-  }
-  Out << "}\n";
+  generateCompilerSpecificCode(Out, TD);

  // Provide a definition for `bool' if not compiling with a C++ compiler.
  Out << "\n"
@@ -2240,6 +2262,106 @@ bool CWriter::doInitialization(Module &M) {
      }
  }

+  // Function declarations
+  Out << "\n/* Function Declarations */\n";
+  Out << "extern \"C\" {\n";
+
+  // Store the intrinsics which will be declared/defined below.
+  SmallVector<const Function*, 8> intrinsicsToDefine;
+
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    // Don't print declarations for intrinsic functions.
+    // Store the used intrinsics, which need to be explicitly defined.
+    if (I->isIntrinsic()) {
+      switch (I->getIntrinsicID()) {
+        default:
+          break;
+        case Intrinsic::uadd_with_overflow:
+        case Intrinsic::sadd_with_overflow:
+          intrinsicsToDefine.push_back(I);
+          break;
+      }
+      continue;
+    }
+
+    if (I->getName() == "setjmp" || I->getName() == "abort" ||
+        I->getName() == "longjmp" || I->getName() == "_setjmp" ||
+        I->getName() == "memset" || I->getName() == "memset_pattern16" ||
+        I->getName() == "puts" ||
+        I->getName() == "printf" || I->getName() == "putchar" ||
+        I->getName() == "fflush" || I->getName() == "malloc" ||
+        I->getName() == "free")
+      continue;
+
+    // Don't redeclare ispc's own intrinsics
+    std::string name = I->getName();
+    if (name.size() > 2 && name[0] == '_' && name[1] == '_')
+        continue;
+
+    if (I->hasExternalWeakLinkage())
+      Out << "extern ";
+    printFunctionSignature(I, true);
+    if (I->hasWeakLinkage() || I->hasLinkOnceLinkage())
+      Out << " __ATTRIBUTE_WEAK__";
+    if (I->hasExternalWeakLinkage())
+      Out << " __EXTERNAL_WEAK__";
+    if (StaticCtors.count(I))
+      Out << " __ATTRIBUTE_CTOR__";
+    if (StaticDtors.count(I))
+      Out << " __ATTRIBUTE_DTOR__";
+    if (I->hasHiddenVisibility())
+      Out << " __HIDDEN__";
+
+    if (I->hasName() && I->getName()[0] == 1)
+      Out << " LLVM_ASM(\"" << I->getName().substr(1) << "\")";
+
+    Out << ";\n";
+  }
+  Out << "}\n\n";
+
+  if (!M.empty())
+    Out << "\n\n/* Function Bodies */\n";
+
+  // Emit some helper functions for dealing with FCMP instruction's
+  // predicates
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ord(A X, B Y) { ";
+  Out << "return X == X && Y == Y; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_uno(A X, B Y) { ";
+  Out << "return X != X || Y != Y; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ueq(A X, B Y) { ";
+  Out << "return X == Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_une(A X, B Y) { ";
+  Out << "return X != Y; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ult(A X, B Y) { ";
+  Out << "return X <  Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ugt(A X, B Y) { ";
+  Out << "return X >  Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ule(A X, B Y) { ";
+  Out << "return X <= Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_uge(A X, B Y) { ";
+  Out << "return X >= Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_oeq(A X, B Y) { ";
+  Out << "return X == Y ; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_one(A X, B Y) { ";
+  Out << "return X != Y && llvm_fcmp_ord(X, Y); }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_olt(A X, B Y) { ";
+  Out << "return X <  Y ; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ogt(A X, B Y) { ";
+  Out << "return X >  Y ; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_ole(A X, B Y) { ";
+  Out << "return X <= Y ; }\n";
+  Out << "template <typename A, typename B> static inline int llvm_fcmp_oge(A X, B Y) { ";
+  Out << "return X >= Y ; }\n";
+  Out << "template <typename A> A *Memset(A *ptr, int count, size_t len) { ";
+  Out << "return (A *)memset(ptr, count, len); }\n";
+
+  // Emit definitions of the intrinsics.
+  for (SmallVector<const Function*, 8>::const_iterator
+       I = intrinsicsToDefine.begin(),
+       E = intrinsicsToDefine.end(); I != E; ++I) {
+    printIntrinsicDefinition(**I, Out);
+  }
+
  // Output the global variable definitions and contents...
  if (!M.global_empty()) {
    Out << "\n\n/* Global Variable Definitions and Initialization */\n";
@@ -2303,49 +2425,6 @@ bool CWriter::doInitialization(Module &M) {
      }
  }

-  if (!M.empty())
-    Out << "\n\n/* Function Bodies */\n";
-
-  // Emit some helper functions for dealing with FCMP instruction's
-  // predicates
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ord(A X, B Y) { ";
-  Out << "return X == X && Y == Y; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_uno(A X, B Y) { ";
-  Out << "return X != X || Y != Y; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ueq(A X, B Y) { ";
-  Out << "return X == Y || llvm_fcmp_uno(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_une(A X, B Y) { ";
-  Out << "return X != Y; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ult(A X, B Y) { ";
-  Out << "return X <  Y || llvm_fcmp_uno(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ugt(A X, B Y) { ";
-  Out << "return X >  Y || llvm_fcmp_uno(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ule(A X, B Y) { ";
-  Out << "return X <= Y || llvm_fcmp_uno(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_uge(A X, B Y) { ";
-  Out << "return X >= Y || llvm_fcmp_uno(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_oeq(A X, B Y) { ";
-  Out << "return X == Y ; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_one(A X, B Y) { ";
-  Out << "return X != Y && llvm_fcmp_ord(X, Y); }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_olt(A X, B Y) { ";
-  Out << "return X <  Y ; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ogt(A X, B Y) { ";
-  Out << "return X >  Y ; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_ole(A X, B Y) { ";
-  Out << "return X <= Y ; }\n";
-  Out << "template <typename A, typename B> static inline int llvm_fcmp_oge(A X, B Y) { ";
-  Out << "return X >= Y ; }\n";
-  Out << "template <typename A> A *Memset(A *ptr, int count, size_t len) { ";
-  Out << "return (A *)memset(ptr, count, len); }\n";
-
-  // Emit definitions of the intrinsics.
-  for (SmallVector<const Function*, 8>::const_iterator
-       I = intrinsicsToDefine.begin(),
-       E = intrinsicsToDefine.end(); I != E; ++I) {
-    printIntrinsicDefinition(**I, Out);
-  }
-
  return false;
 }

@@ -2823,17 +2902,17 @@ void CWriter::visitSwitchInst(SwitchInst &SI) {
  printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
  Out << ";\n";

-#ifdef LLVM_3_1svn
-  for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
-    ConstantInt* CaseVal = i.getCaseValue();
-    BasicBlock* Succ = i.getCaseSuccessor();
-#else
+#ifdef LLVM_3_0
  // Skip the first item since that's the default case.
  unsigned NumCases = SI.getNumCases();
  for (unsigned i = 1; i < NumCases; ++i) {
    ConstantInt* CaseVal = SI.getCaseValue(i);
    BasicBlock* Succ = SI.getSuccessor(i);
-#endif // LLVM_3_1svn
+#else
+  for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
+    ConstantInt* CaseVal = i.getCaseValue();
+    BasicBlock* Succ = i.getCaseSuccessor();
+#endif // !LLVM_3_0
    Out << "  case ";
    writeOperand(CaseVal);
    Out << ":\n";
@@ -3401,6 +3480,7 @@ void CWriter::lowerIntrinsics(Function &F) {
          case Intrinsic::ppc_altivec_lvsl:
          case Intrinsic::uadd_with_overflow:
          case Intrinsic::sadd_with_overflow:
+          case Intrinsic::trap:
              // We directly implement these intrinsics
            break;
          default:
@@ -3568,7 +3648,9 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID,
    // If this is an intrinsic that directly corresponds to a GCC
    // builtin, we emit it here.
    const char *BuiltinName = "";
+#ifdef LLVM_3_0
    Function *F = I.getCalledFunction();
+#endif // LLVM_3_0
 #define GET_GCC_BUILTIN_NAME
 #include "llvm/Intrinsics.gen"
 #undef GET_GCC_BUILTIN_NAME
@@ -3711,184 +3793,17 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID,
    writeOperand(I.getArgOperand(1));
    Out << ")";
    return true;
+  case Intrinsic::trap:
+    Out << "abort()";
+    return true;
  }
 }

-//This converts the llvm constraint string to something gcc is expecting.
-//TODO: work out platform independent constraints and factor those out
-//      of the per target tables
-//      handle multiple constraint codes
-std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
-  assert(c.Codes.size() == 1 && "Too many asm constraint codes to handle");
-
-  // Grab the translation table from MCAsmInfo if it exists.
-  const MCAsmInfo *TargetAsm;
-  std::string Triple = TheModule->getTargetTriple();
-  if (Triple.empty())
-#if defined(LLVM_3_1) || defined(LLVM_3_1svn)
-    Triple = llvm::sys::getDefaultTargetTriple();
-#else
-    Triple = llvm::sys::getHostTriple();
-#endif
-
-  std::string E;
-  if (const llvm::Target *Match = TargetRegistry::lookupTarget(Triple, E))
-    TargetAsm = Match->createMCAsmInfo(Triple);
-  else
-    return c.Codes[0];
-
-  const char *const *table = TargetAsm->getAsmCBE();
-
-  // Search the translation table if it exists.
-  for (int i = 0; table && table[i]; i += 2)
-    if (c.Codes[0] == table[i]) {
-      delete TargetAsm;
-      return table[i+1];
-    }
-
-  // Default is identity.
-  delete TargetAsm;
-  return c.Codes[0];
-}
-
-//TODO: import logic from AsmPrinter.cpp
-static std::string gccifyAsm(std::string asmstr) {
-  for (std::string::size_type i = 0; i != asmstr.size(); ++i)
-    if (asmstr[i] == '\n')
-      asmstr.replace(i, 1, "\\n");
-    else if (asmstr[i] == '\t')
-      asmstr.replace(i, 1, "\\t");
-    else if (asmstr[i] == '$') {
-      if (asmstr[i + 1] == '{') {
-        std::string::size_type a = asmstr.find_first_of(':', i + 1);
-        std::string::size_type b = asmstr.find_first_of('}', i + 1);
-        std::string n = "%" +
-          asmstr.substr(a + 1, b - a - 1) +
-          asmstr.substr(i + 2, a - i - 2);
-        asmstr.replace(i, b - i + 1, n);
-        i += n.size() - 1;
-      } else
-        asmstr.replace(i, 1, "%");
-    }
-    else if (asmstr[i] == '%')//grr
-      { asmstr.replace(i, 1, "%%"); ++i;}
-
-  return asmstr;
-}

 //TODO: assumptions about what consume arguments from the call are likely wrong
 //      handle communitivity
 void CWriter::visitInlineAsm(CallInst &CI) {
-  InlineAsm* as = cast<InlineAsm>(CI.getCalledValue());
-  InlineAsm::ConstraintInfoVector Constraints = as->ParseConstraints();
-
-  std::vector<std::pair<Value*, int> > ResultVals;
-  if (CI.getType() == Type::getVoidTy(CI.getContext()))
-    ;
-  else if (StructType *ST = dyn_cast<StructType>(CI.getType())) {
-    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i)
-      ResultVals.push_back(std::make_pair(&CI, (int)i));
-  } else {
-    ResultVals.push_back(std::make_pair(&CI, -1));
-  }
-
-  // Fix up the asm string for gcc and emit it.
-  Out << "__asm__ volatile (\"" << gccifyAsm(as->getAsmString()) << "\"\n";
-  Out << "        :";
-
-  unsigned ValueCount = 0;
-  bool IsFirst = true;
-
-  // Convert over all the output constraints.
-  for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-       E = Constraints.end(); I != E; ++I) {
-
-    if (I->Type != InlineAsm::isOutput) {
-      ++ValueCount;
-      continue;  // Ignore non-output constraints.
-    }
-
-    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
-    std::string C = InterpretASMConstraint(*I);
-    if (C.empty()) continue;
-
-    if (!IsFirst) {
-      Out << ", ";
-      IsFirst = false;
-    }
-
-    // Unpack the dest.
-    Value *DestVal;
-    int DestValNo = -1;
-
-    if (ValueCount < ResultVals.size()) {
-      DestVal = ResultVals[ValueCount].first;
-      DestValNo = ResultVals[ValueCount].second;
-    } else
-      DestVal = CI.getArgOperand(ValueCount-ResultVals.size());
-
-    if (I->isEarlyClobber)
-      C = "&"+C;
-
-    Out << "\"=" << C << "\"(" << GetValueName(DestVal);
-    if (DestValNo != -1)
-      Out << ".field" << DestValNo; // Multiple retvals.
-    Out << ")";
-    ++ValueCount;
-  }
-
-
-  // Convert over all the input constraints.
-  Out << "\n        :";
-  IsFirst = true;
-  ValueCount = 0;
-  for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-       E = Constraints.end(); I != E; ++I) {
-    if (I->Type != InlineAsm::isInput) {
-      ++ValueCount;
-      continue;  // Ignore non-input constraints.
-    }
-
-    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
-    std::string C = InterpretASMConstraint(*I);
-    if (C.empty()) continue;
-
-    if (!IsFirst) {
-      Out << ", ";
-      IsFirst = false;
-    }
-
-    assert(ValueCount >= ResultVals.size() && "Input can't refer to result");
-    Value *SrcVal = CI.getArgOperand(ValueCount-ResultVals.size());
-
-    Out << "\"" << C << "\"(";
-    if (!I->isIndirect)
-      writeOperand(SrcVal);
-    else
-      writeOperandDeref(SrcVal);
-    Out << ")";
-  }
-
-  // Convert over the clobber constraints.
-  IsFirst = true;
-  for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-       E = Constraints.end(); I != E; ++I) {
-    if (I->Type != InlineAsm::isClobber)
-      continue;  // Ignore non-input constraints.
-
-    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
-    std::string C = InterpretASMConstraint(*I);
-    if (C.empty()) continue;
-
-    if (!IsFirst) {
-      Out << ", ";
-      IsFirst = false;
-    }
-
-    Out << '\"' << C << '"';
-  }
-
-  Out << ")";
+  assert(!"Inline assembly not supported");
 }

 void CWriter::visitAllocaInst(AllocaInst &I) {
@@ -4240,14 +4155,14 @@ void CWriter::visitAtomicCmpXchgInst(AtomicCmpXchgInst &ACXI) {

 class SmearCleanupPass : public llvm::BasicBlockPass {
 public:
-    SmearCleanupPass(llvm::Module *m, int width)
+    SmearCleanupPass(Module *m, int width)
        : BasicBlockPass(ID) { module = m; vectorWidth = width; }

    const char *getPassName() const { return "Smear Cleanup Pass"; }
    bool runOnBasicBlock(llvm::BasicBlock &BB);

    static char ID;
-    llvm::Module *module;
+    Module *module;
    int vectorWidth;
 };

@@ -4303,41 +4218,28 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        assert(toMatch != NULL);

        {
-        // FIXME: generalize this/make it not so hard-coded?
        Type *matchType = toMatch->getType();
-        const char *smearFuncName = NULL;
-
-        switch (matchType->getTypeID()) {
-        case Type::FloatTyID:  smearFuncName = "__smear_float"; break;
-        case Type::DoubleTyID: smearFuncName = "__smear_double"; break;
-        case Type::IntegerTyID: {
-            switch (cast<IntegerType>(matchType)->getBitWidth()) {
-            case 8:  smearFuncName = "__smear_i8";  break;
-            case 16: smearFuncName = "__smear_i16"; break;
-            case 32: smearFuncName = "__smear_i32"; break;
-            case 64: smearFuncName = "__smear_i64"; break;
-            }
-        }
-        default: break;
-        }
+        const char *smearFuncName = lGetSmearFunc(matchType);

        if (smearFuncName != NULL) {
            Function *smearFunc = module->getFunction(smearFuncName);
            if (smearFunc == NULL) {
                Constant *sf = 
                    module->getOrInsertFunction(smearFuncName, iter->getType(), 
-                                                matchType, NULL);
+                                                iter->getType(), matchType, NULL);
                smearFunc = dyn_cast<Function>(sf);
                assert(smearFunc != NULL);
                smearFunc->setDoesNotThrow(true);
                smearFunc->setDoesNotAccessMemory(true);
            }
-                
+
+            llvm::Value *undefResult = llvm::UndefValue::get(vt);
            assert(smearFunc != NULL);
-            Value *args[1] = { toMatch };
-            ArrayRef<llvm::Value *> argArray(&args[0], &args[1]);
+            Value *args[2] = { undefResult, toMatch };
+            ArrayRef<llvm::Value *> argArray(&args[0], &args[2]);
            Instruction *smearCall = 
-                CallInst::Create(smearFunc, argArray, "smear", (Instruction *)NULL);
+                CallInst::Create(smearFunc, argArray, LLVMGetName(toMatch, "_smear"),
+                                 (Instruction *)NULL);

            ReplaceInstWithInst(iter, smearCall);

@@ -4401,6 +4303,155 @@ BitcastCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
    return modifiedAny;
 }

+///////////////////////////////////////////////////////////////////////////
+// MaskOpsCleanupPass
+
+/** Peephole cleanup for operations on mask values.  XORs of a mask with
+    an "all true" constant become calls to __not(), and ANDs in which one
+    operand has a NOT applied to it become calls to __and_not1() or
+    __and_not2(), depending on whether the first or the second operand is
+    the negated one.
+ */
+class MaskOpsCleanupPass : public llvm::BasicBlockPass {
+public:
+    /** Gets or inserts declarations of __not, __and_not1, and __and_not2
+        in module m; the target is expected to end up providing them. */
+    MaskOpsCleanupPass(Module *m)
+        : BasicBlockPass(ID) { 
+        Type *maskTy = LLVMTypes::MaskType;
+
+        // __not takes one mask and returns a mask.
+        notFunc = dyn_cast<Function>(m->getOrInsertFunction("__not", maskTy,
+                                                            maskTy, NULL));
+        assert(notFunc != NULL);
+        notFunc->addFnAttr(Attribute::NoUnwind);
+        notFunc->addFnAttr(Attribute::ReadNone);
+
+        // The __and_not* functions take two masks and return a mask; slot i
+        // of andNotFuncs is the one used when operand i is the negated one.
+        static const char *const andNotNames[2] = { "__and_not1", "__and_not2" };
+        for (int i = 0; i < 2; ++i) {
+            Function *fn = dyn_cast<Function>(
+                m->getOrInsertFunction(andNotNames[i], maskTy, maskTy, maskTy, NULL));
+            assert(fn != NULL);
+            fn->addFnAttr(Attribute::NoUnwind);
+            fn->addFnAttr(Attribute::ReadNone);
+            andNotFuncs[i] = fn;
+        }
+    }
+
+    const char *getPassName() const { return "MaskOps Cleanup Pass"; }
+    bool runOnBasicBlock(llvm::BasicBlock &BB);
+
+private:
+    /** Returns the operand of the NOT if v is a NOT, and NULL otherwise. */
+    Value *lGetNotOperand(Value *v) const;
+
+    Function *notFunc, *andNotFuncs[2];
+
+    static char ID;
+};
+
+char MaskOpsCleanupPass::ID = 0;
+
+
+/** Reports whether the given value is a compile-time constant vector
+    whose elements are all the same ConstantInt with value one--for a
+    vector of i1s, "all true".  (The element width is not checked.)
+*/
+static bool
+lIsAllTrue(Value *v) {
+    // Pull out the splatted element if v is one of the constant vector
+    // representations; it stays NULL if v isn't one or isn't a splat.
+    Constant *splat = NULL;
+    if (ConstantVector *cv = dyn_cast<ConstantVector>(v))
+        splat = cv->getSplatValue();
+#ifndef LLVM_3_0
+    else if (ConstantDataVector *cdv = dyn_cast<ConstantDataVector>(v))
+        splat = cdv->getSplatValue();
+#endif
+
+    if (splat == NULL)
+        return false;
+
+    // The splatted element must be an integer constant equal to one.
+    ConstantInt *ci = dyn_cast<ConstantInt>(splat);
+    return ci != NULL && ci->isOne();
+}
+
+
+/** Checks whether v is the NOT of some other value, expressed either as
+    a direct call to __not() or as an XOR whose second operand is an
+    all-true vector.  Returns the value being negated if so, else NULL.
+ */
+Value *
+MaskOpsCleanupPass::lGetNotOperand(Value *v) const {
+    // Direct call to __not()
+    CallInst *call = dyn_cast<CallInst>(v);
+    if (call != NULL && call->getCalledFunction() == notFunc)
+        return call->getArgOperand(0);
+
+    // XOR of all-true vector.
+    BinaryOperator *binOp = dyn_cast<BinaryOperator>(v);
+    if (binOp == NULL || binOp->getOpcode() != Instruction::Xor)
+        return NULL;
+
+    return lIsAllTrue(binOp->getOperand(1)) ? binOp->getOperand(0) : NULL;
+}
+
+
+/** Replaces mask-typed XORs with an all-true vector by __not() calls and
+    mask-typed ANDs with a negated operand by __and_not1()/__and_not2()
+    calls.  Returns true if any instruction in bb was replaced. */
+bool
+MaskOpsCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    bool modifiedAny = false;
+    // After each replacement the scan starts over from the top of the block.
+ restart:
+    for (BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
+        BinaryOperator *bop = dyn_cast<BinaryOperator>(&*iter);
+        if (bop == NULL || bop->getType() != LLVMTypes::MaskType)
+            continue;
+
+        if (bop->getOpcode() == Instruction::Xor) {
+            // Check for XOR with all-true values
+            if (lIsAllTrue(bop->getOperand(1))) {
+                // Hold the operand in a named local: an ArrayRef built
+                // straight from getOperand()'s temporary would dangle.
+                Value *notArg = bop->getOperand(0);
+                CallInst *notCall = 
+                    CallInst::Create(notFunc, ArrayRef<Value *>(notArg), bop->getName());
+                ReplaceInstWithInst(iter, notCall);
+                modifiedAny = true;
+                goto restart;
+            }
+        }
+        else if (bop->getOpcode() == Instruction::And) {
+            // Check each operand to see if it has NOT applied to it.
+            for (int i = 0; i < 2; ++i) {
+                if (Value *notOp = lGetNotOperand(bop->getOperand(i))) {
+                    // In notOp we have the target of the NOT operation;
+                    // put it in its appropriate spot in the operand array.
+                    // Copy in the other operand directly.
+                    Value *args[2];
+                    args[i]     = notOp;
+                    args[i ^ 1] = bop->getOperand(i ^ 1);
+                    ArrayRef<Value *> argsRef(&args[0], 2);
+
+                    // Call the appropriate __and_not* function.
+                    CallInst *andNotCall = 
+                        CallInst::Create(andNotFuncs[i], argsRef, bop->getName());
+                    ReplaceInstWithInst(iter, andNotCall);
+                    modifiedAny = true;
+                    goto restart;
+                }
+            }
+        }
+    }
+
+    return modifiedAny;
+}
+

 //===----------------------------------------------------------------------===//
 //                       External Interface declaration
@@ -4432,6 +4483,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
    pm.add(createCFGSimplificationPass());   // clean up after lower invoke.
    pm.add(new SmearCleanupPass(module, vectorWidth));
    pm.add(new BitcastCleanupPass);
+    pm.add(new MaskOpsCleanupPass(module));
    pm.add(createDeadCodeEliminationPass()); // clean up after smear pass
 //CO    pm.add(createPrintModulePass(&fos));
    pm.add(new CWriter(fos, includeName, vectorWidth));
@@ -4442,5 +4494,3 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,

    return true;
 }
-
-#endif // LLVM_2_9
--- a/contrib/ispc.vim
+++ b/contrib/ispc.vim
@@ -17,7 +17,7 @@ syn keyword	ispcStatement	cbreak ccontinue creturn launch print reference soa sy
 syn keyword	ispcConditional	cif
 syn keyword	ispcRepeat	cdo cfor cwhile
 syn keyword	ispcBuiltin	programCount programIndex	
-syn keyword	ispcType	export int8 int16 int32 int64
+syn keyword	ispcType	export uniform varying int8 int16 int32 int64

 " Default highlighting
 command -nargs=+ HiLink hi def link <args>
--- a/contrib/ispc.vim.README
+++ b/contrib/ispc.vim.README
@@ -0,0 +1,8 @@
+To install vim syntax highlighting for ispc files:
+
+1) Copy ispc.vim to ~/.vim/syntax/ispc.vim (create the ~/.vim/syntax directory if necessary).
+2) Create a filetype for ispc files that corresponds to that syntax file.
+   To do this, append the following line to ~/.vim/ftdetect/ispc.vim (create the file if necessary):
+
+au BufRead,BufNewFile *.ispc set filetype=ispc
+
--- a/ctx.cpp
+++ b/ctx.cpp
--- a/ctx.h
+++ b/ctx.h
@@ -248,6 +248,10 @@ public:
        new basic block that it starts. */
    llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);

+    /** Returns a vector of all labels in the context. This is
+        simply the key set of the labelMap */
+    std::vector<std::string> GetLabels();
+
    /** Called to generate code for 'return' statement; value is the
        expression in the return statement (if non-NULL), and
        doCoherenceCheck indicates whether instructions should be generated
@@ -272,7 +276,7 @@ public:
    llvm::Value *None(llvm::Value *mask);

    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
-        i32 value wherein the i'th bit is on if and only if the i'th lane
+        i64 value wherein the i'th bit is on if and only if the i'th lane
        of the mask is on. */
    llvm::Value *LaneMask(llvm::Value *mask);

@@ -338,7 +342,7 @@ public:

    /** Emits debugging information for the function parameter represented
        by sym.  */
-    void EmitFunctionParameterDebugInfo(Symbol *sym);
+    void EmitFunctionParameterDebugInfo(Symbol *sym, int parameterNum);
    /** @} */

    /** @name IR instruction emission
@@ -380,23 +384,23 @@ public:
        array, for pointer types). */
    llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);

-    llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *BitCastInst(llvm::Value *value, llvm::Type *type,
                             const char *name = NULL);
    llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
-    llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *PtrToIntInst(llvm::Value *value, llvm::Type *type,
                              const char *name = NULL);
-    llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *IntToPtrInst(llvm::Value *value, llvm::Type *type,
                              const char *name = NULL);

-    llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Instruction *TruncInst(llvm::Value *value, llvm::Type *type,
                                 const char *name = NULL);
    llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
-                                LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
-    llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
+                                llvm::Type *type, const char *name = NULL);
+    llvm::Instruction *FPCastInst(llvm::Value *value, llvm::Type *type, 
                                  const char *name = NULL);
-    llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
+    llvm::Instruction *SExtInst(llvm::Value *value, llvm::Type *type, 
                                const char *name = NULL);
-    llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
+    llvm::Instruction *ZExtInst(llvm::Value *value, llvm::Type *type, 
                                const char *name = NULL);

    /** Given two integer-typed values (but possibly one vector and the
@@ -448,7 +452,7 @@ public:
        instruction is added at the start of the function in the entry
        basic block; if it should be added to the current basic block, then
        the atEntryBlock parameter should be false. */ 
-    llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, 
+    llvm::Value *AllocaInst(llvm::Type *llvmType, 
                            const char *name = NULL, int align = 0, 
                            bool atEntryBlock = true);

@@ -485,7 +489,7 @@ public:
    llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, 
                            const char *name = NULL);

-    llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count, 
+    llvm::PHINode *PhiNode(llvm::Type *type, int count, 
                           const char *name = NULL);
    llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
                                  llvm::Value *val1, const char *name = NULL);
@@ -632,12 +636,12 @@ private:
    std::vector<CFInfo *> controlFlowInfo;

    /** DIFile object corresponding to the source file where the current
-        function was defined (used for debugging info0. */
+        function was defined (used for debugging info). */
    llvm::DIFile diFile;

    /** DISubprogram corresponding to this function (used for debugging
        info). */
-    llvm::DISubprogram diFunction;
+    llvm::DISubprogram diSubprogram;

    /** These correspond to the current set of nested scopes in the
        function. */
--- a/decl.cpp
+++ b/decl.cpp
@@ -33,7 +33,7 @@

 /** @file decl.cpp
    @brief Implementations of classes related to turning declarations into 
-           symbols and types.
+           symbol names and types.
 */

 #include "decl.h"
@@ -44,6 +44,7 @@
 #include "stmt.h"
 #include "expr.h"
 #include <stdio.h>
+#include <string.h>
 #include <set>

 static void
@@ -55,6 +56,7 @@ lPrintTypeQualifiers(int typeQualifiers) {
    if (typeQualifiers & TYPEQUAL_TASK)      printf("task ");
    if (typeQualifiers & TYPEQUAL_SIGNED)    printf("signed ");
    if (typeQualifiers & TYPEQUAL_UNSIGNED)  printf("unsigned ");
+    if (typeQualifiers & TYPEQUAL_EXPORT)    printf("export ");
 }


@@ -134,7 +136,7 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
    }

    if (vectorSize > 0) {
-        const AtomicType *atomicType = dynamic_cast<const AtomicType *>(retType);
+        const AtomicType *atomicType = CastType<AtomicType>(retType);
        if (atomicType == NULL) {
            Error(pos, "Only atomic types (int, float, ...) are legal for vector "
                  "types.");
@@ -146,7 +148,7 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
    retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
    
    if (soaWidth > 0) {
-        const StructType *st = dynamic_cast<const StructType *>(retType);
+        const StructType *st = CastType<StructType>(retType);

        if (st == NULL) {
            Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
@@ -188,7 +190,6 @@ lGetStorageClassName(StorageClass storageClass) {
    case SC_NONE:     return "";
    case SC_EXTERN:   return "extern";
    case SC_EXTERN_C: return "extern \"C\"";
-    case SC_EXPORT:   return "export";
    case SC_STATIC:   return "static";
    case SC_TYPEDEF:  return "typedef";
    default:          FATAL("Unhandled storage class in lGetStorageClassName");
@@ -217,50 +218,44 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p)
    : pos(p), kind(dk) { 
    child = NULL;
    typeQualifiers = 0;
+    storageClass = SC_NONE;
    arraySize = -1;
-    sym = NULL;
+    type = NULL;
    initExpr = NULL;
 }


 void
 Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
-    const Type *t = GetType(ds);
-    if (t == NULL) {
-        Assert(m->errorCount > 0);
+    const Type *baseType = ds->GetBaseType(pos);
+    InitFromType(baseType, ds);
+
+    if (type == NULL) {
+        AssertPos(pos, m->errorCount > 0);
        return;
    }

-    Symbol *sym = GetSymbol();
-    if (sym != NULL) {
-        sym->type = t;
-        sym->storageClass = ds->storageClass;
+    storageClass = ds->storageClass;
+
+    if (ds->declSpecList.size() > 0 && 
+        CastType<FunctionType>(type) == NULL) {
+        Error(pos, "__declspec specifiers for non-function type \"%s\" are "
+              "not used.", type->GetString().c_str());
    }
 }


-Symbol *
-Declarator::GetSymbol() const {
-    // The symbol lives at the last child in the chain, so walk down there
-    // and return the one there.
-    const Declarator *d = this;
-    while (d->child != NULL)
-        d = d->child;
-    return d->sym;
-}
-
-
 void
 Declarator::Print(int indent) const {
    printf("%*cdeclarator: [", indent, ' ');
    pos.Print();

    lPrintTypeQualifiers(typeQualifiers);
-    Symbol *sym = GetSymbol();
-    if (sym != NULL)
-        printf("%s", sym->name.c_str());
+    printf("%s ", lGetStorageClassName(storageClass));
+    if (name.size() > 0)
+        printf("%s", name.c_str());
    else
-        printf("(null symbol)");
+        printf("(unnamed)");

    printf(", array size = %d", arraySize);

@@ -294,66 +289,26 @@ Declarator::Print(int indent) const {
 }


-Symbol *
-Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
-    const FunctionType *type = 
-        dynamic_cast<const FunctionType *>(GetType(ds));
-    if (type == NULL)
-        return NULL;
-
-    Symbol *declSym = GetSymbol();
-    Assert(declSym != NULL);
-
-    // Get the symbol for the function from the symbol table.  (It should
-    // already have been added to the symbol table by AddGlobal() by the
-    // time we get here.)
-    Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
-    if (funSym == NULL)
-        // May be NULL due to error earlier in compilation
-        Assert(m->errorCount > 0);
-    else
-        funSym->pos = pos;
-
-    // Walk down to the declarator for the function.  (We have to get past
-    // the stuff that specifies the function's return type before we get to
-    // the function's declarator.)
-    Declarator *d = this;
-    while (d != NULL && d->kind != DK_FUNCTION)
-        d = d->child;
-    Assert(d != NULL);
-
-    for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
-        Symbol *sym = d->GetSymbolForFunctionParameter(i);
-        if (sym->type == NULL) {
-            Assert(m->errorCount > 0);
-            continue;
-        }
-        else
-            sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
-
-        funArgs->push_back(sym);
-    }
-
-    if (funSym != NULL)
-        funSym->type = funSym->type->ResolveUnboundVariability(Variability::Varying);
-
-    return funSym;
-}
-
-
-const Type *
-Declarator::GetType(const Type *base, DeclSpecs *ds) const {
+void
+Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
    bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
    bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
    bool isTask =         ((typeQualifiers & TYPEQUAL_TASK) != 0);
+    bool isExported =     ((typeQualifiers & TYPEQUAL_EXPORT) != 0);
    bool isConst =        ((typeQualifiers & TYPEQUAL_CONST) != 0);

    if (hasUniformQual && hasVaryingQual) {
        Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
-        return NULL;
+        return;
    }
-    if (kind != DK_FUNCTION && isTask)
+    if (kind != DK_FUNCTION && isTask) {
        Error(pos, "\"task\" qualifier illegal in variable declaration.");
+        return;
+    }
+    if (kind != DK_FUNCTION && isExported) {
+        Error(pos, "\"export\" qualifier illegal in variable declaration.");
+        return;
+    }

    Variability variability(Variability::Unbound);
    if (hasUniformQual)
@@ -361,91 +316,125 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
    else if (hasVaryingQual)
        variability = Variability::Varying;

-    const Type *type = base;
-    switch (kind) {
-    case DK_BASE:
+    if (kind == DK_BASE) {
        // All of the type qualifiers should be in the DeclSpecs for the
        // base declarator
-        Assert(typeQualifiers == 0);
-        Assert(child == NULL);
-        return type;
-
-    case DK_POINTER:
+        AssertPos(pos, typeQualifiers == 0);
+        AssertPos(pos, child == NULL);
+        type = baseType;
+    }
+    else if (kind == DK_POINTER) {
        /* For now, any pointer to an SOA type gets the slice property; if
           we add the capability to declare pointers as slices or not,
           we'll want to set this based on a type qualifier here. */
-        type = new PointerType(type, variability, isConst, type->IsSOAType());
-        if (child != NULL)
-            return child->GetType(type, ds);
+        const Type *ptrType = new PointerType(baseType, variability, isConst,
+                                              baseType->IsSOAType());
+        if (child != NULL) {
+            child->InitFromType(ptrType, ds);
+            type = child->type;
+            name = child->name;
+        }
        else
-            return type;
-        break;
-
-    case DK_REFERENCE:
-        if (hasUniformQual)
+            type = ptrType;
+    }
+    else if (kind == DK_REFERENCE) {
+        if (hasUniformQual) {
            Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
-        if (hasVaryingQual)
+            return;
+        }
+        if (hasVaryingQual) {
            Error(pos, "\"varying\" qualifier is illegal to apply to references.");
-        if (isConst)
+            return;
+        }
+        if (isConst) {
            Error(pos, "\"const\" qualifier is to illegal apply to references.");
-
+            return;
+        }
        // The parser should disallow this already, but double check.
-        if (dynamic_cast<const ReferenceType *>(type) != NULL) {
+        if (CastType<ReferenceType>(baseType) != NULL) {
            Error(pos, "References to references are illegal.");
-            return NULL;
+            return;
        }

-        type = new ReferenceType(type);
-        if (child != NULL)
-            return child->GetType(type, ds);
+        const Type *refType = new ReferenceType(baseType);
+        if (child != NULL) {
+            child->InitFromType(refType, ds);
+            type = child->type;
+            name = child->name;
+        }
        else
-            return type;
-        break;
-
-    case DK_ARRAY:
-        if (Type::Equal(type, AtomicType::Void)) {
+            type = refType;
+    }
+    else if (kind == DK_ARRAY) {
+        if (Type::Equal(baseType, AtomicType::Void)) {
            Error(pos, "Arrays of \"void\" type are illegal.");
-            return NULL;
+            return;
        }
-        if (dynamic_cast<const ReferenceType *>(type)) {
+        if (CastType<ReferenceType>(baseType)) {
            Error(pos, "Arrays of references (type \"%s\") are illegal.",
-                  type->GetString().c_str());
-            return NULL;
+                  baseType->GetString().c_str());
+            return;
        }

-        type = new ArrayType(type, arraySize);
-        if (child)
-            return child->GetType(type, ds);
+        const Type *arrayType = new ArrayType(baseType, arraySize);
+        if (child != NULL) {
+            child->InitFromType(arrayType, ds);
+            type = child->type;
+            name = child->name;
+        }
        else
-            return type;
-        break;
-
-    case DK_FUNCTION: {
-        std::vector<const Type *> args;
-        std::vector<std::string> argNames;
-        std::vector<ConstExpr *> argDefaults;
-        std::vector<SourcePos> argPos;
-
+            type = arrayType;
+    }
+    else if (kind == DK_FUNCTION) {
+        llvm::SmallVector<const Type *, 8> args;
+        llvm::SmallVector<std::string, 8> argNames;
+        llvm::SmallVector<Expr *, 8> argDefaults;
+        llvm::SmallVector<SourcePos, 8> argPos;
+        
        // Loop over the function arguments and store the names, types,
        // default values (if any), and source file positions each one in
        // the corresponding vector.
        for (unsigned int i = 0; i < functionParams.size(); ++i) {
            Declaration *d = functionParams[i];

-            Symbol *sym = GetSymbolForFunctionParameter(i);
-
-            if (d->declSpecs->storageClass != SC_NONE)
-                Error(sym->pos, "Storage class \"%s\" is illegal in "
-                      "function parameter declaration for parameter \"%s\".", 
-                      lGetStorageClassName(d->declSpecs->storageClass),
-                      sym->name.c_str());
-            if (Type::Equal(sym->type, AtomicType::Void)) {
-                Error(sym->pos, "Parameter with type \"void\" illegal in function "
-                      "parameter list.");
-                sym->type = NULL;
+            if (d == NULL) {
+                AssertPos(pos, m->errorCount > 0);
+                continue;
+            }
+            if (d->declarators.size() == 0) {
+                // function declaration like foo(float), w/o a name for the
+                // parameter; wire up a placeholder Declarator for it
+                d->declarators.push_back(new Declarator(DK_BASE, pos));
+                d->declarators[0]->InitFromDeclSpecs(d->declSpecs);
            }

-            const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
+            AssertPos(pos, d->declarators.size() == 1);
+            Declarator *decl = d->declarators[0];
+            if (decl == NULL || decl->type == NULL) {
+                AssertPos(pos, m->errorCount > 0);
+                continue;
+            }
+
+            if (decl->name == "") {
+                // Give a name to any anonymous parameter declarations
+                char buf[32];
+                sprintf(buf, "__anon_parameter_%d", i);
+                decl->name = buf;
+            }
+            decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
+
+            if (d->declSpecs->storageClass != SC_NONE)
+                Error(decl->pos, "Storage class \"%s\" is illegal in "
+                      "function parameter declaration for parameter \"%s\".", 
+                      lGetStorageClassName(d->declSpecs->storageClass),
+                      decl->name.c_str());
+            if (Type::Equal(decl->type, AtomicType::Void)) {
+                Error(decl->pos, "Parameter with type \"void\" illegal in function "
+                      "parameter list.");
+                decl->type = NULL;
+            }
+
+            const ArrayType *at = CastType<ArrayType>(decl->type);
            if (at != NULL) {
                // As in C, arrays are passed to functions as pointers to
                // their element type.  We'll just immediately make this
@@ -455,93 +444,94 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
                // report this differently than it was originally declared
                // in the function, but it's not clear that this is a
                // significant problem.)
-                if (at->GetElementType() == NULL) {
-                    Assert(m->errorCount > 0);
-                    return NULL;
+                const Type *targetType = at->GetElementType();
+                if (targetType == NULL) {
+                    AssertPos(pos, m->errorCount > 0);
+                    return;
                }

-                const Type *targetType = at->GetElementType();
-                targetType = 
-                    targetType->ResolveUnboundVariability(Variability::Varying);
-                sym->type = PointerType::GetUniform(targetType);
+                decl->type = PointerType::GetUniform(targetType);

                // Make sure there are no unsized arrays (other than the
                // first dimension) in function parameter lists.
-                at = dynamic_cast<const ArrayType *>(at->GetElementType());
+                at = CastType<ArrayType>(targetType);
                while (at != NULL) {
                    if (at->GetElementCount() == 0)
-                        Error(sym->pos, "Arrays with unsized dimensions in "
+                        Error(decl->pos, "Arrays with unsized dimensions in "
                              "dimensions after the first one are illegal in "
                              "function parameter lists.");
-                    at = dynamic_cast<const ArrayType *>(at->GetElementType());
+                    at = CastType<ArrayType>(at->GetElementType());
                }
            }

-            args.push_back(sym->type);
-            argNames.push_back(sym->name);
-            argPos.push_back(sym->pos);
+            args.push_back(decl->type);
+            argNames.push_back(decl->name);
+            argPos.push_back(decl->pos);

-            ConstExpr *init = NULL;
-            if (d->declarators.size()) {
-                // Try to find an initializer expression; if there is one,
-                // it lives down to the base declarator.
-                Declarator *decl = d->declarators[0];
-                while (decl->child != NULL) {
-                    Assert(decl->initExpr == NULL);
+            Expr *init = NULL;
+            // Try to find an initializer expression.
+            while (decl != NULL) {
+                if (decl->initExpr != NULL) {
+                    decl->initExpr = TypeCheck(decl->initExpr);
+                    decl->initExpr = Optimize(decl->initExpr);
+                    if (decl->initExpr != NULL) {
+                        init = dynamic_cast<ConstExpr *>(decl->initExpr);
+                        if (init == NULL)
+                            init = dynamic_cast<NullPointerExpr *>(decl->initExpr);
+                        if (init == NULL)
+                            Error(decl->initExpr->pos, "Default value for parameter "
+                                  "\"%s\" must be a compile-time constant.", 
+                                  decl->name.c_str());
+                    }
+                    break;
+                }
+                else
                    decl = decl->child;
-                }
-
-                if (decl->initExpr != NULL &&
-                    (decl->initExpr = TypeCheck(decl->initExpr)) != NULL &&
-                    (decl->initExpr = Optimize(decl->initExpr)) != NULL &&
-                    (init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
-                    Error(decl->initExpr->pos, "Default value for parameter "
-                          "\"%s\" must be a compile-time constant.", 
-                          sym->name.c_str());
-                }
            }
            argDefaults.push_back(init);
        }

-        const Type *returnType = type;
+        const Type *returnType = baseType;
        if (returnType == NULL) {
            Error(pos, "No return type provided in function declaration.");
-            return NULL;
+            return;
        }
-        if (dynamic_cast<const FunctionType *>(returnType) != NULL) {
+
+        if (CastType<FunctionType>(returnType) != NULL) {
            Error(pos, "Illegal to return function type from function.");
-            return NULL;
+            return;
        }
        
-        bool isExported = ds && (ds->storageClass == SC_EXPORT);
+        returnType = returnType->ResolveUnboundVariability(Variability::Varying);
+
        bool isExternC =  ds && (ds->storageClass == SC_EXTERN_C);
+        bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
        bool isTask =     ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);

        if (isExported && isTask) {
            Error(pos, "Function can't have both \"task\" and \"export\" "
                  "qualifiers");
-            return NULL;
+            return;
        }
        if (isExternC && isTask) {
            Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
                  "qualifiers");
-            return NULL;
+            return;
        }
        if (isExternC && isExported) {
            Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
                  "qualifiers");
-            return NULL;
+            return;
        }

        if (child == NULL) {
-            Assert(m->errorCount > 0);
-            return NULL;
+            AssertPos(pos, m->errorCount > 0);
+            return;
        }

        const FunctionType *functionType = 
            new FunctionType(returnType, args, argNames, argDefaults,
                             argPos, isTask, isExported, isExternC);
-        functionType = functionType->ResolveUnboundVariability(Variability::Varying);

        // handle any explicit __declspecs on the function
        if (ds != NULL) {
@@ -563,60 +553,12 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
            }
        }

-        return child->GetType(functionType, ds);
-    }
-    default:
-        FATAL("Unexpected decl kind");
-        return NULL;
+        child->InitFromType(functionType, ds);
+        type = child->type;
+        name = child->name;
    }
 }

-
-const Type *
-Declarator::GetType(DeclSpecs *ds) const {
-    const Type *baseType = ds->GetBaseType(pos);
-    const Type *type = GetType(baseType, ds);
-
-    if (ds->declSpecList.size() > 0 && 
-        type != NULL &&
-        dynamic_cast<const FunctionType *>(type) == NULL) {
-        Error(pos, "__declspec specifiers for non-function type \"%s\" are "
-              "not used.", type->GetString().c_str());
-    }
-
-    return type;
-}
-
-
-Symbol *
-Declarator::GetSymbolForFunctionParameter(int paramNum) const {
-    Assert(paramNum < (int)functionParams.size());
-    Declaration *d = functionParams[paramNum];
-
-    char buf[32];
-    Symbol *sym;
-    if (d->declarators.size() == 0) {
-        // function declaration like foo(float), w/o a name for
-        // the parameter
-        sprintf(buf, "__anon_parameter_%d", paramNum);
-        sym = new Symbol(buf, pos);
-        sym->type = d->declSpecs->GetBaseType(pos);
-    }
-    else {
-        Assert(d->declarators.size() == 1);
-        sym = d->declarators[0]->GetSymbol();
-        if (sym == NULL) {
-            // Handle more complex anonymous declarations like
-            // float (float **).
-            sprintf(buf, "__anon_parameter_%d", paramNum);
-            sym = new Symbol(buf, d->declarators[0]->pos);
-            sym->type = d->declarators[0]->GetType(d->declSpecs);
-        }
-    }
-    return sym;
-}
-
-
 ///////////////////////////////////////////////////////////////////////////
 // Declaration

@@ -646,27 +588,23 @@ Declaration::GetVariableDeclarations() const {

    for (unsigned int i = 0; i < declarators.size(); ++i) {
        Declarator *decl = declarators[i];
-        if (decl == NULL) {
+        if (decl == NULL || decl->type == NULL) {
            // Ignore earlier errors
            Assert(m->errorCount > 0);
            continue;
        }

-        Symbol *sym = decl->GetSymbol();
-        if (sym == NULL || sym->type == NULL) {
-            // Ignore errors
-            Assert(m->errorCount > 0);
-            continue;
-        }
-        sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
-
-        if (Type::Equal(sym->type, AtomicType::Void))
-            Error(sym->pos, "\"void\" type variable illegal in declaration.");
-        else if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
+        if (Type::Equal(decl->type, AtomicType::Void))
+            Error(decl->pos, "\"void\" type variable illegal in declaration.");
+        else if (CastType<FunctionType>(decl->type) == NULL) {
+            decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
+            Symbol *sym = new Symbol(decl->name, decl->pos, decl->type,
+                                     decl->storageClass);
            m->symbolTable->AddVariable(sym);
            vars.push_back(VariableDeclaration(sym, decl->initExpr));
        }
    }
+
    return vars;
 }

@@ -677,25 +615,19 @@ Declaration::DeclareFunctions() {

    for (unsigned int i = 0; i < declarators.size(); ++i) {
        Declarator *decl = declarators[i];
-        if (decl == NULL) {
+        if (decl == NULL || decl->type == NULL) {
            // Ignore earlier errors
            Assert(m->errorCount > 0);
            continue;
        }

-        Symbol *sym = decl->GetSymbol();
-        if (sym == NULL || sym->type == NULL) {
-            // Ignore errors
-            Assert(m->errorCount > 0);
-            continue;
-        }
-        sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
-
-        if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
+        const FunctionType *ftype = CastType<FunctionType>(decl->type);
+        if (ftype == NULL)
            continue;

        bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
-        m->AddFunctionDeclaration(sym, isInline);
+        m->AddFunctionDeclaration(decl->name, ftype, decl->storageClass,
+                                  isInline, decl->pos);
    }
 }

@@ -709,13 +641,14 @@ Declaration::Print(int indent) const {
        declarators[i]->Print(indent+4);
 }

+
 ///////////////////////////////////////////////////////////////////////////

 void
 GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
-                             std::vector<const Type *> *elementTypes,
-                             std::vector<std::string> *elementNames,
-                             std::vector<SourcePos> *elementPositions) {
+                             llvm::SmallVector<const Type *, 8> *elementTypes,
+                             llvm::SmallVector<std::string, 8> *elementNames,
+                             llvm::SmallVector<SourcePos, 8> *elementPositions) {
    std::set<std::string> seenNames;
    for (unsigned int i = 0; i < sd.size(); ++i) {
        const Type *type = sd[i]->type;
@@ -725,38 +658,41 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
        // FIXME: making this fake little DeclSpecs here is really
        // disgusting
        DeclSpecs ds(type);
-        if (type->IsUniformType()) 
-            ds.typeQualifiers |= TYPEQUAL_UNIFORM;
-        else if (type->IsVaryingType())
-            ds.typeQualifiers |= TYPEQUAL_VARYING;
+        if (Type::Equal(type, AtomicType::Void) == false) {
+            if (type->IsUniformType()) 
+                ds.typeQualifiers |= TYPEQUAL_UNIFORM;
+            else if (type->IsVaryingType())
+                ds.typeQualifiers |= TYPEQUAL_VARYING;
+            else if (type->GetSOAWidth() != 0)
+                ds.soaWidth = type->GetSOAWidth();
+            // FIXME: ds.vectorSize?
+        }

        for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
            Declarator *d = (*sd[i]->declarators)[j];
            d->InitFromDeclSpecs(&ds);

-            Symbol *sym = d->GetSymbol();
-
-            if (Type::Equal(sym->type, AtomicType::Void))
+            if (Type::Equal(d->type, AtomicType::Void))
                Error(d->pos, "\"void\" type illegal for struct member.");

-            const ArrayType *arrayType = 
-                dynamic_cast<const ArrayType *>(sym->type);
-            if (arrayType != NULL && arrayType->GetElementCount() == 0) {
-                Error(d->pos, "Unsized arrays aren't allowed in struct "
-                      "definitions.");
-                elementTypes->push_back(NULL);
-            }
-            else
-                elementTypes->push_back(sym->type);
+            elementTypes->push_back(d->type);

-            if (seenNames.find(sym->name) != seenNames.end())
+            if (seenNames.find(d->name) != seenNames.end())
                Error(d->pos, "Struct member \"%s\" has same name as a "
-                      "previously-declared member.", sym->name.c_str());
+                      "previously-declared member.", d->name.c_str());
            else
-                seenNames.insert(sym->name);
+                seenNames.insert(d->name);

-            elementNames->push_back(sym->name);
-            elementPositions->push_back(sym->pos);
+            elementNames->push_back(d->name);
+            elementPositions->push_back(d->pos);
        }
    }
+
+    for (int i = 0; i < (int)elementTypes->size() - 1; ++i) {
+        const ArrayType *arrayType = CastType<ArrayType>((*elementTypes)[i]);
+
+        if (arrayType != NULL && arrayType->GetElementCount() == 0)
+            Error((*elementPositions)[i], "Unsized arrays aren't allowed except "
+                  "for the last member in a struct definition.");
+    }
 }
--- a/decl.h
+++ b/decl.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -47,30 +47,21 @@
    variables--here, that the declaration has the 'static' and 'uniform'
    qualifiers, and that it's basic type is 'int'.  Then for each variable
    declaration, the Declaraiton class holds an instance of a Declarator,
-    which in turn records the per-variable information like the symbol
-    name, array size (if any), initializer expression, etc.
+    which in turn records the per-variable information like the name, array
+    size (if any), initializer expression, etc.  
 */

 #ifndef ISPC_DECL_H
 #define ISPC_DECL_H

 #include "ispc.h"
+#include <llvm/ADT/SmallVector.h>

 struct VariableDeclaration;

 class Declaration;
 class Declarator;

-enum StorageClass {
-    SC_NONE,
-    SC_EXTERN,
-    SC_EXPORT,
-    SC_STATIC,
-    SC_TYPEDEF,
-    SC_EXTERN_C
-};
-
-
 /* Multiple qualifiers can be provided with types in declarations;
   therefore, they are set up so that they can be ANDed together into an
   int. */
@@ -82,6 +73,7 @@ enum StorageClass {
 #define TYPEQUAL_SIGNED     (1<<4)
 #define TYPEQUAL_UNSIGNED   (1<<5)
 #define TYPEQUAL_INLINE     (1<<6)
+#define TYPEQUAL_EXPORT     (1<<7)

 /** @brief Representation of the declaration specifiers in a declaration.

@@ -141,25 +133,11 @@ public:
    Declarator(DeclaratorKind dk, SourcePos p);

    /** Once a DeclSpecs instance is available, this method completes the
-        initialization of the Symbol, setting its Type accordingly.
+        initialization of the type member.
     */
    void InitFromDeclSpecs(DeclSpecs *ds);

-    /** Get the actual type of the combination of Declarator and the given
-        DeclSpecs.  If an explicit base type is provided, the declarator is
-        applied to that type; otherwise the base type from the DeclSpecs is
-        used. */
-    const Type *GetType(DeclSpecs *ds) const;
-    const Type *GetType(const Type *base, DeclSpecs *ds) const;
-
-    /** Returns the symbol corresponding to the function declared by this
-        declarator and symbols for its arguments in *args. */
-    Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
-
-    Symbol *GetSymbolForFunctionParameter(int paramNum) const;
-
-    /** Returns the symbol associated with the declarator. */
-    Symbol *GetSymbol() const;
+    void InitFromType(const Type *base, DeclSpecs *ds);

    void Print(int indent) const;

@@ -180,18 +158,24 @@ public:
    /** Type qualifiers provided with the declarator. */
    int typeQualifiers;

+    StorageClass storageClass;
+
    /** For array declarators, this gives the declared size of the array.
        Unsized arrays have arraySize == 0. */ 
    int arraySize;

-    /** Symbol associated with the declarator. */
-    Symbol *sym;
+    /** Name associated with the declarator. */
+    std::string name;

    /** Initialization expression for the variable.  May be NULL. */
    Expr *initExpr;

+    /** Type of the declarator.  This is NULL until InitFromDeclSpecs() or
+        InitFromType() is called. */
+    const Type *type;
+
    /** For function declarations, this holds the Declaration *s for the
-        funciton's parameters. */
+        function's parameters. */
    std::vector<Declaration *> functionParams;
 };

@@ -236,8 +220,8 @@ struct StructDeclaration {
 /** Given a set of StructDeclaration instances, this returns the types of
    the elements of the corresponding struct and their names. */
 extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
-                                         std::vector<const Type *> *elementTypes,
-                                         std::vector<std::string> *elementNames,
-                                         std::vector<SourcePos> *elementPositions);
+                                         llvm::SmallVector<const Type *, 8> *elementTypes,
+                                         llvm::SmallVector<std::string, 8> *elementNames,
+                                         llvm::SmallVector<SourcePos, 8> *elementPositions);

 #endif // ISPC_DECL_H
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,81 @@
+=== v1.2.2 === (20 April 2012)
+
+This release includes a number of small additions to functionality and a
+number of bugfixes.  New functionality includes:
+
+* It's now possible to forward declare structures as in C/C++: "struct
+  Foo;".  After such a declaration, structs with pointers to "Foo" and
+  functions that take pointers or references to Foo structs can be declared
+  without the entire definition of Foo being available.
+
+* New built-in types size_t, ptrdiff_t, and [u]intptr_t are now available,
+  corresponding to the equivalent types in C.
+
+* The standard library now provides atomic_swap*() and
+  atomic_compare_exchange*() functions for void * types.
+
+* The C++ backend has seen a number of improvements to the quality and
+  readability of generated code.
+
+A number of bugs have been fixed in this release as well.  The most
+significant are:
+
+* Fixed a bug where nested loops could cause a compiler crash in some
+  circumstances (issues #240 and #229)
+
+* Gathers could access invalid memory (and cause the program to crash) in
+  some circumstances (#235)
+
+* References to temporary values are now handled properly when passed to a
+  function that takes a reference typed parameter.
+
+* A case where incorrect code could be generated for compile-time-constant
+  initializers has been fixed (#234).
+
+=== v1.2.1 === (6 April 2012)
+
+This release contains only minor new functionality and is mostly for many
+small bugfixes and improvements to error handling and error reporting.
+The new functionality that is present is:
+
+* Significantly more efficient versions of the float / half conversion
+  routines are now available in the standard library, thanks to Fabian
+  Giesen.
+
+* The last member of a struct can now be a zero-length array; this allows
+  the trick of dynamically allocating enough storage for the struct and
+  some number of array elements at the end of it.
+
+Significant bugs fixed include:
+
+* Issue #205: When a target ISA isn't specified, use the host system's
+  capabilities to choose a target for which it will be able to run the
+  generated code.
+
+* Issues #215 and #217: Don't allocate storage for global variables that
+  are declared "extern".
+
+* Issue #197: Allow NULL as a default argument value in a function
+  declaration.
+
+* Issue #223: Fix bugs where taking the address of a function wouldn't work
+  as expected.
+
+* Issue #224: When there are overloaded variants of a function that take
+  both reference and const reference parameters, give the non-const
+  reference preference when matching values of that underlying type.
+
+* Issue #225: An error is issued when a varying lvalue is assigned to a
+  reference type (rather than crashing).
+
+* Issue #193: Permit conversions from array types to void *, not just the
+  pointer type of the underlying array element.
+
+* Issue #199: Still evaluate expressions that are cast to (void).
+
+The documentation has also been improved, with FAQs added to clarify some
+aspects of the ispc pointer model.
+
 === v1.2.0 === (20 March 2012)

 This is a major new release of ispc, with a number of significant
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -14,12 +14,19 @@ distribution.
  + `Why are there multiple versions of exported ispc functions in the assembly output?`_
  + `How can I more easily see gathers and scatters in generated assembly?`_

+* Language Details
+
+  + `What is the difference between "int *foo" and "int foo[]"?`_
+  + `Why are pointed-to types "uniform" by default?`_
+  + `What am I getting an error about assigning a varying lvalue to a reference type?`_ 
+  
 * Interoperability

  + `How can I supply an initial execution mask in the call from the application?`_
  + `How can I generate a single binary executable with support for multiple instruction sets?`_
  + `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
  + `Is it possible to inline ispc functions in C/C++ code?`_
+  + `Why is it illegal to pass "varying" values from C/C++ to ispc functions?`_ 

 * Programming Techniques

@@ -27,6 +34,7 @@ distribution.
  + `How can a gang of program instances generate variable amounts of output efficiently?`_
  + `Is it possible to use ispc for explicit vector programming?`_
  + `How can I debug my ispc programs using Valgrind?`_
+  + `foreach statements generate more complex assembly than I'd expect; what's going on?`_

 Understanding ispc's Output
 ===========================
@@ -213,6 +221,125 @@ easier to understand:
            jmp        ___pseudo_scatter_base_offsets32_32 ## TAILCALL


+Language Details
+================
+
+What is the difference between "int \*foo" and "int foo[]"?
+-----------------------------------------------------------
+
+In C and C++, declaring a function to take a parameter ``int *foo`` and
+``int foo[]`` results in the same type for the parameter.  Both are
+pointers to integers.  In ``ispc``, these are different types.  The first
+one is a varying pointer to a uniform integer value in memory, while the
+second results in a uniform pointer to the start of an array of varying
+integer values in memory.
+
+To understand why the first is a varying pointer to a uniform integer,
+first recall that types without explicit rate qualifiers (``uniform``,
+``varying``, or ``soa<>``) are ``varying`` by default.  Second, recall from
+the `discussion of pointer types in the ispc User's Guide`_ that pointed-to
+types without rate qualifiers are ``uniform`` by default.  (This second
+rule is discussed further below, in `Why are pointed-to types "uniform" by
+default?`_.)  The type of ``int *foo`` follows from these.
+
+.. _discussion of pointer types in the ispc User's Guide: ispc.html#pointer-types 
+
+Conversely, in a function body, ``int foo[10]`` represents a declaration of
+a 10-element array of varying ``int`` values.  In that we'd certainly like
+to be able to pass such an array to a function that takes a ``int []``
+parameter, the natural type for an ``int []`` parameter is a uniform
+pointer to varying integer values.
+
+In terms of compatibility with C/C++, it's unfortunate that this
+distinction exists, though any other set of rules seems to introduce more
+awkwardness than this one.  (Though we're interested to hear ideas to
+improve these rules!).
+
+Why are pointed-to types "uniform" by default?
+----------------------------------------------
+
+In ``ispc``, types without rate qualifiers are "varying" by default, but
+types pointed to by pointers without rate qualifiers are "uniform" by
+default.  Why this difference?
+
+::
+
+    int foo;  // no rate qualifier, "varying int".
+    uniform int *foo;  // pointer type has no rate qualifier, pointed-to does.
+                       // "varying pointer to uniform int".
+    int *foo;  // neither pointer type nor pointed-to type ("int") have
+               // rate qualifiers. Pointer type is varying by default,
+               // pointed-to is uniform. "varying pointer to uniform int".
+    varying int *foo;   // varying pointer to varying int
+
+The first rule, having types without rate qualifiers be varying by default,
+is a default that keeps the number of "uniform" or "varying" qualifiers in
+``ispc`` programs low.  Most ``ispc`` programs use mostly "varying"
+variables, so this rule allows most variables to be declared without also
+requiring rate qualifiers.
+
+On a related note, this rule allows many C/C++ functions to be used to
+define equivalent functions in the SPMD execution model that ``ispc``
+provides with little or no modification:
+
+::
+
+    // scalar add in C/C++, SPMD/vector add in ispc
+    int add(int a, int b) { return a + b; }
+
+This motivation also explains why ``uniform int *foo`` represents a varying
+pointer; having pointers be varying by default if they don't have rate
+qualifiers similarly helps with porting code from C/C++ to ``ispc``.
+
+The trickier issue is why pointed-to types are "uniform" by default.  In our
+experience, data in memory that is accessed via pointers is most often
+uniform; this generally includes all data that has been allocated and
+initialized by the C/C++ application code. In practice, "varying" types are
+more generally (but not exclusively) used for local data in ``ispc``
+functions.  Thus, making the pointed-to type uniform by default leads to
+more concise code for the most common cases.
+
+
+What am I getting an error about assigning a varying lvalue to a reference type?
+--------------------------------------------------------------------------------
+
+Given code like the following:
+
+::
+
+    uniform float a[...];
+    int index = ...;
+    float &r = a[index];
+
+``ispc`` issues the error "Initializer for reference-type variable "r" must
+have a uniform lvalue type.".  The underlying issue stems from how
+references are represented in the code generated by ``ispc``.  Recall that
+``ispc`` supports both uniform and varying pointer types--a uniform pointer
+points to the same location in memory for all program instances in the
+gang, while a varying pointer allows each program instance to have its own
+pointer value.
+
+References are represented as a pointer in the code generated by ``ispc``,
+though this is generally opaque to the user; in ``ispc``, they are
+specifically uniform pointers.  This design decision was made so that given
+code like this:
+
+::
+
+    extern void func(float &val);
+    float foo = ...;
+    func(foo);
+
+then the reference would be handled efficiently as a single pointer, rather
+than unnecessarily being turned into a gang-size of pointers.
+
+However, an implication of this decision is that it's not possible for
+references to refer to completely different things for each of the program
+instances.  (And hence the error that is issued).  In cases where a unique
+per-program-instance pointer is needed, a varying pointer should be used
+instead of a reference.
+
+
 Interoperability
 ================

@@ -391,6 +518,48 @@ linking your applicaiton.
 ``-mattr=+avx`` flag to ``llc``.)
    

+Why is it illegal to pass "varying" values from C/C++ to ispc functions?
+------------------------------------------------------------------------
+
+If any of the types in the parameter list to an exported function is
+"varying" (including recursively, and members of structure types, etc.),
+then ``ispc`` will issue an error and refuse to compile the function:
+
+::
+
+    % echo "export int add(int x) { return ++x; }" | ispc
+    <stdin>:1:12: Error: Illegal to return a "varying" type from exported function "add" 
+    <stdin>:1:20: Error: Varying parameter "x" is illegal in an exported function. 
+
+While there's no fundamental reason why this isn't possible, recall the
+definition of "varying" variables: they have one value for each program
+instance in the gang.  As such, the number of values and amount of storage
+required to represent a varying variable depends on the gang size
+(i.e. ``programCount``), which can have different values depending on the
+compilation target.
+
+``ispc`` therefore prohibits passing "varying" values between the
+application and the ``ispc`` program in order to prevent the
+application-side code from depending on a particular gang size, in order to
+encourage portability to different gang sizes.  (A generally desirable
+programming practice.)
+
+For cases where the size of data is actually fixed from the application
+side, the value can be passed via a pointer to a short ``uniform`` array,
+as follows:
+
+::
+
+    export void add4(uniform int ptr[4]) {
+        foreach (i = 0 ... 4)
+            ptr[i]++;
+    }
+
+On the 4-wide SSE instruction set, this compiles to a single vector add
+instruction (and associated move instructions), while it still also
+efficiently computes the correct result on 8-wide AVX targets.
+
+
 Programming Techniques
 ======================

@@ -525,3 +694,79 @@ you can use ``--target=sse4`` when compiling to run with ``valgrind``.
 Note that ``valgrind`` does not yet support programs that use the AVX
 instruction set.

+foreach statements generate more complex assembly than I'd expect; what's going on?
+-----------------------------------------------------------------------------------
+
+Given a simple ``foreach`` loop like the following:
+
+::
+
+    void foo(uniform float a[], uniform int count) {
+        foreach (i = 0 ... count)
+            a[i] *= 2;
+    }
+
+
+the ``ispc`` compiler generates approximately 40 instructions--why isn't
+the generated code simpler?
+
+There are two main components to the code: one handles
+``programCount``-sized chunks of elements of the array, and the other
+handles any excess elements at the end of the array that don't completely
+fill a gang.  The code for the main loop is essentially what one would
+expect: a vector of values is loaded from the array, the multiply is done,
+and the result is stored.
+
+::
+
+    LBB0_2:                                 ## %foreach_full_body
+	movslq	%edx, %rdx
+	vmovups	(%rdi,%rdx), %ymm1
+	vmulps	%ymm0, %ymm1, %ymm1
+	vmovups	%ymm1, (%rdi,%rdx)
+	addl	$32, %edx
+	addl	$8, %eax
+	cmpl	%ecx, %eax
+	jl	LBB0_2
+
+
+Then, there is a sequence of instructions that handles any additional
+elements at the end of the array.  (These instructions don't execute if
+there aren't any left-over values to process, but they do lengthen the
+amount of generated code.)
+
+::
+
+  ## BB#4:                                ## %partial_inner_only
+	vmovd	%eax, %xmm0
+	vinsertf128	$1, %xmm0, %ymm0, %ymm0
+	vpermilps	$0, %ymm0, %ymm0 ## ymm0 = ymm0[0,0,0,0,4,4,4,4]
+	vextractf128	$1, %ymm0, %xmm3
+	vmovd	%esi, %xmm2
+	vmovaps	LCPI0_1(%rip), %ymm1
+	vextractf128	$1, %ymm1, %xmm4
+	vpaddd	%xmm4, %xmm3, %xmm3
+        # ....
+	vmulps	LCPI0_0(%rip), %ymm1, %ymm1
+	vmaskmovps	%ymm1, %ymm0, (%rdi,%rax)
+
+
+If you know that the number of elements to be processed will always be an
+exact multiple of 8, 16, etc., then adding a simple assignment to
+``count`` like the one below gives the compiler enough information to be
+able to eliminate the code for the additional array elements.
+
+::
+
+    void foo(uniform float a[], uniform int count) {
+        // This assignment doesn't change the value of count
+        // if it's a multiple of 16, but it gives the compiler
+        // insight into this fact, allowing for simpler code to
+        // be generated for the foreach loop.
+        count = (count & ~(16-1));
+        foreach (i = 0 ... count)
+            a[i] *= 2;
+    }
+
+With this new version of ``foo()``, only the code for the first loop above
+is generated.
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -121,10 +121,14 @@ Contents:

 * `The ISPC Standard Library`_

+  + `Basic Operations On Data`_
+
+    * `Logical and Selection Operations`_
+    * `Bit Operations`_
+
  + `Math Functions`_

    * `Basic Math Functions`_
-    * `Bit-Level Operations`_
    * `Transcendental Functions`_
    * `Pseudo-Random Numbers`_

@@ -143,6 +147,7 @@ Contents:

    * `Converting Between Array-of-Structures and Structure-of-Arrays Layout`_
    * `Conversions To and From Half-Precision Floats`_
+    * `Converting to sRGB8`_

  + `Systems Programming Support`_

@@ -538,7 +543,7 @@ preprocessor runs:
  * - ISPC
    - 1
    - Detecting that the ``ispc`` compiler is processing the file
-  * - ISPC_TARGET_{SSE2,SSE4,AVX}
+  * - ISPC_TARGET_{SSE2,SSE4,AVX,AVX2}
    - 1
    - One of these will be set, depending on the compilation target.
  * - ISPC_POINTER_SIZE
@@ -1390,8 +1395,8 @@ Types
 Basic Types and Type Qualifiers
 -------------------------------

-``ispc`` is a statically-typed language.  It supports a variety of basic
-types.
+``ispc`` is a statically-typed language.  It supports a variety of core
+basic types:

 * ``void``: "empty" type representing no value.
 * ``bool``: boolean value; may be assigned ``true``, ``false``, or the
@@ -1408,6 +1413,15 @@ types.
 * ``unsigned int64``: 64-bit unsigned integer.
 * ``double``: 64-bit double-precision floating point value.

+There are also a few built-in types related to pointers and memory:
+
+* ``size_t``: the maximum size of any object (structure or array)
+* ``ptrdiff_t``: an integer type large enough to represent the difference
+  between two pointers
+* ``intptr_t``: signed integer type that is large enough to represent
+  a pointer value
+* ``uintptr_t``: unsigned integer type large enough to represent a pointer
+
 Implicit type conversion between values of different types is done
 automatically by the ``ispc`` compiler.  Thus, a value of ``float`` type
 can be assigned to a variable of ``int`` type directly.  In binary
@@ -1492,13 +1506,17 @@ Defining New Names For Types
 The ``typedef`` keyword can be used to name types:

 ::
+ 
+    typedef int64 BigInt;
+    typedef float Float3[3];

-  typedef Float3 float[3];
+Following C's syntax, the code above defines ``BigInt`` to have ``int64``
+type and ``Float3`` to have ``float[3]`` type.

-``typedef`` doesn't create a new type: it just provides an alternative name
-for an existing type.  Thus, in the above example, it is legal to pass a
-value with ``float[3]`` type to a function that has been declared to take a
-``Float3`` parameter.
+Also as in C, ``typedef`` doesn't create a new type: it just provides an
+alternative name for an existing type.  Thus, in the above example, it is
+legal to pass a value with ``float[3]`` type to a function that has been
+declared to take a ``Float3`` parameter.


 Pointer Types
@@ -2150,6 +2168,12 @@ greater than or equal to ``NUM_ITEMS``.
        // ...
    }

+Short-circuiting may impose some overhead in the generated code; for cases
+where short-circuiting is undesirable due to performance impact, see
+the section `Logical and Selection Operations`_, which introduces helper
+functions in the standard library that provide these operations without
+short-circuiting.
+

 Dynamic Memory Allocation
 -------------------------
@@ -2827,6 +2851,123 @@ The ISPC Standard Library
 compiling ``ispc`` programs.  (To disable the standard library, pass the
 ``--nostdlib`` command-line flag to the compiler.)

+Basic Operations On Data
+------------------------
+
+Logical and Selection Operations
+--------------------------------
+
+Recall from `Expressions`_ that ``ispc`` short-circuits the evaluation of
+logical and selection operators: given an expression like ``(index < count
+&& array[index] == 0)``, then ``array[index] == 0`` is only evaluated if
+``index < count`` is true.  This property is useful for writing expressions
+like the preceding one, where the second expression may not be safe to
+evaluate in some cases.
+
+This short-circuiting can impose overhead in the generated code; additional
+operations are required to test the first value and to conditionally jump
+over the code that evaluates the second value.  The ``ispc`` compiler does
+try to mitigate this cost by detecting cases where it is both safe and
+inexpensive to evaluate both expressions, and skips short-circuiting in the
+generated code in this case (without there being any programmer-visible
+change in program behavior.)
+
+For cases where the compiler can't detect this case but the programmer
+wants to avoid short-circuiting behavior, the standard library provides a
+few helper functions.  First, ``and()`` and ``or()`` provide
+non-short-circuiting logical AND and OR operations.
+
+::
+
+    bool and(bool a, bool b)
+    bool or(bool a, bool b)
+    uniform bool and(uniform bool a, uniform bool b)
+    uniform bool or(uniform bool a, uniform bool b)
+
+And there are three variants of ``select()`` that select between two values
+based on a boolean condition.  These are the variants of ``select()`` for
+the ``int8`` type:
+
+::
+
+    int8 select(bool v, int8 a, int8 b)
+    int8 select(uniform bool v, int8 a, int8 b)
+    uniform int8 select(uniform bool v, uniform int8 a, uniform int8 b)
+
+There are also variants for ``int16``, ``int32``, ``int64``, ``float``, and
+``double`` types.
+
+Bit Operations
+--------------
+
+The various variants of ``popcnt()`` return the population count--the
+number of bits set in the given value.
+
+::
+
+    uniform int popcnt(uniform int v)
+    int popcnt(int v)
+    uniform int popcnt(bool v)
+
+
+A few functions determine how many leading bits in the given value are zero
+and how many of the trailing bits are zero; there are also ``unsigned``
+variants of these functions and variants that take ``int64`` and ``unsigned
+int64`` types.
+
+::
+
+    int32 count_leading_zeros(int32 v)
+    uniform int32 count_leading_zeros(uniform int32 v)
+    int32 count_trailing_zeros(int32 v)
+    uniform int32 count_trailing_zeros(uniform int32 v)
+
+Sometimes it's useful to convert a ``bool`` value to an integer using sign
+extension so that the integer's bits are all on if the ``bool`` has the
+value ``true`` (rather than just having the value one).  The
+``sign_extend()`` functions provide this functionality:
+
+::
+
+    int sign_extend(bool value) 
+    uniform int sign_extend(uniform bool value) 
+
+The ``intbits()`` and ``floatbits()`` functions can be used to implement
+low-level floating-point bit twiddling.  For example, ``intbits()`` returns
+an ``unsigned int`` that is a bit-for-bit copy of the given ``float``
+value.  (Note: it is **not** the same as ``(int)a``, but corresponds to
+something like ``*((int *)&a)`` in C.)
+
+::
+
+    float floatbits(unsigned int a);
+    uniform float floatbits(uniform unsigned int a);
+    unsigned int intbits(float a);
+    uniform unsigned int intbits(uniform float a);
+
+
+The ``intbits()`` and ``floatbits()`` functions have no cost at runtime;
+they just let the compiler know how to interpret the bits of the given
+value.  They make it possible to efficiently write functions that take
+advantage of the low-level bit representation of floating-point values.
+
+For example, the ``abs()`` function in the standard library is implemented
+as follows:
+
+::
+
+    float abs(float a) {
+        unsigned int i = intbits(a);
+        i &= 0x7fffffff;
+        return floatbits(i);
+    }
+
+This code directly clears the high order bit to ensure that the given
+floating-point value is positive.  This compiles down to a single ``andps``
+instruction when used with an Intel® SSE target, for example.
+
+
+
 Math Functions
 --------------

@@ -2919,77 +3060,6 @@ quite efficient.)
                               uniform unsigned int low,
                               uniform unsigned int high)

-Bit-Level Operations
--------------------
-
-
-The various variants of ``popcnt()`` return the population count--the
-number of bits set in the given value.
-
-::
-
-    uniform int popcnt(uniform int v)
-    int popcnt(int v)
-    uniform int popcnt(bool v)
-
-
-A few functions determine how many leading bits in the given value are zero
-and how many of the trailing bits are zero; there are also ``unsigned``
-variants of these functions and variants that take ``int64`` and ``unsigned
-int64`` types.
-
-::
-
-    int32 count_leading_zeros(int32 v)
-    uniform int32 count_leading_zeros(uniform int32 v)
-    int32 count_trailing_zeros(int32 v)
-    uniform int32 count_trailing_zeros(uniform int32 v)
-
-Sometimes it's useful to convert a ``bool`` value to an integer using sign
-extension so that the integer's bits are all on if the ``bool`` has the
-value ``true`` (rather than just having the value one).  The
-``sign_extend()`` functions provide this functionality:
-
-::
-
-    int sign_extend(bool value) 
-    uniform int sign_extend(uniform bool value) 
-
-The ``intbits()`` and ``floatbits()`` functions can be used to implement
-low-level floating-point bit twiddling.  For example, ``intbits()`` returns
-an ``unsigned int`` that is a bit-for-bit copy of the given ``float``
-value.  (Note: it is **not** the same as ``(int)a``, but corresponds to
-something like ``*((int *)&a)`` in C.
-
-::
-
-    float floatbits(unsigned int a);
-    uniform float floatbits(uniform unsigned int a);
-    unsigned int intbits(float a);
-    uniform unsigned int intbits(uniform float a);
-
-
-The ``intbits()`` and ``floatbits()`` functions have no cost at runtime;
-they just let the compiler know how to interpret the bits of the given
-value.  They make it possible to efficiently write functions that take
-advantage of the low-level bit representation of floating-point values.
-
-For example, the ``abs()`` function in the standard library is implemented
-as follows:
-
-::
-
-    float abs(float a) {
-        unsigned int i = intbits(a);
-        i &= 0x7fffffff;
-        return floatbits(i);
-    }
-
-This code directly clears the high order bit to ensure that the given
-floating-point value is positive.  This compiles down to a single ``andps``
-instruction when used with an Intel® SSE target, for example.
-
-
 Transcendental Functions
 ------------------------

@@ -3027,8 +3097,8 @@ The corresponding inverse functions are also available:
   uniform float acos(uniform float x)
   float atan(float x)
   uniform float atan(uniform float x)
-   float atan2(float x, float y)
-   uniform float atan2(uniform float x, uniform float y)
+   float atan2(float y, float x)
+   uniform float atan2(uniform float y, uniform float x)

 If both sine and cosine are needed, then the ``sincos()`` call computes
 both more efficiently than two calls to the respective individual
@@ -3077,7 +3147,7 @@ library.  State for the RNG is maintained in an instance of the
 ::

    struct RNGState;
-    void seed_rng(varying RNGState * uniform state, uniform int seed)
+    void seed_rng(varying RNGState * uniform state, int seed)
    void seed_rng(uniform RNGState * uniform state, uniform int seed)

 After the RNG is seeded, the ``random()`` function can be used to get a
@@ -3622,6 +3692,22 @@ precise.
    uniform int16 float_to_half_fast(uniform float f)


+Converting to sRGB8
+-------------------
+
+The sRGB color space is used in many applications in graphics and imaging;
+see the `Wikipedia page on sRGB`_ for more information.  The ``ispc``
+standard library provides two functions for converting floating-point color
+values to 8-bit values in the sRGB space.
+
+.. _Wikipedia page on sRGB: http://en.wikipedia.org/wiki/SRGB
+
+::
+
+    int float_to_srgb8(float v)
+    uniform int float_to_srgb8(uniform float v)
+
+
 Systems Programming Support
 ---------------------------

@@ -3732,6 +3818,13 @@ For global atomics, only atomic swap is available for these types:
  float atomic_swap_global(uniform float * uniform ptr, float value)
  double atomic_swap_global(uniform double * uniform ptr, double value)

+Finally, "swap" (but none of these other atomics) is available for pointer
+types:
+
+::
+
+  void *atomic_swap_{local,global}(void * * uniform ptr, void * value)
+
 There are also variants of the atomic that take ``uniform`` values for the
 operand and return a ``uniform`` result.  These correspond to a single
 atomic operation being performed for the entire gang of program instances,
@@ -3756,6 +3849,13 @@ rather than one per program instance.
  uniform int32 atomic_swap_{local,global}(uniform int32 * uniform ptr,
                                           uniform int32 newval)

+And similarly for pointers:
+
+::
+
+  uniform void *atomic_swap_{local,global}(void * * uniform ptr,
+                                           void *newval)
+
 Be careful that you use the atomic function that you mean to; consider the
 following code:

@@ -3797,12 +3897,18 @@ the same location in memory!)
  int32 atomic_xor_{local,global}(uniform int32 * varying ptr, int32 value)
  int32 atomic_swap_{local,global}(uniform int32 * varying ptr, int32 value)

+And:
+
+::
+
+  void *atomic_swap_{local,global}(void * * ptr, void *value)
+
 There are also atomic "compare and exchange" functions.  Compare and
 exchange atomically compares the value in "val" to "compare"--if they
 match, it assigns "newval" to "val".  In either case, the old value of
 "val" is returned.  (As with the other atomic operations, there are also
 ``unsigned`` and 64-bit variants of this function.  Furthermore, there are
-``float`` and ``double`` variants as well.)
+``float``, ``double``, and ``void *`` variants as well.)

 ::

@@ -3824,6 +3930,11 @@ code.

    void memory_barrier();

+Note that this barrier is *not* needed for coordinating reads and writes
+among the program instances in a gang; it's only needed for coordinating
+between multiple hardware threads running on different cores.  See the
+section `Data Races Within a Gang`_ for the guarantees provided about
+memory read/write ordering across a gang.

 Prefetches
 ----------
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -2,6 +2,24 @@
 ispc News
 =========

+ispc 1.2.1 is Released
+----------------------
+
+This is a bugfix release, fixing approximately 20 bugs in the system and
+improving error handling and error reporting.  New functionality includes
+very efficient float/half conversion routines thanks to Fabian 
+Giesen.  See the `1.2.1 release notes`_ for details.
+
+.. _1.2.1 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
+
+ispc 1.2.0 is Released
+-----------------------
+
+A new major release was posted on March 20, 2012.  This release includes
+significant new functionality for cleanly handling "structure of arrays"
+(SoA) data layout and a new model for how uniform and varying are handled
+with structure types.  
+
 Paper on ispc To Appear in InPar 2012
 -------------------------------------

--- a/docs/perfguide.rst
+++ b/docs/perfguide.rst
@@ -624,7 +624,7 @@ gathers happen.)

    extern "C" {
        void ISPCInstrument(const char *fn, const char *note, 
-                            int line, int mask);
+                            int line, uint64_t mask);
    }

 This function is passed the file name of the ``ispc`` file running, a short
@@ -637,7 +637,7 @@ as follows:

 ::

-   ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
+   ISPCInstrument("foo.ispc", "function entry", 55, 0xfull);

 This call indicates that the currently executing program has just
 entered the function defined at line 55 of the file ``foo.ispc``, with a
--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.

-PROJECT_NUMBER         = 1.2.0
+PROJECT_NUMBER         = 1.2.2

 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -210,7 +210,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
    RNGState rngstate;

-    seed_rng(&rngstate, y0);
+    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
    float invSamples = 1.f / nsubsamples;

    foreach_tiled(y = y0 ... y1, x = 0 ... w, 
--- a/examples/aobench_instrumented/ao.ispc
+++ b/examples/aobench_instrumented/ao.ispc
@@ -211,7 +211,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
    RNGState rngstate;

-    seed_rng(&rngstate, y0);
+    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));

    // Compute the mapping between the 'programCount'-wide program
    // instances running in parallel and samples in the image.  
--- a/examples/deferred/main.cpp
+++ b/examples/deferred/main.cpp
@@ -87,7 +87,7 @@ int main(int argc, char** argv) {
        framebuffer.clear();
        reset_and_start_timer();
        for (int j = 0; j < nframes; ++j)
-            ispc::RenderStatic(&input->header, &input->arrays, 
+            ispc::RenderStatic(input->header, input->arrays,
                               VISUALIZE_LIGHT_COUNT,
                               framebuffer.r, framebuffer.g, framebuffer.b);
        double mcycles = get_elapsed_mcycles() / nframes;
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -259,13 +259,13 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {                   \
   return ret;                                                      \
 }

-#define SMEAR(VTYPE, NAME, STYPE)               \
-static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {        \
-    VTYPE ret;                                  \
-    for (int i = 0; i < 16; ++i)                \
-        ret.v[i] = v;                           \
-    return ret;                                 \
-}                                               \
+#define SMEAR(VTYPE, NAME, STYPE)                                  \
+static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) {  \
+    VTYPE ret;                                                     \
+    for (int i = 0; i < 16; ++i)                                   \
+        ret.v[i] = v;                                              \
+    return ret;                                                    \
+}

 #define BROADCAST(VTYPE, NAME, STYPE)                 \
 static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) {   \
@@ -311,8 +311,8 @@ INSERT_EXTRACT(__vec1_d, double)
 ///////////////////////////////////////////////////////////////////////////
 // mask ops

-static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) {
-    return mask.v;
+static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
+    return (uint64_t)mask.v;
 }

 static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
@@ -339,6 +339,24 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) {
    return r;
 }

+static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) {
+    __vec16_i1 r;
+    r.v = ~v.v;
+    return r;
+}
+
+static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) {
+    __vec16_i1 r;
+    r.v = ~a.v & b.v;
+    return r;
+}
+
+static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) {
+    __vec16_i1 r;
+    r.v = a.v & ~b.v;
+    return r;
+}
+
 static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, 
                                       __vec16_i1 b) {
    __vec16_i1 r;
@@ -374,6 +392,12 @@ static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v, int align) {
    *ptr = v.v;
 }

+static FORCEINLINE __vec16_i1 __smear_i1(__vec16_i1, int v) {
+    return __vec16_i1(v, v, v, v, v, v, v, v, 
+                      v, v, v, v, v, v, v, v);
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // int8

@@ -581,6 +605,121 @@ ROTATE(__vec16_f, float, float)
 SHUFFLES(__vec16_f, float, float)
 LOAD_STORE(__vec16_f, float)

+static FORCEINLINE float __exp_uniform_float(float v) {
+    return expf(v);
+}
+
+static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
+    __vec16_f ret;
+    for (int i = 0; i < 16; ++i)
+        ret.v[i] = expf(v.v[i]);
+    return ret;
+}
+
+static FORCEINLINE float __log_uniform_float(float v) {
+    return logf(v);
+}
+
+static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) {
+    __vec16_f ret;
+    for (int i = 0; i < 16; ++i)
+        ret.v[i] = logf(v.v[i]);
+    return ret;
+}
+
+static FORCEINLINE float __pow_uniform_float(float a, float b) {
+    return powf(a, b);
+}
+
+static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) {
+    __vec16_f ret;
+    for (int i = 0; i < 16; ++i)
+        ret.v[i] = powf(a.v[i], b.v[i]);
+    return ret;
+}
+
+static FORCEINLINE int __intbits(float v) {
+    union {
+        float f;
+        int i;
+    } u;
+    u.f = v;
+    return u.i;
+}
+
+static FORCEINLINE float __floatbits(int v) {
+    union {
+        float f;
+        int i;
+    } u;
+    u.i = v;
+    return u.f;
+}
+
+static FORCEINLINE float __half_to_float_uniform(int16_t h) {
+    static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
+
+    int32_t o = ((int32_t)(h & 0x7fff)) << 13;     // exponent/mantissa bits
+    uint32_t exp = shifted_exp & o;   // just the exponent
+    o += (127 - 15) << 23;        // exponent adjust
+
+    // handle exponent special cases
+    if (exp == shifted_exp) // Inf/NaN?
+        o += (128 - 16) << 23;    // extra exp adjust
+    else if (exp == 0) { // Zero/Denormal?
+        o += 1 << 23;             // extra exp adjust
+        o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize
+    }
+
+    o |= ((int32_t)(h & 0x8000)) << 16;    // sign bit
+    return __floatbits(o);
+}
+
+
+static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) {
+    __vec16_f ret;
+    for (int i = 0; i < 16; ++i)
+        ret.v[i] = __half_to_float_uniform(v.v[i]);
+    return ret;
+}
+
+
+static FORCEINLINE int16_t __float_to_half_uniform(float f) {
+    uint32_t sign_mask = 0x80000000u;
+    int32_t o;
+
+    int32_t fint = __intbits(f);
+    int32_t sign = fint & sign_mask;
+    fint ^= sign;
+
+    int32_t f32infty = 255 << 23;
+    o = (fint > f32infty) ? 0x7e00 : 0x7c00; 
+
+    // (De)normalized number or zero
+    // update fint unconditionally to save the blending; we don't need it
+    // anymore for the Inf/NaN case anyway.
+    const uint32_t round_mask = ~0xfffu; 
+    const int32_t magic = 15 << 23;
+    const int32_t f16infty = 31 << 23;
+
+    int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask;
+    fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
+
+    if (fint < f32infty)
+        o = fint2 >> 13; // Take the bits!
+
+    return (o | (sign >> 16));
+}
+
+
+static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) {
+    __vec16_i16 ret;
+    for (int i = 0; i < 16; ++i)
+        ret.v[i] = __float_to_half_uniform(v.v[i]);
+    return ret;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // double

--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -224,8 +224,8 @@ CAST_BITS_SCALAR(double, int64_t)
 ///////////////////////////////////////////////////////////////////////////
 // mask ops

-static FORCEINLINE uint32_t __movmsk(__vec4_i1 mask) {
-    return _mm_movemask_ps(mask.v);
+static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) {
+    return (uint64_t)_mm_movemask_ps(mask.v);
 }

 static FORCEINLINE __vec4_i1 __equal(__vec4_i1 a, __vec4_i1 b) {
@@ -266,6 +266,10 @@ static FORCEINLINE void __store(__vec4_i1 *p, __vec4_i1 value, int align) {
    _mm_storeu_ps((float *)(&p->v), value.v);
 }

+static FORCEINLINE __vec4_i1 __smear_i1(__vec4_i1, int v) {
+    return __vec4_i1(v, v, v, v);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8

@@ -489,7 +493,7 @@ static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) {
    ((int8_t *)v)[index] = val;
 }

-static FORCEINLINE __vec4_i8 __smear_i8(int8_t v) {
+static FORCEINLINE __vec4_i8 __smear_i8(__vec4_i8, int8_t v) {
    return _mm_set1_epi8(v);
 }

@@ -748,7 +752,7 @@ static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val)
    ((int16_t *)v)[index] = val;
 }

-static FORCEINLINE __vec4_i16 __smear_i16(int16_t v) {
+static FORCEINLINE __vec4_i16 __smear_i16(__vec4_i16, int16_t v) {
    return _mm_set1_epi16(v);
 }

@@ -985,7 +989,7 @@ static FORCEINLINE __vec4_i32 __select(__vec4_i1 mask, __vec4_i32 a, __vec4_i32
                                          _mm_castsi128_ps(a.v), mask.v));
 }

-static FORCEINLINE __vec4_i32 __smear_i32(int32_t v) {
+static FORCEINLINE __vec4_i32 __smear_i32(__vec4_i32, int32_t v) {
    return _mm_set1_epi32(v);
 }

@@ -1246,7 +1250,7 @@ static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64
    return __vec4_i64(_mm_castpd_si128(r0), _mm_castpd_si128(r1));
 }

-static FORCEINLINE __vec4_i64 __smear_i64(int64_t v) {
+static FORCEINLINE __vec4_i64 __smear_i64(__vec4_i64, int64_t v) {
    return __vec4_i64(v, v, v, v);
 }

@@ -1350,7 +1354,7 @@ static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) {
    return _mm_blendv_ps(b.v, a.v, mask.v);
 }

-static FORCEINLINE __vec4_f __smear_float(float v) {
+static FORCEINLINE __vec4_f __smear_float(__vec4_f, float v) {
    return _mm_set1_ps(v);
 }

@@ -1482,7 +1486,7 @@ static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) {
    return __vec4_d(r0, r1);
 }

-static FORCEINLINE __vec4_d __smear_double(double v) {
+static FORCEINLINE __vec4_d __smear_double(__vec4_d, double v) {
    return __vec4_d(_mm_set1_pd(v), _mm_set1_pd(v));
 }

@@ -1582,11 +1586,13 @@ static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i8 val) {
 }

 static FORCEINLINE __vec4_i8 __cast_sext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(0xff), __smear_i8(0));
+    return __select(v, __smear_i8(__vec4_i8(), 0xff), 
+                       __smear_i8(__vec4_i8(), 0));
 }

 static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(0xffff), __smear_i16(0));
+    return __select(v, __smear_i16(__vec4_i16(), 0xffff),
+                       __smear_i16(__vec4_i16(), 0));
 }

 static FORCEINLINE __vec4_i32 __cast_sext(__vec4_i32, __vec4_i1 v) {
@@ -1646,11 +1652,12 @@ static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i8 val) {
 }

 static FORCEINLINE __vec4_i8 __cast_zext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(1), __smear_i8(0));
+    return __select(v, __smear_i8(__vec4_i8(), 1), __smear_i8(__vec4_i8(), 0));
 }

 static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(1), __smear_i16(0));
+    return __select(v, __smear_i16(__vec4_i16(), 1), 
+                       __smear_i16(__vec4_i16(), 0));
 }

 static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
@@ -1658,7 +1665,7 @@ static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
 }

 static FORCEINLINE __vec4_i64 __cast_zext(__vec4_i64, __vec4_i1 v) {
-    return __select(v, __smear_i64(1), __smear_i64(0));
+    return __select(v, __smear_i64(__vec4_i64(), 1), __smear_i64(__vec4_i64(), 0));
 }

 // truncations
@@ -1818,11 +1825,11 @@ static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i64 val) {
 }

 static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i1 v) {
-    return __select(v, __smear_float(1.), __smear_float(0.));
+    return __select(v, __smear_float(__vec4_f(), 1.), __smear_float(__vec4_f(), 0.));
 }

 static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i1 v) {
-    return __select(v, __smear_double(1.), __smear_double(0.));
+    return __select(v, __smear_double(__vec4_d(), 1.), __smear_double(__vec4_d(), 0.));
 }

 // float/double to signed int
@@ -2613,8 +2620,8 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
    RetScalar r[4];
 #if 1
    // "Fast gather" trick...
-    offsets = __select(mask, offsets, __smear_i32(0));
-    constOffset = __select(mask, constOffset, __smear_i32(0));
+    offsets = __select(mask, offsets, __smear_i32(__vec4_i32(), 0));
+    constOffset = __select(mask, constOffset, __smear_i32(__vec4_i32(), 0));

    int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
    RetScalar *ptr = (RetScalar *)(p + offset);
@@ -2671,8 +2678,8 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
    RetScalar r[4];
 #if 1
    // "Fast gather" trick...
-    offsets = __select(mask, offsets, __smear_i64(0));
-    constOffset = __select(mask, constOffset, __smear_i64(0));
+    offsets = __select(mask, offsets, __smear_i64(__vec4_i64(), 0));
+    constOffset = __select(mask, constOffset, __smear_i64(__vec4_i64(), 0));

    int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
    RetScalar *ptr = (RetScalar *)(p + offset);
@@ -2756,8 +2763,8 @@ __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
    __m128i r = _mm_set_epi32(0, 0, 0, 0);
 #if 1
    // "Fast gather"...
-    offsets = __select(mask, offsets, __smear_i32(0));
-    constOffset = __select(mask, constOffset, __smear_i32(0));
+    offsets = __select(mask, offsets, __smear_i32(__vec4_i32(), 0));
+    constOffset = __select(mask, constOffset, __smear_i32(__vec4_i32(), 0));

    int offset = scale * _mm_extract_epi32(offsets.v, 0) +
        _mm_extract_epi32(constOffset.v, 0);
--- a/examples/timing.h
+++ b/examples/timing.h
@@ -43,9 +43,15 @@ extern "C" {
 #endif /* __cplusplus */
    __inline__ uint64_t rdtsc() {
        uint32_t low, high;
+#ifdef __x86_64
        __asm__ __volatile__ (
            "xorl %%eax,%%eax \n    cpuid"
            ::: "%rax", "%rbx", "%rcx", "%rdx" );
+#else
+        __asm__ __volatile__ (
+            "xorl %%eax,%%eax \n    cpuid"
+            ::: "%eax", "%ebx", "%ecx", "%edx" );
+#endif
        __asm__ __volatile__ (
                              "rdtsc" : "=a" (low), "=d" (high));
        return (uint64_t)high << 32 | low;
--- a/expr.cpp
+++ b/expr.cpp
--- a/expr.h
+++ b/expr.h
@@ -284,6 +284,10 @@ public:
    int EstimateCost() const;

    Expr *baseExpr, *index;
+
+private:
+    mutable const Type *type;
+    mutable const PointerType *lvalueType;
 };


@@ -320,6 +324,9 @@ public:
        member is found.  (i.e. this is true if the MemberExpr was a '->'
        operator, and is false if it was a '.' operator. */
    bool dereferenceExpr;
+
+protected:
+    mutable const Type *type, *lvalueType;
 };


@@ -584,6 +591,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    int EstimateCost() const;
+    llvm::Constant *GetConstant(const Type *type) const;

    Expr *expr;
 };
@@ -651,20 +659,26 @@ public:
        function overloading, this method resolves which actual function
        the arguments match best.  If the argCouldBeNULL parameter is
        non-NULL, each element indicates whether the corresponding argument
-        is the number zero, indicating that it could be a NULL pointer.
-        This parameter may be NULL (for cases where overload resolution is
-        being done just given type information without the parameter
-        argument expressions being available.  It returns true on success.
+        is the number zero, indicating that it could be a NULL pointer, and
+        if argIsConstant is non-NULL, each element indicates whether the
+        corresponding argument is a compile-time constant value.  Both of
+        these parameters may be NULL (for cases where overload resolution
+        is being done just given type information without the parameter
+        argument expressions being available).  This function returns true
+        on success.
     */
    bool ResolveOverloads(SourcePos argPos,
                          const std::vector<const Type *> &argTypes,
-                          const std::vector<bool> *argCouldBeNULL = NULL);
+                          const std::vector<bool> *argCouldBeNULL = NULL,
+                          const std::vector<bool> *argIsConstant = NULL);
    Symbol *GetMatchingFunction();

 private:
-    bool tryResolve(int (*matchFunc)(const Type *, const Type *),
-                    SourcePos argPos, const std::vector<const Type *> &argTypes,
-                    const std::vector<bool> *argCouldBeNULL);
+    std::vector<Symbol *> getCandidateFunctions(int argCount) const;
+    static int computeOverloadCost(const FunctionType *ftype,
+                                   const std::vector<const Type *> &argTypes,
+                                   const std::vector<bool> *argCouldBeNULL,
+                            const std::vector<bool> *argIsConstant);

    /** Name of the function that is being called. */
    std::string name;
--- a/func.cpp
+++ b/func.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -66,9 +66,8 @@
 #include <llvm/Support/ToolOutputFile.h>
 #include <llvm/Assembly/PrintModulePass.h>

-Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
+Function::Function(Symbol *s, Stmt *c) {
    sym = s;
-    args = a;
    code = c;

    maskSymbol = m->symbolTable->LookupVariable("__mask");
@@ -101,12 +100,20 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
        printf("\n\n\n");
    }

-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
+    const FunctionType *type = CastType<FunctionType>(sym->type);
    Assert(type != NULL);

-    for (unsigned int i = 0; i < args.size(); ++i)
-        if (dynamic_cast<const ReferenceType *>(args[i]->type) == NULL)
-            args[i]->parentFunction = this;
+    for (int i = 0; i < type->GetNumParameters(); ++i) {
+        const char *paramName = type->GetParameterName(i).c_str();
+        Symbol *sym = m->symbolTable->LookupVariable(paramName);
+        if (sym == NULL)
+            Assert(strncmp(paramName, "__anon_parameter_", 17) == 0);
+        args.push_back(sym);
+
+        const Type *t = type->GetParameterType(i);
+        if (sym != NULL && CastType<ReferenceType>(t) == NULL)
+            sym->parentFunction = this;
+    }

    if (type->isTask) {
        threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
@@ -125,7 +132,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {

 const Type *
 Function::GetReturnType() const {
-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
+    const FunctionType *type = CastType<FunctionType>(sym->type);
    Assert(type != NULL);
    return type->GetReturnType();
 }
@@ -133,7 +140,7 @@ Function::GetReturnType() const {

 const FunctionType *
 Function::GetType() const {
-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
+    const FunctionType *type = CastType<FunctionType>(sym->type);
    Assert(type != NULL);
    return type;
 }
@@ -145,7 +152,8 @@ Function::GetType() const {
    'mem2reg' pass will in turn promote to SSA registers..
 */
 static void
-lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol *> &args,
+lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const 
+                     std::vector<Symbol *> &args,
                     FunctionEmitContext *ctx) {
    // We expect the argument structure to come in as a pointer to a
    // structure.  Confirm and figure out its type here.
@@ -157,9 +165,13 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
        llvm::dyn_cast<const llvm::StructType>(pt->getElementType());

    // Get the type of the argument we're copying in and its Symbol pointer
-    LLVM_TYPE_CONST llvm::Type *argType = argStructType->getElementType(i);
+    llvm::Type *argType = argStructType->getElementType(i);
    Symbol *sym = args[i];

+    if (sym == NULL)
+        // anonymous parameter, so don't worry about it
+        return;
+
    // allocate space to copy the parameter in to
    sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());

@@ -170,7 +182,7 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
    // memory
    llvm::Value *ptrval = ctx->LoadInst(ptr, sym->name.c_str());
    ctx->StoreInst(ptrval, sym->storagePtr);
-    ctx->EmitFunctionParameterDebugInfo(sym);
+    ctx->EmitFunctionParameterDebugInfo(sym, i);
 }


@@ -186,14 +198,14 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
    // value
    maskSymbol->storagePtr = ctx->GetFullMaskPointer();

-    // add debugging info for __mask, programIndex, ...
+    // add debugging info for __mask
    maskSymbol->pos = firstStmtPos;
    ctx->EmitVariableDebugInfo(maskSymbol);

 #if 0
    llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
 #endif
-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
+    const FunctionType *type = CastType<FunctionType>(sym->type);
    Assert(type != NULL);
    if (type->isTask == true) {
        // For tasks, there should always be three parameters: the
@@ -240,13 +252,17 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
        llvm::Function::arg_iterator argIter = function->arg_begin(); 
        for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
            Symbol *sym = args[i];
+            if (sym == NULL)
+                // anonymous function parameter
+                continue;
+
            argIter->setName(sym->name.c_str());

            // Allocate stack storage for the parameter and emit code
            // to store its value there.
            sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
            ctx->StoreInst(argIter, sym->storagePtr);
-            ctx->EmitFunctionParameterDebugInfo(sym);
+            ctx->EmitFunctionParameterDebugInfo(sym, i);
        }

        // If the number of actual function arguments is equal to the
@@ -415,11 +431,11 @@ Function::GenerateIR() {
        // If the function is 'export'-qualified, emit a second version of
        // it without a mask parameter and without name mangling so that
        // the application can call it
-        const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
+        const FunctionType *type = CastType<FunctionType>(sym->type);
        Assert(type != NULL);
        if (type->isExported) {
            if (!type->isTask) {
-                LLVM_TYPE_CONST llvm::FunctionType *ftype = 
+                llvm::FunctionType *ftype = 
                    type->LLVMFunctionType(g->ctx);
                llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
                std::string functionName = sym->name;
--- a/func.h
+++ b/func.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2011-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -43,7 +43,7 @@

 class Function {
 public:
-    Function(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code);
+    Function(Symbol *sym, Stmt *code);

    const Type *GetReturnType() const;
    const FunctionType *GetType() const;
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -54,14 +54,8 @@
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-  #include <llvm/Support/TargetRegistry.h>
-  #include <llvm/Support/TargetSelect.h>
-#else
-  #include <llvm/Target/TargetRegistry.h>
-  #include <llvm/Target/TargetSelect.h>
-  #include <llvm/Target/SubtargetFeature.h>
-#endif
+#include <llvm/Support/TargetRegistry.h>
+#include <llvm/Support/TargetSelect.h>
 #include <llvm/Support/Host.h>

 Globals *g;
@@ -70,9 +64,82 @@ Module *m;
 ///////////////////////////////////////////////////////////////////////////
 // Target

+#ifndef ISPC_IS_WINDOWS
+static void __cpuid(int info[4], int infoType) {
+    __asm__ __volatile__ ("cpuid"
+                          : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
+                          : "0" (infoType));
+}
+
+/* Save %ebx in case it's the PIC register */
+static void __cpuidex(int info[4], int level, int count) {
+  __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
+                        "cpuid\n\t"
+                        "xchg{l}\t{%%}ebx, %1\n\t"
+                        : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
+                        : "0" (level), "2" (count));
+}
+#endif // ISPC_IS_WINDOWS
+
+
+static const char *
+lGetSystemISA() {
+    int info[4];
+    __cpuid(info, 1);
+
+    if ((info[2] & (1 << 28)) != 0) {
+        // AVX1 for sure. Do we have AVX2?
+        // Call cpuid with eax=7, ecx=0
+        __cpuidex(info, 7, 0);
+        if ((info[1] & (1 << 5)) != 0)
+            return "avx2";
+        else
+            return "avx";
+    }
+    else if ((info[2] & (1 << 19)) != 0)
+        return "sse4";
+    else if ((info[3] & (1 << 26)) != 0)
+        return "sse2";
+    else {
+        fprintf(stderr, "Unable to detect supported SSE/AVX ISA.  Exiting.\n");
+        exit(1);
+    }
+}
+
+
+static const char *supportedCPUs[] = { 
+    "atom", "penryn", "core2", "corei7", "corei7-avx"
+};
+
+
 bool
 Target::GetTarget(const char *arch, const char *cpu, const char *isa,
                  bool pic, Target *t) {
+    if (isa == NULL) {
+        if (cpu != NULL) {
+            // If a CPU was specified explicitly, try to pick the best
+            // possible ISA based on that.
+            if (!strcmp(cpu, "sandybridge") ||
+                !strcmp(cpu, "corei7-avx"))
+                isa = "avx";
+            else if (!strcmp(cpu, "corei7") ||
+                     !strcmp(cpu, "penryn"))
+                isa = "sse4";
+            else
+                isa = "sse2";
+            fprintf(stderr, "Notice: no --target specified on command-line.  "
+                    "Using ISA \"%s\" based on specified CPU \"%s\".\n", isa,
+                    cpu);
+        }
+        else {
+            // No CPU and no ISA, so use CPUID to figure out what this CPU
+            // supports.
+            isa = lGetSystemISA();
+            fprintf(stderr, "Notice: no --target specified on command-line.  "
+                    "Using system ISA \"%s\".\n", isa);
+        }
+    }
+
    if (cpu == NULL) {
        std::string hostCPU = llvm::sys::getHostCPUName();
        if (hostCPU.size() > 0)
@@ -82,19 +149,24 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
            cpu = "generic";
        }
    }
+    else {
+        bool foundCPU = false;
+        for (int i = 0; i < int(sizeof(supportedCPUs) / sizeof(supportedCPUs[0])); 
+             ++i) {
+            if (!strcmp(cpu, supportedCPUs[i])) {
+                foundCPU = true;
+                break;
+            }
+        }
+        if (foundCPU == false) {
+            fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: "
+                    "%s.\n", cpu, SupportedTargetCPUs().c_str());
+            return false;
+        }
+    }
+
    t->cpu = cpu;

-    if (isa == NULL) {
-        if (!strcasecmp(cpu, "atom"))
-            isa = "sse2";
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-        else if (!strcasecmp(cpu, "sandybridge") ||
-                 !strcasecmp(cpu, "corei7-avx"))
-            isa = "avx";
-#endif // LLVM_3_0
-        else
-            isa = "sse4";
-    }
    if (arch == NULL)
        arch = "x86-64";

@@ -125,13 +197,15 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->arch = arch;
    }

+    // Default for most targets: no native half or transcendentals; ISA cases below override.
+    t->hasHalf = t->hasTranscendentals = false;
+
    if (!strcasecmp(isa, "sse2")) {
        t->isa = Target::SSE2;
        t->nativeVectorWidth = 4;
        t->vectorWidth = 4;
        t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
        t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
    }
    else if (!strcasecmp(isa, "sse2-x2")) {
@@ -140,7 +214,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->vectorWidth = 8;
        t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
        t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
    }
    else if (!strcasecmp(isa, "sse4")) {
@@ -149,7 +222,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->vectorWidth = 4;
        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
        t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
    }
    else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
@@ -158,7 +230,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->vectorWidth = 8;
        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
        t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
    }
    else if (!strcasecmp(isa, "generic-4")) {
@@ -166,41 +237,59 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->nativeVectorWidth = 4;
        t->vectorWidth = 4;
        t->maskingIsFree = true;
-        t->allOffMaskIsSafe = true;
        t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
    }
    else if (!strcasecmp(isa, "generic-8")) {
        t->isa = Target::GENERIC;
        t->nativeVectorWidth = 8;
        t->vectorWidth = 8;
        t->maskingIsFree = true;
-        t->allOffMaskIsSafe = true;
        t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
    }
    else if (!strcasecmp(isa, "generic-16")) {
        t->isa = Target::GENERIC;
        t->nativeVectorWidth = 16;
        t->vectorWidth = 16;
        t->maskingIsFree = true;
-        t->allOffMaskIsSafe = true;
        t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
+    }
+    else if (!strcasecmp(isa, "generic-32")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 32;
+        t->vectorWidth = 32;
+        t->maskingIsFree = true;
+        t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
+    }
+    else if (!strcasecmp(isa, "generic-64")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 64;
+        t->vectorWidth = 64;
+        t->maskingIsFree = true;
+        t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
    }
    else if (!strcasecmp(isa, "generic-1")) {
        t->isa = Target::GENERIC;
        t->nativeVectorWidth = 1;
        t->vectorWidth = 1;
        t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
    }
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
    else if (!strcasecmp(isa, "avx")) {
        t->isa = Target::AVX;
        t->nativeVectorWidth = 8;
        t->vectorWidth = 8;
        t->attributes = "+avx,+popcnt,+cmov";
        t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
    }
    else if (!strcasecmp(isa, "avx-x2")) {
@@ -209,19 +298,17 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->vectorWidth = 16;
        t->attributes = "+avx,+popcnt,+cmov";
        t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
    }
-#endif // LLVM 3.0+
-#if defined(LLVM_3_1svn)
+#ifndef LLVM_3_0
    else if (!strcasecmp(isa, "avx2")) {
        t->isa = Target::AVX2;
        t->nativeVectorWidth = 8;
        t->vectorWidth = 8;
        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
        t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
+        t->hasHalf = true;
    }
    else if (!strcasecmp(isa, "avx2-x2")) {
        t->isa = Target::AVX2;
@@ -229,10 +316,10 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->vectorWidth = 16;
        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
        t->maskingIsFree = false;
-        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
+        t->hasHalf = true;
    }
-#endif // LLVM 3.1
+#endif // !LLVM_3_0
    else {
        fprintf(stderr, "Target ISA \"%s\" is unknown.  Choices are: %s\n", 
                isa, SupportedTargetISAs());
@@ -243,23 +330,23 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        llvm::TargetMachine *targetMachine = t->GetTargetMachine();
        const llvm::TargetData *targetData = targetMachine->getTargetData();
        t->is32Bit = (targetData->getPointerSize() == 4);
+        Assert(t->vectorWidth <= ISPC_MAX_NVEC);
    }

    return !error;
 }


-const char *
+std::string
 Target::SupportedTargetCPUs() {
-    return "atom, barcelona, core2, corei7, "
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-        "corei7-avx, "
-#endif
-        "istanbul, nocona, penryn, "
-#ifdef LLVM_2_9
-        "sandybridge, "
-#endif
-        "westmere";
+    std::string ret;
+    int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]);
+    for (int i = 0; i < count; ++i) {
+        ret += supportedCPUs[i];
+        if (i != count - 1)
+            ret += ", ";
+    }
+    return ret;
 }


@@ -271,14 +358,11 @@ Target::SupportedTargetArchs() {

 const char *
 Target::SupportedTargetISAs() {
-    return "sse2, sse2-x2, sse4, sse4-x2"
-#ifndef LLVM_2_9
-        ", avx, avx-x2"
-#endif // !LLVM_2_9
-#ifdef LLVM_3_1svn
+    return "sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" // keep in sync with the ISA names matched in Target::GetTarget()
+#ifndef LLVM_3_0
         ", avx2, avx2-x2"
-#endif // LLVM_3_1svn
-        ", generic-4, generic-8, generic-16, generic-1";
+#endif // !LLVM_3_0
+        ", generic-1, generic-4, generic-8, generic-16, generic-32, generic-64";
 }


@@ -286,10 +370,10 @@ std::string
 Target::GetTripleString() const {
    llvm::Triple triple;
    // Start with the host triple as the default
-#if defined(LLVM_3_1) || defined(LLVM_3_1svn)
-    triple.setTriple(llvm::sys::getDefaultTargetTriple());
-#else
+#ifdef LLVM_3_0
    triple.setTriple(llvm::sys::getHostTriple());
+#else
+    triple.setTriple(llvm::sys::getDefaultTargetTriple());
 #endif

    // And override the arch in the host triple based on what the user
@@ -315,30 +399,17 @@ Target::GetTargetMachine() const {

    llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ : 
                                                  llvm::Reloc::Default;
-#if defined(LLVM_3_1svn)
-    std::string featuresString = attributes;
-    llvm::TargetOptions options;
-    if (g->opt.fastMath == true)
-        options.UnsafeFPMath = 1;
-    llvm::TargetMachine *targetMachine = 
-        target->createTargetMachine(triple, cpu, featuresString, options,
-                                    relocModel);
-#elif defined(LLVM_3_0)
+#ifdef LLVM_3_0
    std::string featuresString = attributes;
    llvm::TargetMachine *targetMachine = 
        target->createTargetMachine(triple, cpu, featuresString, relocModel);
-#else // LLVM 2.9
-#ifdef ISPC_IS_APPLE
-    relocModel = llvm::Reloc::PIC_;
-#endif // ISPC_IS_APPLE
-    std::string featuresString = cpu + std::string(",") + attributes;
+#else
+    std::string featuresString = attributes;
+    llvm::TargetOptions options;
    llvm::TargetMachine *targetMachine = 
-        target->createTargetMachine(triple, featuresString);
-#ifndef ISPC_IS_WINDOWS
-    targetMachine->setRelocationModel(relocModel);
-#endif // !ISPC_IS_WINDOWS
-#endif // LLVM_2_9
-
+        target->createTargetMachine(triple, cpu, featuresString, options,
+                                    relocModel);
+#endif // !LLVM_3_0
    Assert(targetMachine != NULL);

    targetMachine->setAsmVerbosityDefault(true);
@@ -367,7 +438,7 @@ Target::GetISAString() const {


 static bool
-lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
+lGenericTypeLayoutIndeterminate(llvm::Type *type) {
    if (type->isPrimitiveType() || type->isIntegerTy())
        return false;

@@ -376,18 +447,18 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
        type == LLVMTypes::Int1VectorType)
        return true;

-    LLVM_TYPE_CONST llvm::ArrayType *at = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
+    llvm::ArrayType *at = 
+        llvm::dyn_cast<llvm::ArrayType>(type);
    if (at != NULL)
        return lGenericTypeLayoutIndeterminate(at->getElementType());

-    LLVM_TYPE_CONST llvm::PointerType *pt = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
+    llvm::PointerType *pt = 
+        llvm::dyn_cast<llvm::PointerType>(type);
    if (pt != NULL)
        return false;

-    LLVM_TYPE_CONST llvm::StructType *st =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
+    llvm::StructType *st =
+        llvm::dyn_cast<llvm::StructType>(type);
    if (st != NULL) {
        for (int i = 0; i < (int)st->getNumElements(); ++i)
            if (lGenericTypeLayoutIndeterminate(st->getElementType(i)))
@@ -395,29 +466,24 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
        return false;
    }

-    Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));
+    Assert(llvm::isa<llvm::VectorType>(type));
    return true;
 }


 llvm::Value *
-Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type, 
+Target::SizeOf(llvm::Type *type, 
               llvm::BasicBlock *insertAtEnd) {
    if (isa == Target::GENERIC &&
        lGenericTypeLayoutIndeterminate(type)) {
        llvm::Value *index[1] = { LLVMInt32(1) };
-        LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
+        llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
        llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
        llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
        llvm::Instruction *gep = 
            llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "sizeof_gep",
                                            insertAtEnd);
-#else
-        llvm::Instruction *gep =
-            llvm::GetElementPtrInst::Create(voidPtr, &index[0], &index[1],
-                                            "sizeof_gep", insertAtEnd);
-#endif
+
        if (is32Bit || g->opt.force32BitAddressing)
            return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type, 
                                          "sizeof_int", insertAtEnd);
@@ -428,7 +494,9 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,

    const llvm::TargetData *td = GetTargetMachine()->getTargetData();
    Assert(td != NULL);
-    uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
+    uint64_t bitSize = td->getTypeSizeInBits(type);
+    Assert((bitSize % 8) == 0);
+    uint64_t byteSize = bitSize / 8;
    if (is32Bit || g->opt.force32BitAddressing)
        return LLVMInt32((int32_t)byteSize);
    else
@@ -437,23 +505,18 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,


 llvm::Value *
-Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
+Target::StructOffset(llvm::Type *type, int element,
                     llvm::BasicBlock *insertAtEnd) {
    if (isa == Target::GENERIC && 
        lGenericTypeLayoutIndeterminate(type) == true) {
        llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(element) };
-        LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
+        llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
        llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
        llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
        llvm::Instruction *gep = 
            llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "offset_gep",
                                            insertAtEnd);
-#else
-        llvm::Instruction *gep =
-            llvm::GetElementPtrInst::Create(voidPtr, &indices[0], &indices[2],
-                                            "offset_gep", insertAtEnd);
-#endif
+
        if (is32Bit || g->opt.force32BitAddressing)
            return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type, 
                                          "offset_int", insertAtEnd);
@@ -464,9 +527,12 @@ Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,

    const llvm::TargetData *td = GetTargetMachine()->getTargetData();
    Assert(td != NULL);
-    LLVM_TYPE_CONST llvm::StructType *structType = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
-    Assert(structType != NULL);
+    llvm::StructType *structType = 
+        llvm::dyn_cast<llvm::StructType>(type);
+    if (structType == NULL || structType->isSized() == false) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
    const llvm::StructLayout *sl = td->getStructLayout(structType);
    Assert(sl != NULL);

@@ -552,7 +618,9 @@ llvm::DIFile
 SourcePos::GetDIFile() const {
    std::string directory, filename;
    GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
-    return m->diBuilder->createFile(filename, directory);
+    llvm::DIFile ret = m->diBuilder->createFile(filename, directory);
+    Assert(ret.Verify());
+    return ret;
 }


--- a/ispc.h
+++ b/ispc.h
@@ -38,10 +38,10 @@
 #ifndef ISPC_H
 #define ISPC_H

-#define ISPC_VERSION "1.2.1dev"
+#define ISPC_VERSION "1.2.3dev"

-#if !defined(LLVM_2_9) && !defined(LLVM_3_0) && !defined(LLVM_3_0svn) && !defined(LLVM_3_1svn)
-#error "Only LLVM 2.9, 3.0, and the 3.1 development branch are supported"
+#if !defined(LLVM_3_0) && !defined(LLVM_3_1) && !defined(LLVM_3_2)
+#error "Only LLVM 3.0, 3.1, and the 3.2 development branch are supported"
 #endif

 #if defined(_WIN32) || defined(_WIN64)
@@ -58,20 +58,10 @@
 #include <vector>
 #include <string>

-#define Assert(expr)                                            \
-    ((void)((expr) ? 0 : __Assert (#expr, __FILE__, __LINE__)))
-#define __Assert(expr, file, line)                                      \
-    ((void)fprintf(stderr, "%s:%u: Assertion failed: \"%s\"\n"          \
-                   "***\n*** Please file a bug report at "              \
-                   "https://github.com/ispc/ispc/issues\n*** (Including as much " \
-                   "information as you can about how to reproduce this error).\n" \
-                   "*** You have apparently encountered a bug in the compiler that " \
-                   "we'd like to fix!\n***\n", file, line, expr), abort(), 0)
-
 /** @def ISPC_MAX_NVEC maximum vector size of any of the compliation
    targets.
 */
-#define ISPC_MAX_NVEC 16
+#define ISPC_MAX_NVEC 64

 // Forward declarations of a number of widely-used LLVM types
 namespace llvm {
@@ -92,12 +82,6 @@ namespace llvm {
    class Value;
 }

-// llvm::Type *s are no longer const in llvm 3.0
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-#define LLVM_TYPE_CONST
-#else
-#define LLVM_TYPE_CONST const
-#endif

 class ArrayType;
 class AST;
@@ -116,6 +100,15 @@ class SymbolTable;
 class Type;
 struct VariableDeclaration;

+enum StorageClass {
+    SC_NONE,
+    SC_EXTERN,
+    SC_STATIC,
+    SC_TYPEDEF,
+    SC_EXTERN_C
+};
+
+
 /** @brief Representation of a range of positions in a source file.

    This class represents a range of characters in a source file
@@ -142,11 +135,25 @@ struct SourcePos {
    bool operator==(const SourcePos &p2) const;
 };

+
 /** Returns a SourcePos that encompasses the extent of both of the given
    extents. */
 SourcePos Union(const SourcePos &p1, const SourcePos &p2);


+
+// Assert
+
+extern void DoAssert(const char *file, int line, const char *expr);
+extern void DoAssertPos(SourcePos pos, const char *file, int line, const char *expr);
+
+#define Assert(expr)                                            \
+    ((void)((expr) ? 0 : ((void)DoAssert (__FILE__, __LINE__, #expr), 0)))
+
+#define AssertPos(pos, expr)                                     \
+    ((void)((expr) ? 0 : ((void)DoAssertPos (pos, __FILE__, __LINE__, #expr), 0)))
+
+
 /** @brief Structure that defines a compilation target 

    This structure defines a compilation target for the ispc compiler.
@@ -164,7 +171,7 @@ struct Target {

    /** Returns a comma-delimited string giving the names of the currently
        supported target CPUs. */
-    static const char *SupportedTargetCPUs();
+    static std::string SupportedTargetCPUs();

    /** Returns a comma-delimited string giving the names of the currently
        supported target architectures. */
@@ -182,13 +189,13 @@ struct Target {
    const char *GetISAString() const;

    /** Returns the size of the given type */
-    llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *SizeOf(llvm::Type *type,
                        llvm::BasicBlock *insertAtEnd);

    /** Given a structure type and an element number in the structure,
        returns a value corresponding to the number of bytes from the start
        of the structure where the element is located. */
-    llvm::Value *StructOffset(LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *StructOffset(llvm::Type *type,
                              int element, llvm::BasicBlock *insertAtEnd);

    /** llvm Target object representing this target. */
@@ -236,16 +243,18 @@ struct Target {
        natively. */
    bool maskingIsFree;

-    /** Is it safe to run code with the mask all if: e.g. on SSE, the fast
-        gather trick assumes that at least one program instance is running
-        (so that it can safely assume that the array base pointer is
-        valid). */
-    bool allOffMaskIsSafe;
-
    /** How many bits are used to store each element of the mask: e.g. this
        is 32 on SSE/AVX, since that matches the HW better, but it's 1 for
        the generic target. */
    int maskBitCount;
+
+    /** Indicates whether the target has native support for float/half
+        conversions. */
+    bool hasHalf;
+
+    /** Indicates whether the target has support for transcendentals (beyond
+        sqrt, which we assume that all of them handle). */
+    bool hasTranscendentals;
 };


--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -29,6 +29,8 @@
    <ClCompile Include="gen-bitcode-generic-4.cpp" />
    <ClCompile Include="gen-bitcode-generic-8.cpp" />
    <ClCompile Include="gen-bitcode-generic-16.cpp" />
+    <ClCompile Include="gen-bitcode-generic-32.cpp" />
+    <ClCompile Include="gen-bitcode-generic-64.cpp" />
    <ClCompile Include="gen-bitcode-sse2.cpp" />
    <ClCompile Include="gen-bitcode-sse2-x2.cpp" />
    <ClCompile Include="gen-bitcode-sse4.cpp" />
@@ -264,6 +266,32 @@
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-16.cpp</Message>
    </CustomBuild>
  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-generic-32.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll &gt; gen-bitcode-generic-32.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-32.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll &gt; gen-bitcode-generic-32.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-32.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-32.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-32.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-generic-64.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll &gt; gen-bitcode-generic-64.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-64.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll &gt; gen-bitcode-generic-64.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-64.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-64.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-64.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="lex.ll">
      <FileType>Document</FileType>
--- a/lex.ll
+++ b/lex.ll
@@ -43,6 +43,7 @@
 #include <stdint.h>

 static uint64_t lParseBinary(const char *ptr, SourcePos pos, char **endPtr);
+static int lParseInteger(bool dotdotdot);
 static void lCComment(SourcePos *);
 static void lCppComment(SourcePos *);
 static void lHandleCppHash(SourcePos *);
@@ -322,7 +323,8 @@ inline int ispcRand() {
 %option nounistd

 WHITESPACE [ \t\r]+
-INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[kMG]?
+INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*
+INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\.
 FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
 HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)

@@ -406,53 +408,14 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA
        return TOKEN_IDENTIFIER; 
 }

-{INT_NUMBER}+(u|U|l|L)*? { 
+{INT_NUMBER} { 
    RT;
-    int ls = 0, us = 0;
+    return lParseInteger(false);
+}

-    char *endPtr = NULL;
-    if (yytext[0] == '0' && yytext[1] == 'b')
-        yylval.intVal = lParseBinary(yytext+2, yylloc, &endPtr);
-    else {
-#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
-        yylval.intVal = _strtoui64(yytext, &endPtr, 0);
-#else
-        // FIXME: should use strtouq and then issue an error if we can't
-        // fit into 64 bits...
-        yylval.intVal = strtoull(yytext, &endPtr, 0);
-#endif
-    }
-
-    bool kilo = false, mega = false, giga = false;
-    for (; *endPtr; endPtr++) {
-        if (*endPtr == 'k')
-            kilo = true;
-        else if (*endPtr == 'M')
-            mega = true;
-        else if (*endPtr == 'G')
-            giga = true;        
-        else if (*endPtr == 'l' || *endPtr == 'L')
-            ls++;
-        else if (*endPtr == 'u' || *endPtr == 'U')
-            us++;
-    }
-    if (kilo)
-        yylval.intVal *= 1024;
-    if (mega)
-        yylval.intVal *= 1024*1024;
-    if (giga)
-        yylval.intVal *= 1024*1024*1024;
-
-    if (ls >= 2)
-        return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
-    else if (ls == 1)
-        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
-
-    // See if we can fit this into a 32-bit integer...
-    if ((yylval.intVal & 0xffffffff) == yylval.intVal)
-        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
-    else
-        return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
+{INT_NUMBER_DOTDOTDOT} {
+    RT;
+    return lParseInteger(true);
 }


@@ -562,6 +525,72 @@ lParseBinary(const char *ptr, SourcePos pos, char **endPtr) {
 }


+static int
+lParseInteger(bool dotdotdot) {
+    int ls = 0, us = 0;
+
+    char *endPtr = NULL;
+    if (yytext[0] == '0' && yytext[1] == 'b')
+        yylval.intVal = lParseBinary(yytext+2, yylloc, &endPtr);
+    else {
+#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
+        yylval.intVal = _strtoui64(yytext, &endPtr, 0);
+#else
+        // FIXME: should use strtouq and then issue an error if we can't
+        // fit into 64 bits...
+        yylval.intVal = strtoull(yytext, &endPtr, 0);
+#endif
+    }
+
+    bool kilo = false, mega = false, giga = false;
+    for (; *endPtr; endPtr++) {
+        if (*endPtr == 'k')
+            kilo = true;
+        else if (*endPtr == 'M')
+            mega = true;
+        else if (*endPtr == 'G')
+            giga = true;        
+        else if (*endPtr == 'l' || *endPtr == 'L')
+            ls++;
+        else if (*endPtr == 'u' || *endPtr == 'U')
+            us++;
+        else
+            Assert(dotdotdot && *endPtr == '.');
+    }
+    if (kilo)
+        yylval.intVal *= 1024;
+    if (mega)
+        yylval.intVal *= 1024*1024;
+    if (giga)
+        yylval.intVal *= 1024*1024*1024;
+
+    if (dotdotdot) {
+        if (ls >= 2)
+            return us ? TOKEN_UINT64DOTDOTDOT_CONSTANT : TOKEN_INT64DOTDOTDOT_CONSTANT;
+        else if (ls == 1)
+            return us ? TOKEN_UINT32DOTDOTDOT_CONSTANT : TOKEN_INT32DOTDOTDOT_CONSTANT;
+
+        // See if we can fit this into a 32-bit integer...
+        if ((yylval.intVal & 0xffffffff) == yylval.intVal)
+            return us ? TOKEN_UINT32DOTDOTDOT_CONSTANT : TOKEN_INT32DOTDOTDOT_CONSTANT;
+        else
+            return us ? TOKEN_UINT64DOTDOTDOT_CONSTANT : TOKEN_INT64DOTDOTDOT_CONSTANT;
+    }
+    else {
+        if (ls >= 2)
+            return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
+        else if (ls == 1)
+            return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
+
+        // See if we can fit this into a 32-bit integer...
+        if ((yylval.intVal & 0xffffffff) == yylval.intVal)
+            return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
+        else
+            return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
+    }
+}
+
+
 /** Handle a C-style comment in the source. 
 */
 static void
@@ -675,7 +704,7 @@ lEscapeChar(char *str, char *pChar, SourcePos *pos)
            str = tail - 1;
            break;
        default:
-            Error(*pos, "Bad character escape sequence: '%s'\n.", str);
+            Error(*pos, "Bad character escape sequence: '%s'.", str);
            break;
        }
    }
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -43,44 +43,44 @@
 #include <set>
 #include <map>

-LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
-LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::PointerIntType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;
+llvm::Type *LLVMTypes::VoidType = NULL;
+llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
+llvm::Type *LLVMTypes::PointerIntType = NULL;
+llvm::Type *LLVMTypes::BoolType = NULL;

-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16Type = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32Type = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64Type = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleType = NULL;
+llvm::Type *LLVMTypes::Int8Type = NULL;
+llvm::Type *LLVMTypes::Int16Type = NULL;
+llvm::Type *LLVMTypes::Int32Type = NULL;
+llvm::Type *LLVMTypes::Int64Type = NULL;
+llvm::Type *LLVMTypes::FloatType = NULL;
+llvm::Type *LLVMTypes::DoubleType = NULL;

-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8PointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16PointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32PointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64PointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoublePointerType = NULL;
+llvm::Type *LLVMTypes::Int8PointerType = NULL;
+llvm::Type *LLVMTypes::Int16PointerType = NULL;
+llvm::Type *LLVMTypes::Int32PointerType = NULL;
+llvm::Type *LLVMTypes::Int64PointerType = NULL;
+llvm::Type *LLVMTypes::FloatPointerType = NULL;
+llvm::Type *LLVMTypes::DoublePointerType = NULL;

-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::MaskType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
+llvm::VectorType *LLVMTypes::MaskType = NULL;
+llvm::VectorType *LLVMTypes::BoolVectorType = NULL;

-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int8VectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int16VectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
+llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
+llvm::VectorType *LLVMTypes::Int8VectorType = NULL;
+llvm::VectorType *LLVMTypes::Int16VectorType = NULL;
+llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
+llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
+llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
+llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;

-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8VectorPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16VectorPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
+llvm::Type *LLVMTypes::Int8VectorPointerType = NULL;
+llvm::Type *LLVMTypes::Int16VectorPointerType = NULL;
+llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
+llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
+llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
+llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;

-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::VoidPointerVectorType = NULL;
+llvm::VectorType *LLVMTypes::VoidPointerVectorType = NULL;

 llvm::Constant *LLVMTrue = NULL;
 llvm::Constant *LLVMFalse = NULL;
@@ -473,9 +473,9 @@ LLVMBoolVector(const bool *bvec) {


 llvm::Constant *
-LLVMIntAsType(int64_t val, LLVM_TYPE_CONST llvm::Type *type) {
-    LLVM_TYPE_CONST llvm::VectorType *vecType =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(type);
+LLVMIntAsType(int64_t val, llvm::Type *type) {
+    llvm::VectorType *vecType =
+        llvm::dyn_cast<llvm::VectorType>(type);

    if (vecType != NULL) {
        llvm::Constant *v = llvm::ConstantInt::get(vecType->getElementType(),
@@ -491,9 +491,9 @@ LLVMIntAsType(int64_t val, LLVM_TYPE_CONST llvm::Type *type) {


 llvm::Constant *
-LLVMUIntAsType(uint64_t val, LLVM_TYPE_CONST llvm::Type *type) {
-    LLVM_TYPE_CONST llvm::VectorType *vecType =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(type);
+LLVMUIntAsType(uint64_t val, llvm::Type *type) {
+    llvm::VectorType *vecType =
+        llvm::dyn_cast<llvm::VectorType>(type);

    if (vecType != NULL) {
        llvm::Constant *v = llvm::ConstantInt::get(vecType->getElementType(),
@@ -642,8 +642,8 @@ LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
 bool
 LLVMExtractVectorInts(llvm::Value *v, int64_t ret[], int *nElts) {
    // Make sure we do in fact have a vector of integer values here
-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(v->getType());
+    llvm::VectorType *vt =
+        llvm::dyn_cast<llvm::VectorType>(v->getType());
    Assert(vt != NULL);
    Assert(llvm::isa<llvm::IntegerType>(vt->getElementType()));

@@ -657,7 +657,7 @@ LLVMExtractVectorInts(llvm::Value *v, int64_t ret[], int *nElts) {

    // Deal with the fact that LLVM3.1 and previous versions have different
    // representations for vectors of constant ints...
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
    llvm::ConstantDataVector *cv = llvm::dyn_cast<llvm::ConstantDataVector>(v);
    if (cv == NULL)
        return false;
@@ -678,7 +678,7 @@ LLVMExtractVectorInts(llvm::Value *v, int64_t ret[], int *nElts) {
         ret[i] = ci->getSExtValue();
     }
     return true;
-#endif // LLVM_3_1svn
+#endif // !LLVM_3_0
 }


@@ -696,7 +696,7 @@ lVectorValuesAllEqual(llvm::Value *v, int vectorLength,
 static bool
 lIsExactMultiple(llvm::Value *val, int baseValue, int vectorLength,
                 std::vector<llvm::PHINode *> &seenPhis) {
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(val->getType()) == false) {
+    if (llvm::isa<llvm::VectorType>(val->getType()) == false) {
        // If we've worked down to a constant int, then the moment of truth
        // has arrived...
        llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(val);
@@ -780,7 +780,7 @@ static bool
 lAllDivBaseEqual(llvm::Value *val, int64_t baseValue, int vectorLength,
                 std::vector<llvm::PHINode *> &seenPhis,
                 bool &canAdd) {
-    Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(val->getType()));
+    Assert(llvm::isa<llvm::VectorType>(val->getType()));
    // Make sure the base value is a positive power of 2
    Assert(baseValue > 0 && (baseValue & (baseValue-1)) == 0);

@@ -790,7 +790,7 @@ lAllDivBaseEqual(llvm::Value *val, int64_t baseValue, int vectorLength,

    int64_t vecVals[ISPC_MAX_NVEC];
    int nElts;
-    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(val->getType()) &&
+    if (llvm::isa<llvm::VectorType>(val->getType()) &&
        LLVMExtractVectorInts(val, vecVals, &nElts)) {
        // If we have a vector of compile-time constant integer values,
        // then go ahead and check them directly..
@@ -880,7 +880,7 @@ lAllDivBaseEqual(llvm::Value *val, int64_t baseValue, int vectorLength,
        // the addConstants[], mod baseValue.  If we round that up to the
        // next power of 2, we'll have a value that will be no greater than
        // baseValue and sometimes less.
-        int maxMod = addConstants[0] % baseValue;
+        int maxMod = int(addConstants[0] % baseValue);
        for (int i = 1; i < vectorLength; ++i)
            maxMod = std::max(maxMod, int(addConstants[i] % baseValue));
        int requiredAlignment = lRoundUpPow2(maxMod);
@@ -947,7 +947,7 @@ lVectorValuesAllEqual(llvm::Value *v, int vectorLength,
    if (cv != NULL)
        return (cv->getSplatValue() != NULL);

-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
    llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v);
    if (cdv != NULL)
        return (cdv->getSplatValue() != NULL);
@@ -1074,8 +1074,8 @@ lVectorValuesAllEqual(llvm::Value *v, int vectorLength,
 */
 bool
 LLVMVectorValuesAllEqual(llvm::Value *v) {
-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(v->getType());
+    llvm::VectorType *vt =
+        llvm::dyn_cast<llvm::VectorType>(v->getType());
    Assert(vt != NULL);
    int vectorLength = vt->getNumElements();

@@ -1102,7 +1102,7 @@ lVectorIsLinear(llvm::Value *v, int vectorLength, int stride,
 */
 static bool
 lVectorIsLinearConstantInts(
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
                            llvm::ConstantDataVector *cv, 
 #else
                            llvm::ConstantVector *cv, 
@@ -1111,7 +1111,7 @@ lVectorIsLinearConstantInts(
                            int stride) {
    // Flatten the vector out into the elements array
    llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
    for (int i = 0; i < (int)cv->getNumElements(); ++i)
        elements.push_back(cv->getElementAsConstant(i));
 #else
@@ -1152,7 +1152,7 @@ lCheckMulForLinear(llvm::Value *op0, llvm::Value *op1, int vectorLength,
                   int stride, std::vector<llvm::PHINode *> &seenPhis) {
    // Is the first operand a constant integer value splatted across all of
    // the lanes?
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
    llvm::ConstantDataVector *cv = llvm::dyn_cast<llvm::ConstantDataVector>(op0);
 #else
    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(op0);
@@ -1226,7 +1226,7 @@ lVectorIsLinear(llvm::Value *v, int vectorLength, int stride,
                std::vector<llvm::PHINode *> &seenPhis) {
    // First try the easy case: if the values are all just constant
    // integers and have the expected stride between them, then we're done.
-#ifdef LLVM_3_1svn
+#ifndef LLVM_3_0
    llvm::ConstantDataVector *cv = llvm::dyn_cast<llvm::ConstantDataVector>(v);
 #else
    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v);
@@ -1344,8 +1344,8 @@ lVectorIsLinear(llvm::Value *v, int vectorLength, int stride,
 */
 bool
 LLVMVectorIsLinear(llvm::Value *v, int stride) {
-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(v->getType());
+    llvm::VectorType *vt =
+        llvm::dyn_cast<llvm::VectorType>(v->getType());
    Assert(vt != NULL);
    int vectorLength = vt->getNumElements();

@@ -1390,19 +1390,38 @@ LLVMDumpValue(llvm::Value *v) {


 static llvm::Value *
-lExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore,
+lExtractFirstVectorElement(llvm::Value *v, 
                           std::map<llvm::PHINode *, llvm::PHINode *> &phiMap) {
-    // If it's not an instruction (i.e. is a constant), then we can just
-    // emit an extractelement instruction and let the regular optimizer do
-    // the rest.
-    if (llvm::isa<llvm::Instruction>(v) == false)
-        return llvm::ExtractElementInst::Create(v, LLVMInt32(0), "first_elt",
-                                                insertBefore);
-
-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(v->getType());
+    llvm::VectorType *vt =
+        llvm::dyn_cast<llvm::VectorType>(v->getType());
    Assert(vt != NULL);

+    // First, handle various constant types; do the extraction manually, as
+    // appropriate.
+    if (llvm::isa<llvm::ConstantAggregateZero>(v) == true) {
+        Assert(vt->getElementType()->isIntegerTy());
+        return llvm::ConstantInt::get(vt->getElementType(), 0);
+    }
+    if (llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v)) {
+#ifndef LLVM_3_0
+        return cv->getOperand(0);
+#else
+        llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
+        cv->getVectorElements(elements);
+        return elements[0];
+#endif // !LLVM_3_0
+    }
+#ifndef LLVM_3_0
+    if (llvm::ConstantDataVector *cdv = 
+        llvm::dyn_cast<llvm::ConstantDataVector>(v))
+        return cdv->getElementAsConstant(0);
+#endif  // !LLVM_3_0
+
+    // Otherwise, all that we should have at this point is an instruction
+    // of some sort
+    Assert(llvm::isa<llvm::Constant>(v) == false);
+    Assert(llvm::isa<llvm::Instruction>(v) == true);
+
    std::string newName = v->getName().str() + std::string(".elt0");

    // Rewrite regular binary operators and casts to the scalarized
@@ -1410,20 +1429,24 @@ lExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore,
    llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v);
    if (bop != NULL) {
        llvm::Value *v0 = lExtractFirstVectorElement(bop->getOperand(0),
-                                                     insertBefore, phiMap);
+                                                     phiMap);
        llvm::Value *v1 = lExtractFirstVectorElement(bop->getOperand(1),
-                                                     insertBefore, phiMap);
+                                                     phiMap);
+        // Note that the new binary operator is inserted immediately before
+        // the previous vector one
        return llvm::BinaryOperator::Create(bop->getOpcode(), v0, v1,
-                                            newName, insertBefore);
+                                            newName, bop);
    }

    llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(v);
    if (cast != NULL) {
        llvm::Value *v = lExtractFirstVectorElement(cast->getOperand(0),
-                                                    insertBefore, phiMap);
+                                                    phiMap);
+        // Similarly, the equivalent scalar cast instruction goes right
+        // before the vector cast
        return llvm::CastInst::Create(cast->getOpcode(), v,
                                      vt->getElementType(), newName,
-                                      insertBefore);
+                                      cast);
    }

    llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v);
@@ -1438,18 +1461,17 @@ lExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore,
        // return the pointer and not get stuck in an infinite loop.
        //
        // The insertion point for the new phi node also has to be the
-        // start of the bblock of the original phi node, which isn't
-        // necessarily the same bblock as insertBefore is in!
+        // start of the bblock of the original phi node.
        llvm::Instruction *phiInsertPos = phi->getParent()->begin();
        llvm::PHINode *scalarPhi = 
            llvm::PHINode::Create(vt->getElementType(), 
-                                  phi->getNumIncomingValues(), newName,
-                                  phiInsertPos);
+                                  phi->getNumIncomingValues(), 
+                                  newName, phiInsertPos);
        phiMap[phi] = scalarPhi;

        for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i) {
            llvm::Value *v = lExtractFirstVectorElement(phi->getIncomingValue(i),
-                                                        insertBefore, phiMap);
+                                                        phiMap);
            scalarPhi->addIncoming(v, phi->getIncomingBlock(i));
        }

@@ -1466,15 +1488,22 @@ lExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore,
    }

    // Worst case, for everything else, just do a regular extract element
-    return llvm::ExtractElementInst::Create(v, LLVMInt32(0), "first_elt",
-                                            insertBefore);
+    // instruction, which we insert immediately after the instruction we
+    // have here.
+    llvm::Instruction *insertAfter = llvm::dyn_cast<llvm::Instruction>(v);
+    Assert(insertAfter != NULL);
+    llvm::Instruction *ee = 
+        llvm::ExtractElementInst::Create(v, LLVMInt32(0), "first_elt",
+                                         (llvm::Instruction *)NULL);
+    ee->insertAfter(insertAfter);
+    return ee;
 }


 llvm::Value *
-LLVMExtractFirstVectorElement(llvm::Value *v, llvm::Instruction *insertBefore) {
+LLVMExtractFirstVectorElement(llvm::Value *v) {  // no insertBefore parameter: the helper chooses each insertion point itself
    std::map<llvm::PHINode *, llvm::PHINode *> phiMap;
-    llvm::Value *ret = lExtractFirstVectorElement(v, insertBefore, phiMap);
+    llvm::Value *ret = lExtractFirstVectorElement(v, phiMap);  // phiMap starts out empty; the helper records vector-phi -> scalar-phi entries in it
    return ret;
 }

@@ -1489,8 +1518,8 @@ LLVMConcatVectors(llvm::Value *v1, llvm::Value *v2,
                  llvm::Instruction *insertBefore) {
    Assert(v1->getType() == v2->getType());

-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(v1->getType());
+    llvm::VectorType *vt =
+        llvm::dyn_cast<llvm::VectorType>(v1->getType());
    Assert(vt != NULL);

    int32_t identity[ISPC_MAX_NVEC];
@@ -1518,12 +1547,29 @@ LLVMShuffleVectors(llvm::Value *v1, llvm::Value *v2, int32_t shuf[],
            shufVec.push_back(LLVMInt32(shuf[i]));
    }

-#ifndef LLVM_2_9
    llvm::ArrayRef<llvm::Constant *> aref(&shufVec[0], &shufVec[shufSize]);
    llvm::Value *vec = llvm::ConstantVector::get(aref);
-#else // LLVM_2_9
-    llvm::Value *vec = llvm::ConstantVector::get(shufVec);
-#endif

    return new llvm::ShuffleVectorInst(v1, v2, vec, "shuffle", insertBefore);
 }
+
+
+/** Builds a name for a new value by appending the suffix s to the name
+    of v.
+
+    @param v  Value whose name is used as the prefix; may be NULL.
+    @param s  Suffix to append.
+    @return   s itself when v is NULL; otherwise a strdup()-allocated copy
+              of the concatenated name.  This function never frees that
+              copy.
+ */
+const char *
+LLVMGetName(llvm::Value *v, const char *s) {
+    if (v == NULL)
+        return s;
+    // Convert the StringRef explicitly, as the three-argument overload
+    // does, instead of relying on an implicit conversion to std::string.
+    std::string ret = v->getName().str();
+    ret += s;
+    return strdup(ret.c_str());
+}
+
+
+/** Builds a name of the form "<op>_<name of v1>_<name of v2>".
+
+    Both v1 and v2 are dereferenced unconditionally, so neither may be
+    NULL.  The result is a strdup()-allocated copy that this function
+    never frees.
+ */
+const char *
+LLVMGetName(const char *op, llvm::Value *v1, llvm::Value *v2) {
+    const std::string sep("_");
+    const std::string joined = std::string(op) + sep + v1->getName().str() +
+        sep + v2->getName().str();
+    return strdup(joined.c_str());
+}
+
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -48,57 +48,50 @@ namespace llvm {
    class InsertElementInst;
 }

-// llvm::Type *s are no longer const in llvm 3.0
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-#define LLVM_TYPE_CONST
-#else
-#define LLVM_TYPE_CONST const
-#endif
-

 /** This structure holds pointers to a variety of LLVM types; code
    elsewhere can use them from here, rather than needing to make more
    verbose LLVM API calls.
 */ 
 struct LLVMTypes {
-    static LLVM_TYPE_CONST llvm::Type *VoidType;
-    static LLVM_TYPE_CONST llvm::PointerType *VoidPointerType;
-    static LLVM_TYPE_CONST llvm::Type *PointerIntType;
-    static LLVM_TYPE_CONST llvm::Type *BoolType;
+    static llvm::Type *VoidType;  // members are plain pointers: llvm::Type pointers are no longer const as of LLVM 3.0
+    static llvm::PointerType *VoidPointerType;
+    static llvm::Type *PointerIntType;
+    static llvm::Type *BoolType;

-    static LLVM_TYPE_CONST llvm::Type *Int8Type;
-    static LLVM_TYPE_CONST llvm::Type *Int16Type;
-    static LLVM_TYPE_CONST llvm::Type *Int32Type;
-    static LLVM_TYPE_CONST llvm::Type *Int64Type;
-    static LLVM_TYPE_CONST llvm::Type *FloatType;
-    static LLVM_TYPE_CONST llvm::Type *DoubleType;
+    static llvm::Type *Int8Type;
+    static llvm::Type *Int16Type;
+    static llvm::Type *Int32Type;
+    static llvm::Type *Int64Type;
+    static llvm::Type *FloatType;
+    static llvm::Type *DoubleType;

-    static LLVM_TYPE_CONST llvm::Type *Int8PointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int16PointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int32PointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int64PointerType;
-    static LLVM_TYPE_CONST llvm::Type *FloatPointerType;
-    static LLVM_TYPE_CONST llvm::Type *DoublePointerType;
+    static llvm::Type *Int8PointerType;
+    static llvm::Type *Int16PointerType;
+    static llvm::Type *Int32PointerType;
+    static llvm::Type *Int64PointerType;
+    static llvm::Type *FloatPointerType;
+    static llvm::Type *DoublePointerType;

-    static LLVM_TYPE_CONST llvm::VectorType *MaskType;
+    static llvm::VectorType *MaskType;  // NOTE(review): presumably the type of the execution mask -- confirm

-    static LLVM_TYPE_CONST llvm::VectorType *BoolVectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *Int1VectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *Int8VectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *Int16VectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *Int32VectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *Int64VectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *FloatVectorType;
-    static LLVM_TYPE_CONST llvm::VectorType *DoubleVectorType;
+    static llvm::VectorType *BoolVectorType;
+    static llvm::VectorType *Int1VectorType;
+    static llvm::VectorType *Int8VectorType;
+    static llvm::VectorType *Int16VectorType;
+    static llvm::VectorType *Int32VectorType;
+    static llvm::VectorType *Int64VectorType;
+    static llvm::VectorType *FloatVectorType;
+    static llvm::VectorType *DoubleVectorType;

-    static LLVM_TYPE_CONST llvm::Type *Int8VectorPointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int16VectorPointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int32VectorPointerType;
-    static LLVM_TYPE_CONST llvm::Type *Int64VectorPointerType;
-    static LLVM_TYPE_CONST llvm::Type *FloatVectorPointerType;
-    static LLVM_TYPE_CONST llvm::Type *DoubleVectorPointerType;
+    static llvm::Type *Int8VectorPointerType;
+    static llvm::Type *Int16VectorPointerType;
+    static llvm::Type *Int32VectorPointerType;
+    static llvm::Type *Int64VectorPointerType;
+    static llvm::Type *FloatVectorPointerType;
+    static llvm::Type *DoubleVectorPointerType;

-    static LLVM_TYPE_CONST llvm::VectorType *VoidPointerVectorType;
+    static llvm::VectorType *VoidPointerVectorType;
 };

 /** These variables hold the corresponding LLVM constant values as a
@@ -175,11 +168,11 @@ extern llvm::Constant *LLVMDoubleVector(double f);

 /** Returns a constant integer or vector (according to the given type) of
    the given signed integer value. */
-extern llvm::Constant *LLVMIntAsType(int64_t, LLVM_TYPE_CONST llvm::Type *t);
+extern llvm::Constant *LLVMIntAsType(int64_t, llvm::Type *t);

 /** Returns a constant integer or vector (according to the given type) of
    the given unsigned integer value. */
-extern llvm::Constant *LLVMUIntAsType(uint64_t, LLVM_TYPE_CONST llvm::Type *t);
+extern llvm::Constant *LLVMUIntAsType(uint64_t, llvm::Type *t);

 /** Returns an LLVM boolean vector based on the given array of values.
    The array should have g->target.vectorWidth elements. */
@@ -281,8 +274,7 @@ extern void LLVMDumpValue(llvm::Value *v);
    worth of values just to extract the first element, in cases where only
    the first element's value is needed.
  */
-extern llvm::Value *LLVMExtractFirstVectorElement(llvm::Value *v, 
-                                              llvm::Instruction *insertBefore);
+extern llvm::Value *LLVMExtractFirstVectorElement(llvm::Value *v);

 /** This function takes two vectors, expected to be the same length, and
    returns a new vector of twice the length that represents concatenating
@@ -298,4 +290,10 @@ extern llvm::Value *LLVMShuffleVectors(llvm::Value *v1, llvm::Value *v2,
                                       int32_t shuf[], int shufSize,
                                       llvm::Instruction *insertBefore);

+/** Utility routines to concat strings with the names of existing values to
+    create meaningful new names for instruction values.
+*/
+extern const char *LLVMGetName(llvm::Value *v, const char *);
+extern const char *LLVMGetName(const char *op, llvm::Value *v1, llvm::Value *v2);
+
 #endif // ISPC_LLVMUTIL_H
--- a/main.cpp
+++ b/main.cpp
@@ -44,16 +44,9 @@
 #ifdef ISPC_IS_WINDOWS
  #include <time.h>
 #endif // ISPC_IS_WINDOWS
-#include <llvm/Support/PrettyStackTrace.h>
 #include <llvm/Support/Signals.h>
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-  #include <llvm/Support/TargetRegistry.h>
-  #include <llvm/Support/TargetSelect.h>
-#else
-  #include <llvm/Target/TargetRegistry.h>
-  #include <llvm/Target/TargetSelect.h>
-  #include <llvm/Target/SubtargetFeature.h>
-#endif
+#include <llvm/Support/TargetRegistry.h>
+#include <llvm/Support/TargetSelect.h>

 #ifdef ISPC_IS_WINDOWS
 #define strcasecmp stricmp
@@ -67,12 +60,12 @@ static void
 lPrintVersion() {
    printf("Intel(r) SPMD Program Compiler (ispc), %s (build %s @ %s, LLVM %s)\n", 
           ISPC_VERSION, BUILD_VERSION, BUILD_DATE, 
-#ifdef LLVM_2_9
-           "2.9"
-#elif defined(LLVM_3_0) || defined(LLVM_3_0svn)
+#if defined(LLVM_3_0)
           "3.0"
-#elif defined(LLVM_3_1) || defined(LLVM_3_1svn)
+#elif defined(LLVM_3_1)
           "3.1"
+#elif defined(LLVM_3_2)
+           "3.2"
 #else
 #error "Unhandled LLVM version"
 #endif 
@@ -91,12 +84,10 @@ usage(int ret) {
           Target::SupportedTargetArchs());
    printf("    [--c++-include-file=<name>]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n");
    printf("    [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
-    printf("         <cpu>={%s}\n", Target::SupportedTargetCPUs());
+    printf("         <cpu>={%s}\n", Target::SupportedTargetCPUs().c_str());
    printf("    [-D<foo>]\t\t\t\t#define given value when running preprocessor\n");
    printf("    [--emit-asm]\t\t\tGenerate assembly language file as output\n");
-#ifndef LLVM_2_9
    printf("    [--emit-c++]\t\t\tEmit a C++ source file as output\n");
-#endif // !LLVM_2_9
    printf("    [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
    printf("    [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
    printf("    [-g]\t\t\t\tGenerate debugging information\n");
@@ -202,17 +193,18 @@ static void lGetAllArgs(int Argc, char *Argv[], int &argc, char *argv[128]) {
 }


+static void
+lSignal(void *) {  // callback handed to llvm::sys::AddSignalHandler() in main(); its argument is ignored
+    FATAL("Unhandled signal sent to process; terminating.");  // NOTE(review): presumably FATAL() does not return -- confirm
+}
+
+
 int main(int Argc, char *Argv[]) {
    int argc;
    char *argv[128];
    lGetAllArgs(Argc, Argv, argc, argv);

-#if 0
-    // Use LLVM's little utility function to print out nice stack traces if
-    // we crash
-    llvm::sys::PrintStackTraceOnErrorSignal();
-    llvm::PrettyStackTraceProgram X(argc, argv);
-#endif
+    llvm::sys::AddSignalHandler(lSignal, NULL);

    // initialize available LLVM targets
    LLVMInitializeX86TargetInfo();
@@ -220,9 +212,7 @@ int main(int Argc, char *Argv[]) {
    LLVMInitializeX86AsmPrinter();
    LLVMInitializeX86AsmParser();
    LLVMInitializeX86Disassembler();
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
    LLVMInitializeX86TargetMC();
-#endif

    char *file = NULL;
    const char *headerFileName = NULL;
@@ -279,10 +269,8 @@ int main(int Argc, char *Argv[]) {
        }
        else if (!strcmp(argv[i], "--emit-asm"))
            ot = Module::Asm;
-#ifndef LLVM_2_9
        else if (!strcmp(argv[i], "--emit-c++"))
            ot = Module::CXX;
-#endif // !LLVM_2_9
        else if (!strcmp(argv[i], "--emit-llvm"))
            ot = Module::Bitcode;
        else if (!strcmp(argv[i], "--emit-obj"))
--- a/module.cpp
+++ b/module.cpp
--- a/module.h
+++ b/module.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -59,30 +59,33 @@ public:
    int CompileFile();

    /** Add a named type definition to the module. */
-    void AddTypeDef(Symbol *sym);
+    void AddTypeDef(const std::string &name, const Type *type,
+                    SourcePos pos);

    /** Add a new global variable corresponding to the given Symbol to the
        module.  If non-NULL, initExpr gives the initializer expression
        for the global's initial value. */ 
-    void AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst);
+    void AddGlobalVariable(const std::string &name, const Type *type,
+                           Expr *initExpr, bool isConst,
+                           StorageClass storageClass, SourcePos pos);

    /** Add a declaration of the function defined by the given function
        symbol to the module. */
-    void AddFunctionDeclaration(Symbol *funSym, bool isInline);
+    void AddFunctionDeclaration(const std::string &name,
+                                const FunctionType *ftype, 
+                                StorageClass sc, bool isInline, SourcePos pos);

    /** Adds the function described by the declaration information and the
        provided statements to the module. */
-    void AddFunctionDefinition(Symbol *sym, const std::vector<Symbol *> &args,
-                               Stmt *code);
+    void AddFunctionDefinition(const std::string &name,
+                               const FunctionType *ftype, Stmt *code);

    /** After a source file has been compiled, output can be generated in a
        number of different formats. */
    enum OutputType { Asm,      /** Generate text assembly language output */
                      Bitcode,  /** Generate LLVM IR bitcode output */
                      Object,   /** Generate a native object file */
-#ifndef LLVM_2_9
                      CXX,      /** Generate a C++ file */
-#endif // !LLVM_2_9
                      Header    /** Generate a C/C++ header file with 
                                    declarations of 'export'ed functions, global
                                    variables, and the types used by them. */
--- a/opt.cpp
+++ b/opt.cpp
--- a/parse.yy
+++ b/parse.yy
@@ -173,8 +173,11 @@ struct ForeachDimension {
 }


-%token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT TOKEN_INT64_CONSTANT
-%token TOKEN_UINT64_CONSTANT TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL
+%token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT 
+%token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT 
+%token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT 
+%token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT
+%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL
 %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL
 %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP 
 %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP
@@ -196,7 +199,7 @@ struct ForeachDimension {
 %token TOKEN_CIF TOKEN_CDO TOKEN_CFOR TOKEN_CWHILE TOKEN_CBREAK
 %token TOKEN_CCONTINUE TOKEN_CRETURN TOKEN_SYNC TOKEN_PRINT TOKEN_ASSERT

-%type <expr> primary_expression postfix_expression
+%type <expr> primary_expression postfix_expression integer_dotdotdot
 %type <expr> unary_expression cast_expression funcall_expression launch_expression
 %type <expr> multiplicative_expression additive_expression shift_expression
 %type <expr> relational_expression equality_expression and_expression
@@ -250,6 +253,12 @@ struct ForeachDimension {

 string_constant
    : TOKEN_STRING_LITERAL { $$ = new std::string(*yylval.stringVal); }
+    | string_constant TOKEN_STRING_LITERAL
+    {
+        // Adjacent string literals are concatenated.  Append to the
+        // heap-allocated string already carried by $1 and hand that same
+        // object on as $$, rather than copying it into a fresh allocation
+        // and leaking the old one.
+        std::string *accum = (std::string *)$1;
+        *accum += *yylval.stringVal;
+        $$ = accum;
+    }
    ;

 primary_expression
@@ -382,7 +391,7 @@ argument_expression_list
      {
          ExprList *argList = dynamic_cast<ExprList *>($1);
          if (argList == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
              argList = new ExprList(@3);
          }
          argList->exprs.push_back($3);
@@ -540,8 +549,8 @@ rate_qualified_type_specifier
        if ($2 == NULL)
            $$ = NULL;
        else {
-            int soaWidth = $1;
-            const StructType *st = dynamic_cast<const StructType *>($2);
+            int soaWidth = (int)$1;
+            const StructType *st = CastType<StructType>($2);
            if (st == NULL) {
                Error(@1, "\"soa\" qualifier is illegal with non-struct type \"%s\".",
                      $2->GetString().c_str());
@@ -614,15 +623,17 @@ declaration_statement
    : declaration     
    {
        if ($1 == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(@1, m->errorCount > 0);
            $$ = NULL;
        }
        else if ($1->declSpecs->storageClass == SC_TYPEDEF) {
            for (unsigned int i = 0; i < $1->declarators.size(); ++i) {
                if ($1->declarators[i] == NULL)
-                    Assert(m->errorCount > 0);
+                    AssertPos(@1, m->errorCount > 0);
                else
-                    m->AddTypeDef($1->declarators[i]->GetSymbol());
+                    m->AddTypeDef($1->declarators[i]->name,
+                                  $1->declarators[i]->type,
+                                  $1->declarators[i]->pos);
            }
            $$ = NULL;
        }
@@ -778,7 +789,7 @@ init_declarator_list
      {
          std::vector<Declarator *> *dl = (std::vector<Declarator *> *)$1;
          if (dl == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
              dl = new std::vector<Declarator *>;
          }
          if ($3 != NULL)
@@ -801,7 +812,6 @@ storage_class_specifier
    : TOKEN_TYPEDEF { $$ = SC_TYPEDEF; }
    | TOKEN_EXTERN { $$ = SC_EXTERN; }
    | TOKEN_EXTERN TOKEN_STRING_C_LITERAL  { $$ = SC_EXTERN_C; }
-    | TOKEN_EXPORT { $$ = SC_EXPORT; }
    | TOKEN_STATIC { $$ = SC_STATIC; }
    ;

@@ -843,9 +853,9 @@ struct_or_union_specifier
    : struct_or_union struct_or_union_name '{' struct_declaration_list '}' 
      {
          if ($4 != NULL) {
-              std::vector<const Type *> elementTypes;
-              std::vector<std::string> elementNames;
-              std::vector<SourcePos> elementPositions;
+              llvm::SmallVector<const Type *, 8> elementTypes;
+              llvm::SmallVector<std::string, 8> elementNames;
+              llvm::SmallVector<SourcePos, 8> elementPositions;
              GetStructTypesNamesPositions(*$4, &elementTypes, &elementNames,
                                           &elementPositions);
              StructType *st = new StructType($2, elementTypes, elementNames,
@@ -859,12 +869,11 @@ struct_or_union_specifier
    | struct_or_union '{' struct_declaration_list '}' 
      {
          if ($3 != NULL) {
-              std::vector<const Type *> elementTypes;
-              std::vector<std::string> elementNames;
-              std::vector<SourcePos> elementPositions;
+              llvm::SmallVector<const Type *, 8> elementTypes;
+              llvm::SmallVector<std::string, 8> elementNames;
+              llvm::SmallVector<SourcePos, 8> elementPositions;
              GetStructTypesNamesPositions(*$3, &elementTypes, &elementNames,
                                           &elementPositions);
-              // FIXME: should be unbound
              $$ = new StructType("", elementTypes, elementNames, elementPositions,
                                  false, Variability::Unbound, @1);
          }
@@ -882,12 +891,11 @@ struct_or_union_specifier
    | struct_or_union struct_or_union_name
      { 
          const Type *st = m->symbolTable->LookupType($2); 
-          if (!st) {
-              std::vector<std::string> alternates = m->symbolTable->ClosestTypeMatch($2);
-              std::string alts = lGetAlternates(alternates);
-              Error(@2, "Struct type \"%s\" unknown.%s", $2, alts.c_str());
+          if (st == NULL) {
+              st = new UndefinedStructType($2, Variability::Unbound, false, @2);
+              m->symbolTable->AddType($2, st, @2);
          }
-          else if (dynamic_cast<const StructType *>(st) == NULL)
+          else if (CastType<StructType>(st) == NULL)
              Error(@2, "Type \"%s\" is not a struct type! (%s)", $2,
                    st->GetString().c_str());
          $$ = st;
@@ -910,7 +918,7 @@ struct_declaration_list
      {
          std::vector<StructDeclaration *> *sdl = (std::vector<StructDeclaration *> *)$1;
          if (sdl == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
              sdl = new std::vector<StructDeclaration *>;
          }
          if ($2 != NULL)
@@ -976,6 +984,11 @@ specifier_qualifier_list
                      "function declarations.");
                $$ = $2;
            }
+            else if ($1 == TYPEQUAL_EXPORT) {
+                Error(@1, "\"export\" qualifier is illegal outside of "
+                      "function declarations.");
+                $$ = $2;
+            }
            else
                FATAL("Unhandled type qualifier in parser.");
        }
@@ -1000,7 +1013,7 @@ struct_declarator_list
      {
          std::vector<Declarator *> *sdl = (std::vector<Declarator *> *)$1;
          if (sdl == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
              sdl = new std::vector<Declarator *>;
          }
          if ($3 != NULL)
@@ -1047,7 +1060,7 @@ enum_specifier
              $$ = NULL;
          }
          else {
-              const EnumType *enumType = dynamic_cast<const EnumType *>(type);
+              const EnumType *enumType = CastType<EnumType>(type);
              if (enumType == NULL) {
                  Error(@2, "Type \"%s\" is not an enum type (%s).", $2,
                        type->GetString().c_str());
@@ -1074,7 +1087,7 @@ enumerator_list
      {
          std::vector<Symbol *> *symList = $1;
          if (symList == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
              symList = new std::vector<Symbol *>;
          }
          if ($3 != NULL)
@@ -1108,6 +1121,7 @@ type_qualifier
    | TOKEN_UNIFORM    { $$ = TYPEQUAL_UNIFORM; }
    | TOKEN_VARYING    { $$ = TYPEQUAL_VARYING; }
    | TOKEN_TASK       { $$ = TYPEQUAL_TASK; }
+    | TOKEN_EXPORT     { $$ = TYPEQUAL_EXPORT; }
    | TOKEN_INLINE     { $$ = TYPEQUAL_INLINE; }
    | TOKEN_SIGNED     { $$ = TYPEQUAL_SIGNED; }
    | TOKEN_UNSIGNED   { $$ = TYPEQUAL_UNSIGNED; }
@@ -1160,7 +1174,7 @@ direct_declarator
    : TOKEN_IDENTIFIER
      {
          Declarator *d = new Declarator(DK_BASE, @1);
-          d->sym = new Symbol(yytext, @1);
+          d->name = yytext;
          $$ = d;
      }
    | '(' declarator ')' 
@@ -1335,8 +1349,10 @@ type_name
    {
        if ($1 == NULL || $2 == NULL)
            $$ = NULL;
-        else
-            $$ = $2->GetType($1, NULL);
+        else {
+            $2->InitFromType($1, NULL);
+            $$ = $2->type;
+        }
    }
    ;

@@ -1471,7 +1487,7 @@ initializer_list
      {
          ExprList *exprList = $1;
          if (exprList == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
              exprList = new ExprList(@3);
          }
          exprList->exprs.push_back($3);
@@ -1542,7 +1558,7 @@ statement_list
      {
          StmtList *sl = (StmtList *)$1;
          if (sl == NULL) {
-              Assert(m->errorCount > 0);
+              AssertPos(@1, m->errorCount > 0);
              sl = new StmtList(@2);
          }
          sl->Add($2);
@@ -1614,11 +1630,34 @@ foreach_active_identifier
    }
    ;

+integer_dotdotdot
+    : TOKEN_INT32DOTDOTDOT_CONSTANT {
+        $$ = new ConstExpr(AtomicType::UniformInt32->GetAsConstType(),
+                           (int32_t)yylval.intVal, @1); 
+    }
+    | TOKEN_UINT32DOTDOTDOT_CONSTANT {
+        $$ = new ConstExpr(AtomicType::UniformUInt32->GetAsConstType(),
+                           (uint32_t)yylval.intVal, @1); 
+    }
+    | TOKEN_INT64DOTDOTDOT_CONSTANT {
+        $$ = new ConstExpr(AtomicType::UniformInt64->GetAsConstType(),
+                           (int64_t)yylval.intVal, @1); 
+    }
+    | TOKEN_UINT64DOTDOTDOT_CONSTANT {
+        $$ = new ConstExpr(AtomicType::UniformUInt64->GetAsConstType(),
+                           (uint64_t)yylval.intVal, @1); 
+    }
+    ;
+
 foreach_dimension_specifier
    : foreach_identifier '=' assignment_expression TOKEN_DOTDOTDOT assignment_expression
    {
        $$ = new ForeachDimension($1, $3, $5);
    }
+    | foreach_identifier '=' integer_dotdotdot assignment_expression
+    {
+        $$ = new ForeachDimension($1, $3, $4);
+    }
    ;

 foreach_dimension_list
@@ -1631,7 +1670,7 @@ foreach_dimension_list
    {
        std::vector<ForeachDimension *> *dv = $1;
        if (dv == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(@1, m->errorCount > 0);
            dv = new std::vector<ForeachDimension *>;
        }
        if ($3 != NULL)
@@ -1669,7 +1708,7 @@ iteration_statement
     {
         std::vector<ForeachDimension *> *dims = $3;
         if (dims == NULL) {
-             Assert(m->errorCount > 0);
+             AssertPos(@3, m->errorCount > 0);
             dims = new std::vector<ForeachDimension *>;
         }
         for (unsigned int i = 0; i < dims->size(); ++i)
@@ -1679,7 +1718,7 @@ iteration_statement
     {
         std::vector<ForeachDimension *> *dims = $3;
         if (dims == NULL) {
-             Assert(m->errorCount > 0);
+             AssertPos(@3, m->errorCount > 0);
             dims = new std::vector<ForeachDimension *>;
         }

@@ -1697,7 +1736,7 @@ iteration_statement
     {
         std::vector<ForeachDimension *> *dims = $3;
         if (dims == NULL) {
-             Assert(m->errorCount > 0);
+             AssertPos(@3, m->errorCount > 0);
             dims = new std::vector<ForeachDimension *>;
         }

@@ -1708,7 +1747,7 @@ iteration_statement
     {
         std::vector<ForeachDimension *> *dims = $3;
         if (dims == NULL) {
-             Assert(m->errorCount > 0);
+             AssertPos(@3, m->errorCount > 0); // @3: position of the dimension list ($3)
             dims = new std::vector<ForeachDimension *>;
         }

@@ -1804,6 +1843,7 @@ external_declaration
            for (unsigned int i = 0; i < $1->declarators.size(); ++i)
                lAddDeclaration($1->declSpecs, $1->declarators[i]);
    }
+    | ';'
    ;

 function_definition
@@ -1817,11 +1857,18 @@ function_definition
    } 
    compound_statement
    {
-        std::vector<Symbol *> args;
        if ($2 != NULL) {
-            Symbol *sym = $2->GetFunctionInfo($1, &args);
-            if (sym != NULL)
-                m->AddFunctionDefinition(sym, args, $4);
+            $2->InitFromDeclSpecs($1);
+            const FunctionType *funcType = CastType<FunctionType>($2->type);
+            if (funcType == NULL)
+                AssertPos(@1, m->errorCount > 0);
+            else if ($1->storageClass == SC_TYPEDEF)
+                Error(@1, "Illegal \"typedef\" provided with function definition.");
+            else {
+                Stmt *code = $4;
+                if (code == NULL) code = new StmtList(@4);
+                m->AddFunctionDefinition($2->name, funcType, code);
+            }
        }
        m->symbolTable->PopScope(); // push in lAddFunctionParams();
    }
@@ -1931,35 +1978,27 @@ lAddDeclaration(DeclSpecs *ds, Declarator *decl) {
        // Error happened earlier during parsing
        return;

+    decl->InitFromDeclSpecs(ds);
    if (ds->storageClass == SC_TYPEDEF)
-        m->AddTypeDef(decl->GetSymbol());
+        m->AddTypeDef(decl->name, decl->type, decl->pos);
    else {
-        const Type *t = decl->GetType(ds);
-        if (t == NULL) {
+        if (decl->type == NULL) {
            Assert(m->errorCount > 0);
            return;
        }

-        Symbol *sym = decl->GetSymbol();
-        if (sym == NULL) {
-            Assert(m->errorCount > 0);
-            return;
-        }
-
-        const FunctionType *ft = dynamic_cast<const FunctionType *>(t);
+        decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
+        
+        const FunctionType *ft = CastType<FunctionType>(decl->type);
        if (ft != NULL) {
-            sym->type = ft;
-            sym->storageClass = ds->storageClass;
            bool isInline = (ds->typeQualifiers & TYPEQUAL_INLINE);
-            m->AddFunctionDeclaration(sym, isInline);
+            m->AddFunctionDeclaration(decl->name, ft, ds->storageClass,
+                                      isInline, decl->pos);
        }
        else {
-            if (sym->type == NULL)
-                Assert(m->errorCount > 0);
-            else
-                sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
            bool isConst = (ds->typeQualifiers & TYPEQUAL_CONST) != 0;
-            m->AddGlobalVariable(sym, decl->initExpr, isConst);
+            m->AddGlobalVariable(decl->name, decl->type, decl->initExpr,
+                                 isConst, decl->storageClass, decl->pos);
        }
    }
 }
@@ -1973,7 +2012,7 @@ lAddFunctionParams(Declarator *decl) {
    m->symbolTable->PushScope();

    if (decl == NULL) {
-        Assert(m->errorCount > 0);
+        Assert(m->errorCount > 0); // decl is NULL here, so decl->pos must not be read
        return;
    }

@@ -1981,27 +2020,24 @@ lAddFunctionParams(Declarator *decl) {
    while (decl->kind != DK_FUNCTION && decl->child != NULL)
        decl = decl->child;
    if (decl->kind != DK_FUNCTION) {
-        Assert(m->errorCount > 0);
+        AssertPos(decl->pos, m->errorCount > 0);
        return;
    }

    // now loop over its parameters and add them to the symbol table
    for (unsigned int i = 0; i < decl->functionParams.size(); ++i) {
        Declaration *pdecl = decl->functionParams[i];
-        if (pdecl == NULL || pdecl->declarators.size() == 0)
-            // zero size declarators array corresponds to an anonymous 
-            // parameter
-            continue;
-        Assert(pdecl->declarators.size() == 1);
-        Symbol *sym = pdecl->declarators[0]->GetSymbol();
-        if (sym == NULL || sym->type == NULL)
-            Assert(m->errorCount > 0);
+        Assert(pdecl != NULL && pdecl->declarators.size() == 1);
+        Declarator *declarator = pdecl->declarators[0];
+        if (declarator == NULL)
+            AssertPos(decl->pos, m->errorCount > 0);
        else {
-            sym->type = sym->type->ResolveUnboundVariability(Variability::Varying);
+            Symbol *sym = new Symbol(declarator->name, declarator->pos,
+                                     declarator->type, declarator->storageClass);
 #ifndef NDEBUG
            bool ok = m->symbolTable->AddVariable(sym);
            if (ok == false)
-                Assert(m->errorCount > 0);
+                AssertPos(decl->pos, m->errorCount > 0);
 #else
            m->symbolTable->AddVariable(sym);
 #endif
@@ -2064,8 +2100,6 @@ lGetStorageClassString(StorageClass sc) {
        return "";
    case SC_EXTERN:
        return "extern";
-    case SC_EXPORT:
-        return "export";
    case SC_STATIC:
        return "static";
    case SC_TYPEDEF:
@@ -2157,7 +2191,7 @@ lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
        if (enums[i]->constValue != NULL) {
            /* Already has a value, so first update nextVal with it. */
            int count = enums[i]->constValue->AsUInt32(&nextVal);
-            Assert(count == 1);
+            AssertPos(enums[i]->pos, count == 1);
            ++nextVal;

            /* When the source file as being parsed, the ConstExpr for any
@@ -2170,7 +2204,7 @@ lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
                                              enums[i]->pos);
            castExpr = Optimize(castExpr);
            enums[i]->constValue = dynamic_cast<ConstExpr *>(castExpr);
-            Assert(enums[i]->constValue != NULL);
+            AssertPos(enums[i]->pos, enums[i]->constValue != NULL);
        }
        else {
            enums[i]->constValue = new ConstExpr(enumType, nextVal++, 
--- a/run_tests.py
+++ b/run_tests.py
@@ -17,6 +17,10 @@ import shlex
 import platform
 import tempfile

+# disable fancy error/warning printing with ANSI colors, so grepping for error
+# messages doesn't get confused
+os.environ["TERM"] = "dumb"
+
 # This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard
 # git history has a workaround for that issue.

@@ -28,8 +32,10 @@ parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order
                  default=False, action="store_true")
 parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics",
                  default=None)
+parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)",
+                  default="")
 parser.add_option('-t', '--target', dest='target',
-                  help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16)',
+                  help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16, generic-32, generic-64)',
                  default="sse4")
 parser.add_option('-a', '--arch', dest='arch',
                  help='Set architecture (x86, x86-64)',
@@ -53,6 +59,10 @@ if not is_windows:
 else:
    ispc_exe = "../Release/ispc.exe"

+ispc_exe += " " + options.ispc_flags
+
+print ispc_exe
+
 is_generic_target = (options.target.find("generic-") != -1 and
                     options.target != "generic-1")
 if is_generic_target and options.include_file == None:
@@ -65,6 +75,12 @@ if is_generic_target and options.include_file == None:
    elif options.target == "generic-16":
        sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n")
        options.include_file = "examples/intrinsics/generic-16.h"
+    elif options.target == "generic-32":
+        sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-32.h\n")
+        options.include_file = "examples/intrinsics/generic-32.h"
+    elif options.target == "generic-64":
+        sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-64.h\n")
+        options.include_file = "examples/intrinsics/generic-64.h"

 if options.compiler_exe == None:
    if is_windows:
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -1,6 +1,6 @@
 // -*- mode: c++ -*-
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -355,7 +355,8 @@ static inline uniform bool all(bool v) {
 #else
    int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask);
 #endif
-    return __movmsk(match) == (1 << programCount) - 1;
+    return __movmsk(match) == ((programCount == 64) ? ~0ull : 
+                               ((1ull << programCount) - 1));
 }

 __declspec(safe) 
@@ -388,14 +389,14 @@ __declspec(safe)
 static inline uniform int popcnt(bool v) {
    // As with any() and all(), only count across the active lanes
 #ifdef ISPC_TARGET_GENERIC
-    return __popcnt_int32(__movmsk(v & __mask));
+    return __popcnt_int64(__movmsk(v & __mask));
 #else
-    return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask));
+    return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask));
 #endif
 }

 __declspec(safe) 
-static inline uniform int lanemask() {
+static inline uniform unsigned int64 lanemask() {
    return __movmsk(__mask);
 }

@@ -746,6 +747,125 @@ static inline void prefetch_nt(const void * varying ptr) {
    }
 }

+///////////////////////////////////////////////////////////////////////////
+// non-short-circuiting alternatives
+
+__declspec(safe,cost1)
+static inline bool and(bool a, bool b) {
+    return a && b;
+}
+
+__declspec(safe,cost1)
+static inline uniform bool and(uniform bool a, uniform bool b) {
+    return a && b;
+}
+
+__declspec(safe,cost1)
+static inline bool or(bool a, bool b) {
+    return a || b;
+}
+
+__declspec(safe,cost1)
+static inline uniform bool or(uniform bool a, uniform bool b) {
+    return a || b;
+}
+
+__declspec(safe,cost1)
+static inline int8 select(bool c, int8 a, int8 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int8 select(uniform bool c, int8 a, int8 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform int8 select(uniform bool c, uniform int8 a,
+                                  uniform int8 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int16 select(bool c, int16 a, int16 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int16 select(uniform bool c, int16 a, int16 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform int16 select(uniform bool c, uniform int16 a,
+                                   uniform int16 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int32 select(bool c, int32 a, int32 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int32 select(uniform bool c, int32 a, int32 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform int32 select(uniform bool c, uniform int32 a,
+                                   uniform int32 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int64 select(bool c, int64 a, int64 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline int64 select(uniform bool c, int64 a, int64 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform int64 select(uniform bool c, uniform int64 a,
+                                   uniform int64 b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline float select(bool c, float a, float b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline float select(uniform bool c, float a, float b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform float select(uniform bool c, uniform float a,
+                                   uniform float b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline double select(bool c, double a, double b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline double select(uniform bool c, double a, double b) {
+    return c ? a : b;
+}
+
+__declspec(safe,cost1)
+static inline uniform double select(uniform bool c, uniform double a,
+                                    uniform double b) {
+    return c ? a : b;
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // Horizontal ops / reductions

@@ -1469,22 +1589,17 @@ static inline void memory_barrier() {

 #define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE)                        \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
-    memory_barrier();                                                   \
    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
-    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) {      \
-    memory_barrier();                                                   \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
-    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
-    memory_barrier();                                                   \
    TA ret;                                                             \
    __foreach_active (i) {                                              \
        uniform TA * uniform p = ptrArray[i];                           \
@@ -1492,23 +1607,21 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
-    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \

 #define DEFINE_ATOMIC_SWAP(TA,TB)                                       \
 static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
-    memory_barrier();                                                   \
    uniform int i = 0;                                                  \
    TA ret[programCount];                                               \
    TA memVal;                                                          \
    uniform int lastSwap;                                               \
-    uniform int mask = lanemask();                                      \
+    uniform unsigned int64 mask = lanemask();                           \
    /* First, have the first running program instance (if any) perform  \
       the swap with memory with its value of "value"; record the       \
       value returned. */                                               \
    for (; i < programCount; ++i) {                                     \
-        if ((mask & (1 << i)) == 0)                                     \
+        if ((mask & (1ull << i)) == 0)                                  \
            continue;                                                   \
        memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
        lastSwap = i;                                                   \
@@ -1520,7 +1633,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
       current instance had executed a hardware atomic swap right before \
       the last one that did a swap. */                                 \
    for (; i < programCount; ++i) {                                     \
-        if ((mask & (1 << i)) == 0)                                     \
+        if ((mask & (1ull << i)) == 0)                                  \
            continue;                                                   \
        ret[lastSwap] = extract(value, i);                              \
        lastSwap = i;                                                   \
@@ -1528,20 +1641,16 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
    /* And the last instance that wanted to swap gets the value we      \
       originally got back from memory... */                            \
    ret[lastSwap] = memVal;                                             \
-    memory_barrier();                                                   \
    return ret[programIndex];                                           \
 }                                                                       \
 static inline uniform TA atomic_swap_global(uniform TA * uniform ptr,   \
                                            uniform TA value) {         \
-    memory_barrier();                                                   \
    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value);   \
-    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \
 static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
-    memory_barrier();                                                   \
    TA ret;                                                             \
    __foreach_active (i) {                                              \
        uniform TA * uniform p = ptrArray[i];                           \
@@ -1549,7 +1658,6 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
        uniform TA r = __atomic_swap_uniform_##TB##_global(p, v);       \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
-    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \

@@ -1557,25 +1665,19 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
 static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value);                            \
    TA ret;                                                             \
-    if (lanemask() != 0) {                                              \
-        memory_barrier();                                               \
+    if (lanemask() != 0)                                                \
        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval);      \
-        memory_barrier();                                               \
-    }                                                                   \
    return ret;                                                         \
 }                                                                       \
 static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) {      \
-    memory_barrier();                                                   \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
-    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \
 static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
                                       TA value) {                      \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
-    memory_barrier();                                                   \
    TA ret;                                                             \
    __foreach_active (i) {                                              \
        uniform TA * uniform p = ptrArray[i];                           \
@@ -1583,7 +1685,6 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
-    memory_barrier();                                                   \
    return ret;                                                         \
 }

@@ -1638,25 +1739,20 @@ DEFINE_ATOMIC_SWAP(double,double)
 #define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE)                           \
 static inline uniform TA atomic_compare_exchange_global(               \
         uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
-    memory_barrier();                                                      \
    uniform TA ret =                                                    \
        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
-    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \
 static inline TA atomic_compare_exchange_global(                           \
         uniform TA * uniform ptr, TA oldval, TA newval) {                 \
-    memory_barrier();                                                      \
    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval,  \
                                                     (MASKTYPE)__mask);    \
-    memory_barrier();                                                      \
    return ret;                                                            \
 } \
 static inline TA atomic_compare_exchange_global(               \
         uniform TA * varying ptr, TA oldval, TA newval) { \
    uniform TA * uniform ptrArray[programCount];                        \
    ptrArray[programIndex] = ptr;                                       \
-    memory_barrier();                                                   \
    TA ret;                                                             \
    __foreach_active (i) {                                              \
        uniform TA r =                                                  \
@@ -1665,7 +1761,6 @@ static inline TA atomic_compare_exchange_global(               \
                                                            extract(newval, i)); \
        ret = insert(ret, i, r);                                        \
    }                                                                   \
-    memory_barrier();                                                   \
    return ret;                                                         \
 }

@@ -1678,6 +1773,49 @@ ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)

 #undef ATOMIC_DECL_CMPXCHG

+// void * variants of swap and compare exchange
+
+static inline void *atomic_swap_global(void ** uniform ptr,
+                                       void * value) {
+    return (void *)atomic_swap_global((intptr_t * uniform)ptr,
+                                      (intptr_t)value);
+}
+
+static inline void * uniform atomic_swap_global(void ** uniform ptr,
+                                                void * uniform value) {
+    return (void * uniform)atomic_swap_global((intptr_t * uniform)ptr,
+                                              (uniform intptr_t)value);
+}
+
+static inline void *atomic_swap_global(void ** ptr, void * value) {
+    return (void *)atomic_swap_global((intptr_t *)ptr,
+                                      (intptr_t)value);
+}
+
+static inline void * 
+atomic_compare_exchange_global(void ** uniform ptr, 
+                               void * oldval, void * newval) {
+    return (void *)atomic_compare_exchange_global((intptr_t * uniform)ptr,
+                                                  (intptr_t)oldval,
+                                                  (intptr_t)newval);
+}
+
+static inline void * uniform
+atomic_compare_exchange_global(void ** uniform ptr, void * uniform oldval, 
+                               void * uniform newval) { 
+    return (void * uniform)atomic_compare_exchange_global((intptr_t * uniform)ptr,
+                                                          (uniform intptr_t)oldval,
+                                                          (uniform intptr_t)newval);
+}
+
+static inline void *
+atomic_compare_exchange_global(void ** ptr, void * oldval,
+                               void * newval) {
+    return (void *)atomic_compare_exchange_global((intptr_t *)ptr,
+                                                  (intptr_t)oldval,
+                                                  (intptr_t)newval);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // local atomics

@@ -1849,6 +1987,49 @@ LOCAL_CMPXCHG(double)
 #undef LOCAL_ATOMIC
 #undef LOCAL_CMPXCHG

+// void * variants of swap and compare exchange
+
+// void * swap through a uniform pointer; forwards to the intptr_t overload and casts the result back.
+static inline void *atomic_swap_local(void ** uniform ptr, void * value) {
+    intptr_t result = atomic_swap_local((intptr_t * uniform)ptr, (intptr_t)value);
+    return (void *)result;
+}
+
+// All-uniform void * swap; forwards to the uniform intptr_t overload and casts the result back.
+static inline void * uniform atomic_swap_local(void ** uniform ptr, void * uniform value) {
+    uniform intptr_t result = atomic_swap_local((intptr_t * uniform)ptr, (uniform intptr_t)value);
+    return (void * uniform)result;
+}
+
+// void * swap where neither ptr nor value is declared uniform; forwards to the intptr_t overload.
+static inline void *atomic_swap_local(void ** ptr, void * value) {
+    return (void *)atomic_swap_local((intptr_t *)ptr, (intptr_t)value);
+}
+
+// void * compare-exchange through a uniform pointer; forwards to the intptr_t overload.
+static inline void *
+atomic_compare_exchange_local(void ** uniform ptr, void * oldval, void * newval) {
+    intptr_t result = atomic_compare_exchange_local((intptr_t * uniform)ptr,
+                                                    (intptr_t)oldval, (intptr_t)newval);
+    return (void *)result;
+}
+
+// All-uniform void * compare-exchange; forwards to the uniform intptr_t overload.
+static inline void * uniform
+atomic_compare_exchange_local(void ** uniform ptr, void * uniform oldval, void * uniform newval) {
+    uniform intptr_t result = atomic_compare_exchange_local((intptr_t * uniform)ptr,
+                                  (uniform intptr_t)oldval, (uniform intptr_t)newval);
+    return (void * uniform)result;
+}
+
+// void * compare-exchange where no argument is declared uniform; forwards to the intptr_t overload.
+static inline void *
+atomic_compare_exchange_local(void ** ptr, void * oldval, void * newval) {
+    intptr_t result = atomic_compare_exchange_local((intptr_t *)ptr,
+                                                    (intptr_t)oldval, (intptr_t)newval);
+    return (void *)result;
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // Transcendentals (float precision)

@@ -2735,7 +2916,10 @@ static inline uniform float atan2(uniform float y, uniform float x) {

 __declspec(safe)
 static inline float exp(float x_full) {
-    if (__math_lib == __math_lib_svml) {
+    if (__have_native_transcendentals) {
+        return __exp_varying_float(x_full);
+    }
+    else if (__math_lib == __math_lib_svml) {
        return __svml_exp(x_full);
    }
    else if (__math_lib == __math_lib_system) {
@@ -2814,7 +2998,10 @@ static inline float exp(float x_full) {

 __declspec(safe)
 static inline uniform float exp(uniform float x_full) {
-    if (__math_lib == __math_lib_system ||
+    if (__have_native_transcendentals) {
+        return __exp_uniform_float(x_full);
+    }
+    else if (__math_lib == __math_lib_system ||
        __math_lib == __math_lib_svml) {
        return __stdlib_expf(x_full);
    }
@@ -2936,7 +3123,10 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo

 __declspec(safe)
 static inline float log(float x_full) {
-    if (__math_lib == __math_lib_svml) {
+    if (__have_native_transcendentals) {
+        return __log_varying_float(x_full);
+    }
+    else if (__math_lib == __math_lib_svml) {
        return __svml_log(x_full);
    }
    else if (__math_lib == __math_lib_system) {
@@ -3024,7 +3214,10 @@ static inline float log(float x_full) {

 __declspec(safe)
 static inline uniform float log(uniform float x_full) {
-    if (__math_lib == __math_lib_system ||
+    if (__have_native_transcendentals) {
+        return __log_uniform_float(x_full);
+    }
+    else if (__math_lib == __math_lib_system ||
        __math_lib == __math_lib_svml) {
        return __stdlib_logf(x_full);
    }
@@ -3105,7 +3298,10 @@ static inline uniform float log(uniform float x_full) {

 __declspec(safe)
 static inline float pow(float a, float b) {
-    if (__math_lib == __math_lib_svml) {
+    if (__have_native_transcendentals) {
+        return __pow_varying_float(a, b);
+    }
+    else if (__math_lib == __math_lib_svml) {
        return __svml_pow(a, b);
    }
    else if (__math_lib == __math_lib_system) {
@@ -3124,6 +3320,9 @@ static inline float pow(float a, float b) {

 __declspec(safe)
 static inline uniform float pow(uniform float a, uniform float b) {
+    if (__have_native_transcendentals) {
+        return __pow_uniform_float(a, b);
+    }
    if (__math_lib == __math_lib_system ||
        __math_lib == __math_lib_svml) {
        return __stdlib_powf(a, b);
@@ -3551,8 +3750,9 @@ static inline int16 float_to_half(float f) {
        //   like recursive filters in DSP - not a typical half-float application. Whether
        //   FP16 denormals are rare in practice, I don't know. Whatever slow path your HW
        //   may or may not have for denormals, this may well hit it.
-        int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask;
-        fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
+        float fscale = floatbits(fint & round_mask) * floatbits(magic);
+        fscale = min(fscale, floatbits((31 << 23) - 0x1000));
+        int32 fint2 = intbits(fscale) - round_mask;

        if (fint < f32infty)
            o = fint2 >> 13; // Take the bits!
@@ -3648,6 +3848,133 @@ static inline int16 float_to_half_fast(float f) {
    }
 }

+///////////////////////////////////////////////////////////////////////////
+// float -> srgb8
+
+// https://gist.github.com/2246678, from Fabian "rygorous" Giesen.
+//
+// The basic ideas are still the same, only this time, we squeeze
+// everything into the table, even the linear part of the range; since we
+// are approximating the function as piecewise linear anyway, this is
+// fairly easy.
+//
+// In the exact version of the conversion, any value that produces an
+// output float less than 0.5 will be rounded to an integer of
+// zero. Inverting the linear part of the transform, we get:
+//
+//   log2(0.5 / (255 * 12.92)) =~ -12.686
+//
+// which in turn means that any value smaller than about 2^(-12.686) will
+// return 0.  What this means is that we can adapt the clamping code to
+// just clamp to [2^(-13), 1-eps] and we're covered. This means our table
+// needs to cover a range of 13 different exponents from -13 to -1.
+//
+// The table lookup, storage and interpolation work exactly the same way
+// as in the earlier variants in the gist referenced above.
+//
+// Max error for the whole function (integer-rounded result minus "exact"
+// value, as computed in floats using the official formula): 0.544403 at
+// 0x3e9f8000
+
+// Converts a linear value (clamped to [2^(-13), 1-eps]) to an 8-bit sRGB code using the
+// packed bias/scale table below (13 exponents x 8 linear segments = 104 entries).
+__declspec(safe)
+static inline int float_to_srgb8(float in) {
+    static const uniform unsigned int table[104] = {
+        0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d,
+        0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
+        0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a,
+        0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
+        0x010e0033, 0x01280033, 0x01410033, 0x015b0033,
+        0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
+        0x01dc0067, 0x020f0067, 0x02430067, 0x02760067,
+        0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
+        0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce,
+        0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
+        0x06970158, 0x07420142, 0x07e30130, 0x087b0120,
+        0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
+        0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180,
+        0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
+        0x11070264, 0x1238023e, 0x1357021d, 0x14660201,
+        0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
+        0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad,
+        0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
+        0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392,
+        0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
+        0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5,
+        0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
+        0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d,
+        0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
+        0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f,
+        0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
+    };
+
+    static const uniform unsigned int almost_one = 0x3f7fffff;  // bit pattern of the largest float below 1
+    static const uniform unsigned int min_val = 0x39000000;     // bit pattern of 2^(-13); also the table's index base
+    // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 255, respectively.
+    in = max(in, floatbits(min_val));  // not 0.0f: smaller inputs would wrap the unsigned table index out of bounds
+    in = min(in, floatbits(almost_one));
+
+    // Do the table lookup (exponent + top 3 mantissa bits select the entry) and unpack bias, scale
+    unsigned int tab = table[(intbits(in) - min_val) >> 20];
+    unsigned int bias = (tab >> 16) << 9;
+    unsigned int scale = tab & 0xffff;
+
+    // Grab next-highest mantissa bits and perform linear interpolation
+    unsigned int t = (intbits(in) >> 12) & 0xff;
+    return (bias + scale*t) >> 16;
+}
+
+
+// Uniform overload: converts one linear value (clamped to [2^(-13), 1-eps]) to an 8-bit sRGB
+// code using the packed bias/scale table below (13 exponents x 8 linear segments = 104 entries).
+__declspec(safe)
+static inline uniform int float_to_srgb8(uniform float in) {
+    static const uniform unsigned int table[104] = {
+        0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d,
+        0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
+        0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a,
+        0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
+        0x010e0033, 0x01280033, 0x01410033, 0x015b0033,
+        0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
+        0x01dc0067, 0x020f0067, 0x02430067, 0x02760067,
+        0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
+        0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce,
+        0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
+        0x06970158, 0x07420142, 0x07e30130, 0x087b0120,
+        0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
+        0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180,
+        0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
+        0x11070264, 0x1238023e, 0x1357021d, 0x14660201,
+        0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
+        0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad,
+        0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
+        0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392,
+        0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
+        0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5,
+        0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
+        0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d,
+        0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
+        0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f,
+        0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
+    };
+
+    static const uniform unsigned int almost_one = 0x3f7fffff;  // bit pattern of the largest float below 1
+    static const uniform unsigned int min_val = 0x39000000;     // bit pattern of 2^(-13); also the table's index base
+    // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 255, respectively.
+    in = max(in, floatbits(min_val));  // not 0.0f: smaller inputs would wrap the unsigned table index out of bounds
+    in = min(in, floatbits(almost_one));
+
+    // Do the table lookup (exponent + top 3 mantissa bits select the entry) and unpack bias, scale
+    uniform unsigned int tab = table[(intbits(in) - min_val) >> 20];
+    uniform unsigned int bias = (tab >> 16) << 9;
+    uniform unsigned int scale = tab & 0xffff;
+
+    // Grab next-highest mantissa bits and perform linear interpolation
+    uniform unsigned int t = (intbits(in) >> 12) & 0xff;
+    return (bias + scale*t) >> 16;
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // RNG stuff

@@ -3699,60 +4026,13 @@ static inline uniform float frandom(uniform RNGState * uniform state)
    return floatbits(0x3F800000 | irand)-1.0f;
 }

-static inline uniform unsigned int __seed4(varying RNGState * uniform state, 
-                                           uniform int start,
-                                           uniform unsigned int seed) {
-    uniform unsigned int c1 = 0xf0f0f0f0;
-    uniform unsigned int c2 = 0x0f0f0f0f;
-
-    state->z1 = insert(state->z1, start + 0, seed);
-    state->z1 = insert(state->z1, start + 1, seed ^ c1);
-    state->z1 = insert(state->z1, start + 2, (seed << 3) ^ c1);
-    state->z1 = insert(state->z1, start + 3, (seed << 2) ^ c2);
-
-    seed += 131;
-    state->z2 = insert(state->z2, start + 0, seed);
-    state->z2 = insert(state->z2, start + 1, seed ^ c1);
-    state->z2 = insert(state->z2, start + 2, (seed << 3) ^ c1);
-    state->z2 = insert(state->z2, start + 3, (seed << 2) ^ c2);
-
-    seed ^= extract(state->z2, 2);
-    state->z3 = insert(state->z3, start + 0, seed);
-    state->z3 = insert(state->z3, start + 1, seed ^ c1);
-    state->z3 = insert(state->z3, start + 2, (seed << 3) ^ c1);
-    state->z3 = insert(state->z3, start + 3, (seed << 2) ^ c2);
-
-    seed <<= 4;
-    seed += 3;
-    seed ^= extract(state->z1, 3);
-    state->z4 = insert(state->z4, start + 0, seed);
-    state->z4 = insert(state->z4, start + 1, seed ^ c1);
-    state->z4 = insert(state->z4, start + 2, (seed << 3) ^ c1);
-    state->z4 = insert(state->z4, start + 3, (seed << 2) ^ c2);
-
-    return seed;
-}
-
 static inline void seed_rng(varying RNGState * uniform state, 
-                            uniform unsigned int seed) {
-    if (programCount == 1) {
-        state->z1 = seed;
-        state->z2 = seed ^ 0xbeeff00d;
-        state->z3 = ((seed & 0xffff) << 16) | (seed >> 16);
-        state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |
-                     ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24);
-    }
-    else {
-        seed = __seed4(state, 0, seed);
-        if (programCount == 8)
-            __seed4(state, 4, seed ^ 0xbeeff00d);
-        if (programCount == 16) {
-            __seed4(state, 4,  seed ^ 0xbeeff00d);
-            __seed4(state, 8,  ((seed & 0xffff) << 16) | (seed >> 16));
-            __seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |
-                                ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
-        }
-    }
+                            unsigned int seed) {  // seed is no longer declared uniform; the programCount special cases are gone
+    state->z1 = seed;  // z1..z4 are four different bit mixes of the same seed
+    state->z2 = seed ^ 0xbeeff00d;
+    state->z3 = ((seed & 0xffff) << 16) | (seed >> 16);  // swap the upper and lower 16-bit halves
+    state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |  // reverse the order of the four bytes
+                 ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24);
 }

 static inline void seed_rng(uniform RNGState * uniform state, 
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -40,6 +40,7 @@
 #include "util.h"
 #include "expr.h"
 #include "type.h"
+#include "func.h"
 #include "sym.h"
 #include "module.h"
 #include "llvmutil.h"
@@ -121,7 +122,7 @@ DeclStmt::DeclStmt(const std::vector<VariableDeclaration> &v, SourcePos p)

 static bool
 lHasUnsizedArrays(const Type *type) {
-    const ArrayType *at = dynamic_cast<const ArrayType *>(type);
+    const ArrayType *at = CastType<ArrayType>(type);
    if (at == NULL)
        return false;

@@ -139,7 +140,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {

    for (unsigned int i = 0; i < vars.size(); ++i) {
        Symbol *sym = vars[i].sym;
-        Assert(sym != NULL);
+        AssertPos(pos, sym != NULL);
        if (sym->type == NULL)
            continue;
        Expr *initExpr = vars[i].init;
@@ -167,16 +168,30 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
        }

        // References must have initializer expressions as well.
-        if (dynamic_cast<const ReferenceType *>(sym->type) && initExpr == NULL) {
-            Error(sym->pos,
-                  "Must provide initializer for reference-type variable \"%s\".",
-                  sym->name.c_str());
-            continue;
+        if (IsReferenceType(sym->type) == true) {
+            if (initExpr == NULL) {
+                Error(sym->pos, "Must provide initializer for reference-type "
+                      "variable \"%s\".", sym->name.c_str());
+                continue;
+            }
+            if (IsReferenceType(initExpr->GetType()) == false) {
+                const Type *initLVType = initExpr->GetLValueType();
+                if (initLVType == NULL) {
+                    Error(initExpr->pos, "Initializer for reference-type variable "
+                          "\"%s\" must have an lvalue type.", sym->name.c_str());
+                    continue;
+                }
+                if (initLVType->IsUniformType() == false) {
+                    Error(initExpr->pos, "Initializer for reference-type variable "
+                          "\"%s\" must have a uniform lvalue type.", sym->name.c_str());
+                    continue;
+                }
+            }
        }

-        LLVM_TYPE_CONST llvm::Type *llvmType = sym->type->LLVMType(g->ctx);
+        llvm::Type *llvmType = sym->type->LLVMType(g->ctx);
        if (llvmType == NULL) {
-            Assert(m->errorCount > 0);
+            AssertPos(pos, m->errorCount > 0);
            return;
        }

@@ -282,8 +297,8 @@ DeclStmt::TypeCheck() {
        // the int->float type conversion is in there and we don't return
        // an int as the constValue later...
        const Type *type = vars[i].sym->type;
-        if (dynamic_cast<const AtomicType *>(type) != NULL ||
-            dynamic_cast<const EnumType *>(type) != NULL) {
+        if (CastType<AtomicType>(type) != NULL ||
+            CastType<EnumType>(type) != NULL) {
            // If it's an expr list with an atomic type, we'll later issue
            // an error.  Need to leave vars[i].init as is in that case so
            // it is in fact caught later, though.
@@ -463,12 +478,12 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
        lEmitIfStatements(ctx, trueStmts, "if: expr mixed, true statements");
        // under varying control flow,, returns can't stop instruction
        // emission, so this better be non-NULL...
-        Assert(ctx->GetCurrentBasicBlock()); 
+        AssertPos(ctx->GetDebugPos(), ctx->GetCurrentBasicBlock()); 
    }
    if (falseStmts) {
        ctx->SetInternalMaskAndNot(oldMask, test);
        lEmitIfStatements(ctx, falseStmts, "if: expr mixed, false statements");
-        Assert(ctx->GetCurrentBasicBlock());
+        AssertPos(ctx->GetDebugPos(), ctx->GetCurrentBasicBlock());
    }
 }

@@ -549,7 +564,7 @@ IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
            (costIsAcceptable || g->opt.disableCoherentControlFlow)) {
            ctx->StartVaryingIf(oldMask);
            emitMaskedTrueAndFalse(ctx, oldMask, ltest);
-            Assert(ctx->GetCurrentBasicBlock());
+            AssertPos(pos, ctx->GetCurrentBasicBlock());
            ctx->EndIf();
        }
        else {
@@ -572,7 +587,7 @@ IfStmt::emitMaskAllOn(FunctionEmitContext *ctx, llvm::Value *ltest,
    // compiler see what's going on so that subsequent optimizations for
    // code emitted here can operate with the knowledge that the mask is
    // definitely all on (until it modifies the mask itself).
-    Assert(!g->opt.disableCoherentControlFlow);
+    AssertPos(pos, !g->opt.disableCoherentControlFlow);
    if (!g->opt.disableMaskAllOnOptimizations)
        ctx->SetInternalMask(LLVMMaskAllOn);
    llvm::Value *oldFunctionMask = ctx->GetFunctionMask();
@@ -622,7 +637,7 @@ IfStmt::emitMaskAllOn(FunctionEmitContext *ctx, llvm::Value *ltest,
    emitMaskedTrueAndFalse(ctx, LLVMMaskAllOn, ltest);
    // In this case, return/break/continue isn't allowed to jump and end
    // emission.
-    Assert(ctx->GetCurrentBasicBlock());
+    AssertPos(pos, ctx->GetCurrentBasicBlock());
    ctx->EndIf();
    ctx->BranchInst(bDone);

@@ -651,7 +666,7 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
        // Emit statements for true
        ctx->SetCurrentBasicBlock(bRunTrue);
        lEmitIfStatements(ctx, trueStmts, "if: expr mixed, true statements");
-        Assert(ctx->GetCurrentBasicBlock()); 
+        AssertPos(pos, ctx->GetCurrentBasicBlock()); 
        ctx->BranchInst(bNext);
        ctx->SetCurrentBasicBlock(bNext);
    }
@@ -668,7 +683,7 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
        // Emit code for false
        ctx->SetCurrentBasicBlock(bRunFalse);
        lEmitIfStatements(ctx, falseStmts, "if: expr mixed, false statements");
-        Assert(ctx->GetCurrentBasicBlock());
+        AssertPos(pos, ctx->GetCurrentBasicBlock());
        ctx->BranchInst(bNext);
        ctx->SetCurrentBasicBlock(bNext);
    }
@@ -822,7 +837,7 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
            ctx->SetFunctionMask(LLVMMaskAllOn);
        if (bodyStmts)
            bodyStmts->EmitCode(ctx);
-        Assert(ctx->GetCurrentBasicBlock());
+        AssertPos(pos, ctx->GetCurrentBasicBlock());
        ctx->SetFunctionMask(oldFunctionMask);
        ctx->BranchInst(btest);

@@ -830,7 +845,7 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
        ctx->SetCurrentBasicBlock(bMixed);
        if (bodyStmts)
            bodyStmts->EmitCode(ctx);
-        Assert(ctx->GetCurrentBasicBlock());
+        AssertPos(pos, ctx->GetCurrentBasicBlock());
        ctx->BranchInst(btest);
    }
    else {
@@ -971,7 +986,7 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
    // it and then jump into the loop test code.  (Also start a new scope
    // since the initiailizer may be a declaration statement).
    if (init) {
-        Assert(dynamic_cast<StmtList *>(init) == NULL);
+        AssertPos(pos, dynamic_cast<StmtList *>(init) == NULL);
        ctx->StartScope();
        init->EmitCode(ctx);
    }
@@ -1000,7 +1015,7 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
        if (doCoherentCheck)
            Warning(test->pos, "Uniform condition supplied to cfor/cwhile "
                    "statement.");
-        Assert(ltest->getType() == LLVMTypes::BoolType);
+        AssertPos(pos, ltest->getType() == LLVMTypes::BoolType);
        ctx->BranchInst(bloop, bexit, ltest);
    }
    else {
@@ -1036,7 +1051,7 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
            ctx->SetFunctionMask(LLVMMaskAllOn);
        if (stmts)
            stmts->EmitCode(ctx);
-        Assert(ctx->GetCurrentBasicBlock());
+        AssertPos(pos, ctx->GetCurrentBasicBlock());
        ctx->SetFunctionMask(oldFunctionMask);
        ctx->BranchInst(bstep);

@@ -1349,8 +1364,8 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
    ctx->SetFunctionMask(LLVMMaskAllOn);

    // This should be caught during typechecking
-    Assert(startExprs.size() == dimVariables.size() && 
-           endExprs.size() == dimVariables.size());
+    AssertPos(pos, startExprs.size() == dimVariables.size() && 
+              endExprs.size() == dimVariables.size());
    int nDims = (int)dimVariables.size();

    ///////////////////////////////////////////////////////////////////////
@@ -1689,7 +1704,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
        ctx->SetContinueTarget(bbFullBodyContinue);
        ctx->AddInstrumentationPoint("foreach loop body (all on)");
        stmts->EmitCode(ctx);
-        Assert(ctx->GetCurrentBasicBlock() != NULL);
+        AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL);
        ctx->BranchInst(bbFullBodyContinue);
    }
    ctx->SetCurrentBasicBlock(bbFullBodyContinue); {
@@ -2079,7 +2094,7 @@ SwitchStmt::EmitCode(FunctionEmitContext *ctx) const {

    const Type *type;
    if (expr == NULL || ((type = expr->GetType()) == NULL)) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
        return;
    }

@@ -2097,7 +2112,7 @@ SwitchStmt::EmitCode(FunctionEmitContext *ctx) const {

    llvm::Value *exprValue = expr->GetValue(ctx);
    if (exprValue == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
        return;
    }

@@ -2173,8 +2188,8 @@ SwitchStmt::EstimateCost() const {
 ///////////////////////////////////////////////////////////////////////////
 // ReturnStmt

-ReturnStmt::ReturnStmt(Expr *v, bool cc, SourcePos p) 
-    : Stmt(p), val(v), 
+ReturnStmt::ReturnStmt(Expr *e, bool cc, SourcePos p) 
+    : Stmt(p), expr(e),  // coherence check requested via cc is forced off when g->opt.disableCoherentControlFlow is set
      doCoherenceCheck(cc && !g->opt.disableCoherentControlFlow) {
 }

@@ -2189,8 +2204,29 @@ ReturnStmt::EmitCode(FunctionEmitContext *ctx) const {
        return;
    }

+    // Make sure we're not trying to return a reference to something where
+    // that doesn't make sense
+    const Function *func = ctx->GetFunction();
+    const Type *returnType = func->GetReturnType();
+    if (IsReferenceType(returnType) == true &&
+        IsReferenceType(expr->GetType()) == false) {
+        const Type *lvType = expr->GetLValueType();
+        if (lvType == NULL) {
+            Error(expr->pos, "Illegal to return non-lvalue from function "
+                  "returning reference type \"%s\".",
+                  returnType->GetString().c_str());
+            return;
+        }
+        else if (lvType->IsUniformType() == false) {
+            Error(expr->pos, "Illegal to return varying lvalue type from "
+                  "function returning a reference type \"%s\".",
+                  returnType->GetString().c_str());
+            return;
+        }
+    }
+
    ctx->SetDebugPos(pos);
-    ctx->CurrentLanesReturned(val, doCoherenceCheck);
+    ctx->CurrentLanesReturned(expr, doCoherenceCheck);
 }


@@ -2210,7 +2246,8 @@ void
 ReturnStmt::Print(int indent) const {
    printf("%*c%sReturn Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
    pos.Print();
-    if (val) val->Print();
+    if (expr)
+        expr->Print();
    else printf("(void)");
    printf("\n");
 }
@@ -2228,6 +2265,9 @@ GotoStmt::GotoStmt(const char *l, SourcePos gotoPos, SourcePos ip)

 void
 GotoStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (!ctx->GetCurrentBasicBlock()) 
+        return;
+
    if (ctx->VaryingCFDepth() > 0) {
        Error(pos, "\"goto\" statements are only legal under \"uniform\" "
              "control flow.");
@@ -2241,10 +2281,22 @@ GotoStmt::EmitCode(FunctionEmitContext *ctx) const {

    llvm::BasicBlock *bb = ctx->GetLabeledBasicBlock(label);
    if (bb == NULL) {
-        // TODO: use the string distance stuff to suggest alternatives if
-        // there are some with names close to the label name we have here..
-        Error(identifierPos, "No label named \"%s\" found in current function.",
-              label.c_str());
+        /* Label wasn't found. Look for suggestions that are close */
+        std::vector<std::string> labels = ctx->GetLabels();
+        std::vector<std::string> matches = MatchStrings(label, labels);
+        std::string match_output;
+        if (! matches.empty()) {
+            /* Print up to 5 matches. Don't want to spew too much */
+            match_output += "\nDid you mean:";
+            for (unsigned int i=0; i<matches.size() && i<5; i++)
+                match_output += "\n " + matches[i] + "?";
+        }
+
+        /* Label wasn't found. Emit an error */
+        Error(identifierPos, 
+                "No label named \"%s\" found in current function.%s",
+              label.c_str(), match_output.c_str());
+
        return;
    }

@@ -2290,7 +2342,7 @@ LabeledStmt::LabeledStmt(const char *n, Stmt *s, SourcePos p)
 void
 LabeledStmt::EmitCode(FunctionEmitContext *ctx) const {
    llvm::BasicBlock *bblock = ctx->GetLabeledBasicBlock(name);
-    Assert(bblock != NULL);
+    AssertPos(pos, bblock != NULL);

    // End the current basic block with a jump to our basic block and then
    // set things up for emission to continue there.  Note that the current
@@ -2409,7 +2461,7 @@ lEncodeType(const Type *t) {
    if (Type::Equal(t, AtomicType::VaryingUInt64)) return 'V';
    if (Type::Equal(t, AtomicType::UniformDouble)) return 'd';
    if (Type::Equal(t, AtomicType::VaryingDouble)) return 'D';
-    if (dynamic_cast<const PointerType *>(t) != NULL) {
+    if (CastType<PointerType>(t) != NULL) {
        if (t->IsUniformType())
            return 'p';
        else
@@ -2429,7 +2481,7 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
    if (type == NULL)
        return NULL;

-    if (dynamic_cast<const ReferenceType *>(type) != NULL) {
+    if (CastType<ReferenceType>(type) != NULL) {
        expr = new RefDerefExpr(expr, expr->pos);
        type = expr->GetType();
        if (type == NULL)
@@ -2457,7 +2509,7 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
    else {
        argTypes.push_back(t);

-        LLVM_TYPE_CONST llvm::Type *llvmExprType = type->LLVMType(g->ctx);
+        llvm::Type *llvmExprType = type->LLVMType(g->ctx);
        llvm::Value *ptr = ctx->AllocaInst(llvmExprType, "print_arg");
        llvm::Value *val = expr->GetValue(ctx);
        if (!val)
@@ -2478,6 +2530,9 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
 */
 void
 PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (!ctx->GetCurrentBasicBlock()) 
+        return;
+
    ctx->SetDebugPos(pos);

    // __do_print takes 5 arguments; we'll get them stored in the args[] array
@@ -2494,7 +2549,7 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
    std::string argTypes;

    if (values == NULL) {
-        LLVM_TYPE_CONST llvm::Type *ptrPtrType = 
+        llvm::Type *ptrPtrType = 
            llvm::PointerType::get(LLVMTypes::VoidPointerType, 0);
        args[4] = llvm::Constant::getNullValue(ptrPtrType);
    }
@@ -2506,7 +2561,7 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
        int nArgs = elist ? elist->exprs.size() : 1;

        // Allocate space for the array of pointers to values to be printed 
-        LLVM_TYPE_CONST llvm::Type *argPtrArrayType = 
+        llvm::Type *argPtrArrayType = 
            llvm::ArrayType::get(LLVMTypes::VoidPointerType, nArgs);
        llvm::Value *argPtrArray = ctx->AllocaInst(argPtrArrayType,
                                                   "print_arg_ptrs");
@@ -2542,7 +2597,7 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {

    // Now we can emit code to call __do_print()
    llvm::Function *printFunc = m->module->getFunction("__do_print");
-    Assert(printFunc);
+    AssertPos(pos, printFunc);

    llvm::Value *mask = ctx->GetFullMask();
    // Set up the rest of the parameters to it
@@ -2583,6 +2638,9 @@ AssertStmt::AssertStmt(const std::string &msg, Expr *e, SourcePos p)

 void
 AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (!ctx->GetCurrentBasicBlock()) 
+        return;
+
    if (expr == NULL)
        return;
    const Type *type = expr->GetType();
@@ -2595,7 +2653,7 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
    llvm::Function *assertFunc = 
        isUniform ? m->module->getFunction("__do_assert_uniform") :
                    m->module->getFunction("__do_assert_varying");
-    Assert(assertFunc != NULL);
+    AssertPos(pos, assertFunc != NULL);

    char *errorString;
    if (asprintf(&errorString, "%s:%d:%d: Assertion failed: %s\n", 
@@ -2658,20 +2716,23 @@ DeleteStmt::DeleteStmt(Expr *e, SourcePos p)

 void
 DeleteStmt::EmitCode(FunctionEmitContext *ctx) const {
+    if (!ctx->GetCurrentBasicBlock()) 
+        return;
+
    const Type *exprType;
    if (expr == NULL || ((exprType = expr->GetType()) == NULL)) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
        return;
    }

    llvm::Value *exprValue = expr->GetValue(ctx);
    if (exprValue == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
        return;
    }

    // Typechecking should catch this
-    Assert(dynamic_cast<const PointerType *>(exprType) != NULL);
+    AssertPos(pos, CastType<PointerType>(exprType) != NULL);

    if (exprType->IsUniformType()) {
        // For deletion of a uniform pointer, we just need to cast the
@@ -2680,7 +2741,7 @@ DeleteStmt::EmitCode(FunctionEmitContext *ctx) const {
        exprValue = ctx->BitCastInst(exprValue, LLVMTypes::VoidPointerType,
                                     "ptr_to_void");
        llvm::Function *func = m->module->getFunction("__delete_uniform");
-        Assert(func != NULL);
+        AssertPos(pos, func != NULL);

        ctx->CallInst(func, NULL, exprValue, "");
    }
@@ -2690,7 +2751,7 @@ DeleteStmt::EmitCode(FunctionEmitContext *ctx) const {
        // only need to extend to 64-bit values on 32-bit targets before
        // calling it.
        llvm::Function *func = m->module->getFunction("__delete_varying");
-        Assert(func != NULL);
+        AssertPos(pos, func != NULL);
        if (g->target.is32Bit)
            exprValue = ctx->ZExtInst(exprValue, LLVMTypes::Int64VectorType,
                                      "ptr_to_64");
@@ -2711,7 +2772,7 @@ DeleteStmt::TypeCheck() {
    if (expr == NULL || ((exprType = expr->GetType()) == NULL))
        return NULL;

-    if (dynamic_cast<const PointerType *>(exprType) == NULL) {
+    if (CastType<PointerType>(exprType) == NULL) {
        Error(pos, "Illegal to delete non-pointer type \"%s\".",
              exprType->GetString().c_str());
        return NULL;
@@ -2743,7 +2804,7 @@ DeleteStmt::EstimateCost() const {
 Stmt *
 CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
    if (iterSym == NULL) {
-        Assert(m->errorCount > 0);
+        AssertPos(pos, m->errorCount > 0);
        return NULL;
    }

@@ -2770,11 +2831,11 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
    // First, call __movmsk(__mask)) to get the mask as a set of bits.
    // This should be hoisted out of the loop
    Symbol *maskSym = m->symbolTable->LookupVariable("__mask");
-    Assert(maskSym != NULL);
+    AssertPos(pos, maskSym != NULL);
    Expr *maskVecExpr = new SymbolExpr(maskSym, pos);
    std::vector<Symbol *> mmFuns;
    m->symbolTable->LookupFunction("__movmsk", &mmFuns);
-    Assert(mmFuns.size() == 2);
+    AssertPos(pos, mmFuns.size() == (g->target.maskBitCount == 32 ? 2 : 1));
    FunctionSymbolExpr *movmskFunc = new FunctionSymbolExpr("__movmsk", mmFuns,
                                                            pos);
    ExprList *movmskArgs = new ExprList(maskVecExpr, pos);
@@ -2782,7 +2843,7 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
                                                        pos);

    // Compute the per lane mask to test the mask bits against: (1 << iter)
-    ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, 1,
+    ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt64, int64_t(1),
                                       iterSym->pos);
    Expr *shiftLaneExpr = new BinaryExpr(BinaryExpr::Shl, oneExpr, symExpr, 
                                         pos);
@@ -2802,4 +2863,3 @@ CreateForeachActiveStmt(Symbol *iterSym, Stmt *stmts, SourcePos pos) {
    // And return a for loop that wires it all together.
    return new ForStmt(initStmt, testExpr, stepStmt, laneCheckIf, false, pos);
 }
-
--- a/stmt.h
+++ b/stmt.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -265,7 +265,7 @@ public:
    statement in the program. */
 class ReturnStmt : public Stmt {
 public:
-    ReturnStmt(Expr *v, bool cc, SourcePos p);
+    ReturnStmt(Expr *e, bool cc, SourcePos p);

    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;
@@ -273,7 +273,7 @@ public:
    Stmt *TypeCheck();
    int EstimateCost() const;

-    Expr *val;
+    Expr *expr;
    /** This indicates whether the generated code will check to see if no
        more program instances are currently running after the return, in
        which case the code can possibly jump to the end of the current
--- a/sym.cpp
+++ b/sym.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -56,12 +56,6 @@ Symbol::Symbol(const std::string &n, SourcePos p, const Type *t,
 }


-std::string
-Symbol::MangledName() const {
-    return name + type->Mangle();
-}
-
-
 ///////////////////////////////////////////////////////////////////////////
 // SymbolTable

@@ -72,27 +66,31 @@ SymbolTable::SymbolTable() {

 SymbolTable::~SymbolTable() {
    // Otherwise we have mismatched push/pop scopes
-    Assert(variables.size() == 1 && types.size() == 1);
+    Assert(variables.size() == 1);
    PopScope();
 }


 void
 SymbolTable::PushScope() { 
-    variables.push_back(new SymbolMapType);
-    types.push_back(new TypeMapType);
+    SymbolMapType *sm;
+    if (freeSymbolMaps.size() > 0) {
+        sm = freeSymbolMaps.back();
+        freeSymbolMaps.pop_back();
+        sm->erase(sm->begin(), sm->end());
+    }
+    else
+        sm = new SymbolMapType;
+
+    variables.push_back(sm);
 }


 void
 SymbolTable::PopScope() { 
    Assert(variables.size() > 1);
-    delete variables.back();
+    freeSymbolMaps.push_back(variables.back());
    variables.pop_back();
-
-    Assert(types.size() > 1);
-    delete types.back();
-    types.pop_back();
 }


@@ -147,7 +145,7 @@ SymbolTable::LookupVariable(const char *name) {

 bool
 SymbolTable::AddFunction(Symbol *symbol) {
-    const FunctionType *ft = dynamic_cast<const FunctionType *>(symbol->type);
+    const FunctionType *ft = CastType<FunctionType>(symbol->type);
    Assert(ft != NULL);
    if (LookupFunction(symbol->name.c_str(), ft) != NULL)
        // A function of the same name and type has already been added to
@@ -192,26 +190,17 @@ SymbolTable::LookupFunction(const char *name, const FunctionType *type) {

 bool
 SymbolTable::AddType(const char *name, const Type *type, SourcePos pos) {
-    // Like AddVariable(), we go backwards through the type maps, working
-    // from innermost scope to outermost.
-    for (int i = types.size()-1; i >= 0; --i) {
-        TypeMapType &sm = *(types[i]);
-        if (sm.find(name) != sm.end()) {
-            if (i == (int)types.size() - 1) {
-                Error(pos, "Ignoring redefinition of type \"%s\".", name);
-                return false;
-            }
-            else {
-                Warning(pos, "Type \"%s\" shadows type declared in outer scope.", name);
-                TypeMapType &sm = *(types.back());
-                sm[name] = type;
-                return true;
-            }
-        }
+    const Type *t = LookupType(name);
+    if (t != NULL && CastType<UndefinedStructType>(t) == NULL) {
+        // If we have a previous declaration of anything other than an
+        // UndefinedStructType with this struct name, issue an error.  If
+        // we have an UndefinedStructType, then we'll fall through to the
+        // code below that adds the definition to the type map.
+        Error(pos, "Ignoring redefinition of type \"%s\".", name);
+        return false;
    }

-    TypeMapType &sm = *(types.back());
-    sm[name] = type;
+    types[name] = type;
    return true;
 }

@@ -219,11 +208,9 @@ SymbolTable::AddType(const char *name, const Type *type, SourcePos pos) {
 const Type *
 SymbolTable::LookupType(const char *name) const {
-    // Again, search through the type maps backward to get scoping right.
-    for (int i = types.size()-1; i >= 0; --i) {
-        TypeMapType &sm = *(types[i]);
-        if (sm.find(name) != sm.end())
-            return sm[name];
-    }
+    // Types are not scoped: a single lookup in the one type map suffices.
+    TypeMapType::const_iterator iter = types.find(name);
+    if (iter != types.end())
+        return iter->second;
     return NULL;
 }
 }

@@ -288,21 +275,19 @@ SymbolTable::closestTypeMatch(const char *str, bool structsVsEnums) const {
    const int maxDelta = 2;
    std::vector<std::string> matches[maxDelta+1];

-    for (unsigned int i = 0; i < types.size(); ++i) {
-        TypeMapType::const_iterator iter;
-        for (iter = types[i]->begin(); iter != types[i]->end(); ++iter) {
-            // Skip over either StructTypes or EnumTypes, depending on the
-            // value of the structsVsEnums parameter
-            bool isEnum = (dynamic_cast<const EnumType *>(iter->second) != NULL);
-            if (isEnum && structsVsEnums)
-                continue;
-            else if (!isEnum && !structsVsEnums)
-                continue;
+    TypeMapType::const_iterator iter;
+    for (iter = types.begin(); iter != types.end(); ++iter) {
+        // Skip over either StructTypes or EnumTypes, depending on the
+        // value of the structsVsEnums parameter
+        bool isEnum = (CastType<EnumType>(iter->second) != NULL);
+        if (isEnum && structsVsEnums)
+            continue;
+        else if (!isEnum && !structsVsEnums)
+            continue;

-            int dist = StringEditDistance(str, iter->first, maxDelta+1);
-            if (dist <= maxDelta)
-                matches[dist].push_back(iter->first);
-        }
+        int dist = StringEditDistance(str, iter->first, maxDelta+1);
+        if (dist <= maxDelta)
+            matches[dist].push_back(iter->first);
    }

    for (int i = 0; i <= maxDelta; ++i) {
@@ -342,16 +327,12 @@ SymbolTable::Print() {

    depth = 0;
    fprintf(stderr, "Named types:\n---------------\n");
-    for (unsigned int i = 0; i < types.size(); ++i) {
-        TypeMapType &sm = *types[i];
-        TypeMapType::iterator siter = sm.begin();
-        while (siter != sm.end()) {
-            fprintf(stderr, "%*c", depth, ' ');
-            fprintf(stderr, "%s -> %s\n", siter->first.c_str(),
-                    siter->second->GetString().c_str());
-            ++siter;
-        }
-        depth += 4;
+    TypeMapType::iterator siter = types.begin();
+    while (siter != types.end()) {
+        fprintf(stderr, "%*c", depth, ' ');
+        fprintf(stderr, "%s -> %s\n", siter->first.c_str(),
+                siter->second->GetString().c_str());
+        ++siter;
    }
 }

@@ -382,14 +363,16 @@ SymbolTable::RandomSymbol() {

 const Type *
 SymbolTable::RandomType() {
-    int v = ispcRand() % types.size();
-    if (types[v]->size() == 0)
-        return NULL;
-    int count = ispcRand() % types[v]->size();
-    TypeMapType::iterator iter = types[v]->begin();
+    // No named types have been registered, so there is nothing to pick.
+    if (types.empty())
+        return NULL;
+    // Step a random distance in [0, size) from begin() so that the
+    // iterator always lands on a valid entry rather than on end().
+    int count = ispcRand() % types.size();
+    TypeMapType::iterator iter = types.begin();
     while (count-- > 0) {
         ++iter;
-        Assert(iter != types[v]->end());
+        Assert(iter != types.end());
     }
     return iter->second;
 }
--- a/sym.h
+++ b/sym.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2010-2012, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -67,15 +67,8 @@ public:
    Symbol(const std::string &name, SourcePos pos, const Type *t = NULL,
           StorageClass sc = SC_NONE);

-    /** This method should only be called for function symbols; for them,
-        it returns a mangled version of the function name with the argument
-        types encoded into the returned name.  This is used to generate
-        unique symbols in object files for overloaded functions.
-     */
-    std::string MangledName() const;
-
    SourcePos pos;            /*!< Source file position where the symbol was defined */
-    const std::string name;   /*!< Symbol's name */
+    std::string name;         /*!< Symbol's name */
    llvm::Value *storagePtr;  /*!< For symbols with storage associated with
                                   them (i.e. variables but not functions),
                                   this member stores a pointer to its
@@ -208,6 +201,9 @@ public:
    /** Adds the named type to the symbol table.  This is used for both
        struct definitions (where <tt>struct Foo</tt> causes type \c Foo to
        be added to the symbol table) as well as for <tt>typedef</tt>s.
+        If \c name currently refers to a forward-declared struct
+        ("struct Foo;"), i.e. an UndefinedStructType, this method
+        replaces it with the actual struct definition being added.

        @param name Name of the type to be added
        @param type Type that \c name represents
@@ -264,6 +260,8 @@ private:
    typedef std::map<std::string, Symbol *> SymbolMapType;
    std::vector<SymbolMapType *> variables;

+    std::vector<SymbolMapType *> freeSymbolMaps;
+
    /** Function declarations are *not* scoped.  (C99, for example, allows
        an implementation to maintain function declarations in a single
        namespace.)  A STL \c vector is used to store the function symbols
@@ -272,12 +270,10 @@ private:
    typedef std::map<std::string, std::vector<Symbol *> > FunctionMapType;
    FunctionMapType functions;

-    /** Type definitions can also be scoped.  A new \c TypeMapType
-        is added to the back of the \c types \c vector each time a new scope
-        is entered.  (And it's removed when the scope exits).
+    /** Type definitions can't currently be scoped.
     */
    typedef std::map<std::string, const Type *> TypeMapType;
-    std::vector<TypeMapType *> types;
+    TypeMapType types;
 };


--- a/test_static.cpp
+++ b/test_static.cpp
@@ -102,15 +102,21 @@ void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {

 int main(int argc, char *argv[]) {
    int w = width();
-    assert(w <= 16);
+    assert(w <= 64);

-    float returned_result[16];
-    for (int i = 0; i < 16; ++i)
+    float returned_result[64];
+    float vfloat[64];
+    double vdouble[64];
+    int vint[64], vint2[64];
+
+    for (int i = 0; i < 64; ++i) {
        returned_result[i] = -1e20;
-    float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
-    double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
-    int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
-    int vint2[16] = { 5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
+        vfloat[i] = i+1;
+        vdouble[i] = i+1;
+        vint[i] = 2*(i+1);
+        vint2[i] = i+5;
+    }
+
    float b = 5.;

 #if (TEST_SIG == 0)
@@ -131,8 +137,8 @@ int main(int argc, char *argv[]) {
 #error "Unknown or unset TEST_SIG value"
 #endif    

-    float expected_result[16];
-    memset(expected_result, 0, 16*sizeof(float));
+    float expected_result[64];
+    memset(expected_result, 0, 64*sizeof(float));
    result(expected_result);

    int errors = 0;
--- a/tests/acos.ispc
+++ b/tests/acos.ispc
@@ -6,14 +6,15 @@ bool ok(float x, float ref) { return (abs(x - ref) < 1e-6) || abs((x-ref)/ref) <

 export void f_v(uniform float RET[]) {
    uniform float vals[8] = { 0, 1, 0.5, -1, -.87, -.25, 1e-3, -.99999999 };
+    // r is indexed 0..7 below, so it needs 8 slots even when programCount < 8.
    uniform float r[8];
    foreach (i = 0 ... 8)
-        r[i] = cos(acos(vals[i]));
+        r[i] = cos(acos(vals[i % 8]));

    int errors = 0;
    for (uniform int i = 0; i < 8; ++i) {
-        if (ok(r[i], vals[i]) == false) {
-            print("error @ %: got %, expected %\n", i, r[i], vals[i]);
+        if (ok(r[i], vals[i%8]) == false) {
+            print("error @ %: got %, expected %\n", i, r[i], vals[i%8]);
            ++errors;
        }
    }
--- a/tests/aossoa-1.ispc
+++ b/tests/aossoa-1.ispc
@@ -3,7 +3,9 @@ export uniform int width() { return programCount; }

 export void f_v(uniform float RET[]) {
 #define width 3
-#define maxProgramCount 16
+#define maxProgramCount 64
+    assert(programCount <= maxProgramCount);
+
 //CO    const uniform int width = 3;
 //CO    const uniform int maxProgramCount = 16;
    uniform float a[width*maxProgramCount], r[width*maxProgramCount];
--- a/tests/aossoa-2.ispc
+++ b/tests/aossoa-2.ispc
@@ -3,7 +3,9 @@ export uniform int width() { return programCount; }

 export void f_v(uniform float RET[]) {
 #define width 4
-#define maxProgramCount 16
+#define maxProgramCount 64
+    assert(programCount <= maxProgramCount);
+
 //CO    const uniform int width = 4;
 //CO    const uniform int maxProgramCount = 16;
    uniform float a[width*maxProgramCount], r[width*maxProgramCount];
--- a/tests/aossoa-5.ispc
+++ b/tests/aossoa-5.ispc
@@ -3,7 +3,9 @@ export uniform int width() { return programCount; }

 export void f_v(uniform float RET[]) {
 #define width 3
-#define maxProgramCount 16
+#define maxProgramCount 64
+    assert(programCount <= maxProgramCount);
+
 //CO    const uniform int width = 3;
 //CO    const uniform int maxProgramCount = 16;
    uniform int a[width*maxProgramCount], r[width*maxProgramCount];
--- a/tests/aossoa-6.ispc
+++ b/tests/aossoa-6.ispc
@@ -3,7 +3,9 @@ export uniform int width() { return programCount; }

 export void f_v(uniform float RET[]) {
 #define width 4
-#define maxProgramCount 16
+#define maxProgramCount 64
+    assert(programCount <= maxProgramCount);
+
 //CO    const uniform int width = 4;
 //CO    const uniform int maxProgramCount = 16;
    uniform int a[width*maxProgramCount], r[width*maxProgramCount];
--- a/tests/array-gather-ifs.ispc
+++ b/tests/array-gather-ifs.ispc
@@ -5,9 +5,9 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    uniform float x[45];
+    uniform float x[programCount+15];
    uniform int i;
-    for (i = 0; i < 45; ++i)
+    for (i = 0; i < programCount+15; ++i)
        x[i] = i;

    float ret;
--- a/tests/array-gather-multi-unif.ispc
+++ b/tests/array-gather-multi-unif.ispc
@@ -10,7 +10,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    for (uniform int i = 0; i < 29+b; ++i)
        for (uniform int j = 0; j < 29+b; ++j)
            x[i][j] = 0;
-    x[a][a] = a;
+    if (a < 34)
+        x[a][a] = a;
    RET[programIndex] = x[4][4] + x[1][1] + x[b][b] + x[0][0];
 }

--- a/tests/array-gather-simple.ispc
+++ b/tests/array-gather-simple.ispc
@@ -12,8 +12,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }

 export void result(uniform float RET[]) { 
-    RET[0] = 1; RET[4] = 5; RET[8] = 9; RET[12] = 13;
-    RET[1] = RET[5] = RET[9] = RET[13] = 0;
-    RET[2] = 6; RET[6] = 14; RET[10] = 22; RET[14] = 30;
-    RET[3] = RET[7] = RET[11] = RET[15] = 3;
+    for (uniform int i = 0; i < programCount; i += 4) {
+        RET[i] = i+1;
+        RET[i+1] = 0;
+        RET[i+2] = 2 * (i+3);
+        RET[i+3] = 3;
+    }
 }
--- a/tests/array-gather-unif-runflags.ispc
+++ b/tests/array-gather-unif-runflags.ispc
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    uniform float x[45];
+    uniform float x[programCount+5];
    uniform int i;
-    for (i = 0; i < 45; ++i)
+    for (i = 0; i < programCount+5; ++i)
        x[i] = i+b;
    a -= 1;
    if (a == 3) a = 0;
--- a/tests/array-gather-unif.ispc
+++ b/tests/array-gather-unif.ispc
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    uniform float x[45];
+    uniform float x[programCount+5];
    uniform int i;
-    for (i = 0; i < 45; ++i)
+    for (i = 0; i < programCount+5; ++i)
        x[i] = i+b;
    RET[programIndex] = x[a];
 }
--- a/tests/array-gather-vary.ispc
+++ b/tests/array-gather-vary.ispc
@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    float x[55];
+    float x[programCount+10];
    uniform int i;
-    for (i = 0; i < 45; ++i)
+    for (i = 0; i < programCount+10; ++i)
        x[i] = a+b;
    RET[programIndex] = x[a];
 }


 export void result(uniform float RET[]) {
-    RET[programIndex] = 6 + programIndex;;
+    RET[programIndex] = 6 + programIndex;
 }
--- a/tests/array-mixed-unif-vary-indexing-2.ispc
+++ b/tests/array-mixed-unif-vary-indexing-2.ispc
@@ -15,6 +15,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
        x[a][b-1] = 0;
    else
        x[a][b-1] = 1;
+
+    a = min(a, 46);
+
    RET[programIndex] = x[3][a];
 }

--- a/tests/array-mixed-unif-vary-indexing-3.ispc
+++ b/tests/array-mixed-unif-vary-indexing-3.ispc
@@ -4,9 +4,10 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    uniform float x[47][47];
-    for (uniform int i = 0; i < 47; ++i)
-        for (uniform int j = 0; j < 47; ++j)
+    assert(programCount <= 64);
+    uniform float x[70][70];
+    for (uniform int i = 0; i < 70; ++i)
+        for (uniform int j = 0; j < 70; ++j)
            x[i][j] = 2+b-5;

    // all are 2 except (4,2) = 0, (4,...) = 1, (4,programCount-1)=2
--- a/tests/array-mixed-unif-vary-indexing.ispc
+++ b/tests/array-mixed-unif-vary-indexing.ispc
@@ -10,6 +10,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
        for (uniform int j = 0; j < 47; ++j)
            x[i][j] = 2+b-5;

+    a = min(a,46);
    x[a][b-1] = 0;
    RET[programIndex] = x[2][a];
 }
--- a/tests/array-multidim-gather-scatter.ispc
+++ b/tests/array-multidim-gather-scatter.ispc
@@ -11,7 +11,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {

    uniform int index[4] = { 0, 1, 2, 4 };
    float v = index[programIndex & 0x3];
-    x[a][v] = 0;
+    x[min(a,39)][v] = 0;
    RET[programIndex] = x[v+1][v];
 }

--- a/tests/array-pointer-duality-1.ispc
+++ b/tests/array-pointer-duality-1.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }

 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float a[programCount+4];
-    for (unsigned int i = 0; i < programCount+4; ++i)
+    for (uniform int i = 0; i < programCount+4; ++i)
        a[i] = aFOO[min((int)i, programCount)];

    RET[programIndex] = *(a + 2);
--- a/tests/array-scatter-unif-2.ispc
+++ b/tests/array-scatter-unif-2.ispc
@@ -4,9 +4,8 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    uniform float x[100];
-    // HACK to avoid @llvm.memset...
-    for (uniform int i = 0; i < b*20; ++i)
+    uniform float x[2*programCount];
+    for (uniform int i = 0; i < 2*programCount; ++i)
        x[i] = 0;
    
    x[2*(a-1)] = b;
--- a/tests/array-scatter-unif-3.ispc
+++ b/tests/array-scatter-unif-3.ispc
@@ -4,9 +4,8 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    uniform float x[100];
-    // HACK to avoid @llvm.memset...
-    for (uniform int i = 0; i < b*20; ++i)
+    uniform float x[2*programCount];
+    for (uniform int i = 0; i < 2*programCount; ++i)
        x[i] = 0;

    x[2*(a-1)] = b;
--- a/tests/array-scatter-unif.ispc
+++ b/tests/array-scatter-unif.ispc
@@ -5,8 +5,8 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    uniform float x[40];
-    for (uniform int i = 0; i < 40; ++i)
+    uniform float x[programCount+5];
+    for (uniform int i = 0; i < programCount+5; ++i)
        x[i] = 0.;
    x[a] = 2;
    RET[programIndex] = x[4] + x[0] + x[5];
--- a/tests/array-scatter-vary.ispc
+++ b/tests/array-scatter-vary.ispc
@@ -4,9 +4,8 @@ export uniform int width() { return programCount; }

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
-    float x[30];
-    // HACK to avoid @llvm.memset...
-    for (uniform int i = 0; i < b*6; ++i)
+    float x[2*programCount];
+    for (uniform int i = 0; i < 2*programCount; ++i)
        x[i] = 0;
    x[a] = a;
    RET[programIndex] = x[4] + x[0] + x[5];
--- a/tests/array-struct-gather.ispc
+++ b/tests/array-struct-gather.ispc
@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }


 struct Foo {
-    uniform float x[17];
+    uniform float x[programCount+1];
 };

 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    float a = aFOO[programIndex];
    uniform Foo foo;
    uniform int i;
-    for (i = 0; i < 17; ++i)
+    for (i = 0; i < programCount+1; ++i)
        foo.x[i] = i;

    if ((int)a & 1)
--- a/tests/asin.ispc
+++ b/tests/asin.ispc
@@ -8,12 +8,12 @@ export void f_v(uniform float RET[]) {
    uniform float vals[8] = { 0, 1, 0.5, -1, -.87, -.25, 1e-3, -.99999999 };
    uniform float r[8];
    foreach (i = 0 ... 8)
-        r[i] = sin(asin(vals[i]));
+        r[i] = sin(asin(vals[i%8]));

    int errors = 0;
    for (uniform int i = 0; i < 8; ++i) {
-        if (ok(r[i], vals[i]) == false) {
-            print("error @ %: got %, expected %\n", i, r[i], vals[i]);
+        if (ok(r[i], vals[i%8]) == false) {
+            print("error @ %: got %, expected %\n", i, r[i], vals[i%8]);
            ++errors;
        }
    }
--- a/tests/atomics-12.ispc
+++ b/tests/atomics-12.ispc
@@ -6,14 +6,14 @@ uniform unsigned int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    float b = 0;
-    if (programIndex & 1)
+    if (programIndex < 30 && programIndex & 1)
        b = atomic_or_global(&s, (1 << programIndex));
    RET[programIndex] = s;
 }

 export void result(uniform float RET[]) {
    uniform int sum = 0;
-    for (uniform int i = 0; i < programCount; ++i)
+    for (uniform int i = 0; i < min(30, programCount); ++i)
        if (i & 1)
            sum += (1 << i);
    RET[programIndex] = sum;
--- a/tests/atomics-13.ispc
+++ b/tests/atomics-13.ispc
@@ -5,12 +5,12 @@ uniform unsigned int32 s = 0;

 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    float b = 0;
-    if (programIndex & 1)
+    int32 b = 0;
+    if (programIndex < 32 && programIndex & 1)
        b = atomic_or_global(&s, (1 << programIndex));
    RET[programIndex] = popcnt(reduce_max((int32)b));
 }

 export void result(uniform float RET[]) {
-    RET[programIndex] = programCount == 1 ? 0 : ((programCount/2) - 1);
+    RET[programIndex] = programCount == 1 ? 0 : ((min(32, programCount)/2) - 1);
 }
--- a/tests/atomics-14.ispc
+++ b/tests/atomics-14.ispc
@@ -7,14 +7,14 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    float b = 0;
    if (programIndex & 1)
-        b = atomic_or_global(&s, (1 << programIndex));
+        b = atomic_or_global(&s, (1ull << programIndex));
    RET[programIndex] = (s>>20);
 }

 export void result(uniform float RET[]) {
-    uniform int sum = 0;
+    uniform int64 sum = 0;
    for (uniform int i = 0; i < programCount; ++i)
        if (i & 1)
-            sum += (1 << i);
+            sum += (1ull << i);
    RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20;
 }
--- a/tests/atomics-4.ispc
+++ b/tests/atomics-4.ispc
@@ -5,10 +5,10 @@ uniform int32 s = 0;

 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    float b = atomic_or_global(&s, (1<<programIndex));
+    float b = atomic_or_global(&s, (1<<min(programIndex,30)));
    RET[programIndex] = s;
 }

 export void result(uniform float RET[]) {
-    RET[programIndex] = (1<<programCount)-1;
+    RET[programIndex] = (1<<min(programCount,31))-1;
 }
--- a/tests/coalesce-1.ispc
+++ b/tests/coalesce-1.ispc
@@ -5,7 +5,8 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform buf = uniform new uniform float[32*32];
    for (uniform int i = 0; i < 32*32; ++i)
        buf[i] = i;
-
+    
+    assert(programIndex <= 64);
    RET[programIndex] = buf[64-programIndex];
 }

--- a/tests/count-leading-trailing-zeros-4.ispc
+++ b/tests/count-leading-trailing-zeros-4.ispc
@@ -3,10 +3,10 @@ export uniform int width() { return programCount; }


 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    int32 i = (1 << programIndex);
+    int32 i = (1 << (programIndex % 28));
    RET[programIndex] = count_leading_zeros(i);
 }

 export void result(uniform float RET[]) {
-    RET[programIndex] = 31-programIndex;
+    RET[programIndex] = 31-(programIndex%28);
 }
--- a/tests/count-leading-trailing-zeros-5.ispc
+++ b/tests/count-leading-trailing-zeros-5.ispc
@@ -3,10 +3,10 @@ export uniform int width() { return programCount; }


 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    unsigned int64 i = ((unsigned int64)1 << (40+programIndex));
+    unsigned int64 i = ((unsigned int64)1 << min(63, 40+programIndex));
    RET[programIndex] = count_trailing_zeros(i);
 }

 export void result(uniform float RET[]) {
-    RET[programIndex] = 40+programIndex;
+    RET[programIndex] = min(63, 40+programIndex);
 }
--- a/tests/exclusive-scan-add-1.ispc
+++ b/tests/exclusive-scan-add-1.ispc
@@ -5,8 +5,17 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    RET[programIndex] = exclusive_scan_add(programIndex); 
 }

-export void result(uniform float RET[]) {
-    uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28,
-                             36, 45, 55, 66, 78, 91, 105, 120 };
-    RET[programIndex] = result[programIndex]; 
+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = es(programIndex); 
 }
--- a/tests/exclusive-scan-add-10.ispc
+++ b/tests/exclusive-scan-add-10.ispc
@@ -10,11 +10,19 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }


+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+
 export void result(uniform float RET[]) {
-    uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12, 
-                               0, 20, 0, 30, 0, 42, 0, 56 };
-    if (programIndex & 1)
-        RET[programIndex] = result[programIndex]; 
-    else
+    RET[programIndex] = es((programIndex & 1) ? (programIndex+1) : 0);
+    if ((programIndex & 1) == 0)
        RET[programIndex] = -1;
 }
--- a/tests/exclusive-scan-add-2.ispc
+++ b/tests/exclusive-scan-add-2.ispc
@@ -5,8 +5,17 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    RET[programIndex] = exclusive_scan_add(aFOO[programIndex]);
 }

-export void result(uniform float RET[]) {
-    uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28,
-                             36, 45, 55, 66, 78, 91, 105, 120, 136 };
-    RET[programIndex] = result[programIndex]; 
+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = es(programIndex+1); 
 }
--- a/tests/exclusive-scan-add-3.ispc
+++ b/tests/exclusive-scan-add-3.ispc
@@ -9,8 +9,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }

 export void result(uniform float RET[]) {
-    uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0 };
+    uniform int result[] = { 0, 1, 3 };
    RET[programIndex] = -1;
    if (programIndex <= 1)
        RET[programIndex] = result[programIndex]; 
--- a/tests/exclusive-scan-add-5.ispc
+++ b/tests/exclusive-scan-add-5.ispc
@@ -9,12 +9,20 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    }
 }

+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+

 export void result(uniform float RET[]) {
-    uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12, 
-                               0, 20, 0, 30, 0, 42, 0, 56 };
-    if (programIndex & 1)
-        RET[programIndex] = result[programIndex]; 
-    else
+    RET[programIndex] = es((programIndex & 1) ? (programIndex+1) : 0);
+    if ((programIndex & 1) == 0)
        RET[programIndex] = -1;
 }
--- a/tests/exclusive-scan-add-6.ispc
+++ b/tests/exclusive-scan-add-6.ispc
@@ -5,8 +5,17 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    RET[programIndex] = exclusive_scan_add((float)programIndex); 
 }

-export void result(uniform float RET[]) {
-    uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28,
-                             36, 45, 55, 66, 78, 91, 105, 120 };
-    RET[programIndex] = result[programIndex]; 
+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = es(programIndex); 
 }
--- a/tests/exclusive-scan-add-7.ispc
+++ b/tests/exclusive-scan-add-7.ispc
@@ -5,8 +5,17 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    RET[programIndex] = exclusive_scan_add((double)aFOO[programIndex]);
 }

-export void result(uniform float RET[]) {
-    uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28,
-                             36, 45, 55, 66, 78, 91, 105, 120, 136 };
-    RET[programIndex] = result[programIndex]; 
+int es(int v) {
+    uniform int vv[programCount];
+    vv[programIndex] = v;
+
+    uniform int r[programCount];
+    r[0] = 0;
+    for (uniform int i = 1; i < programCount; ++i)
+        r[i] = r[i-1] + vv[i-1];
+    return r[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = es(programIndex+1); 
 }
--- a/tests/exclusive-scan-and-2.ispc
+++ b/tests/exclusive-scan-and-2.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    RET[programIndex] = -1;
    int32 a = ~(1 << programIndex);
-    if ((programIndex & 1) == 0) {
+    if ((programIndex < 32) && (programIndex & 1) == 0) {
        RET[programIndex] = exclusive_scan_and(a);
    }
 }
@@ -12,7 +12,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {

 export void result(uniform float RET[]) {
    RET[programIndex] = -1;
-    if ((programIndex & 1) == 0 && programIndex > 0) {
+    if ((programIndex & 1) == 0 && programIndex > 0 && programIndex < 32) {
        int val = 0xffffffff;
        for (int i = 0; i < programIndex-1; i += 2)
            val &= ~(1<<i);
--- a/tests/exclusive-scan-or-1.ispc
+++ b/tests/exclusive-scan-or-1.ispc
@@ -3,11 +3,11 @@ export uniform int width() { return programCount; }

 export void f_f(uniform float RET[], uniform float aFOO[]) {
    RET[programIndex] = -1;
-    int32 a = (1 << programIndex);
+    int32 a = (1 << (min(programIndex, 30)));
    RET[programIndex] = exclusive_scan_or(a);
 }


 export void result(uniform float RET[]) {
-    RET[programIndex] = (1 << programIndex) - 1;
+    RET[programIndex] = (1 << (min(programIndex, 31))) - 1;
 }
--- a/tests/foreach-mask-1.ispc
+++ b/tests/foreach-mask-1.ispc
@@ -10,8 +10,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {

    // make sure we reset the func mask in the foreach loop...
    if ((int)aFOO[programIndex] & 1)
-        foreach (i = 0 ... programCount+3)
-            val[i] += aFOO[i] - 1;
+        foreach (i = 0 ... programCount+3) {
+            int ic = min(i, programCount-1);
+            val[i] += aFOO[ic] - 1 + i-ic;
+        }

    RET[programIndex] = val[3+programIndex]; 
 }
--- a/tests/foreach-mask.ispc
+++ b/tests/foreach-mask.ispc
@@ -5,8 +5,10 @@ export uniform int width() { return programCount; }
 // make sure we reset the func mask in the foreach loop...

 void update(uniform float val[], const uniform float a[]) {
-    foreach (i = 0 ... programCount+3)
-        val[i] += a[i] - 1;
+    foreach (i = 0 ... programCount+3) {
+        int ic = min(i, programCount-1);
+        val[i] += a[ic] - 1 + i-ic;
+    }
 }

 export void f_f(uniform float RET[], uniform float aFOO[]) {
--- a/tests/frexp-double-1.ispc
+++ b/tests/frexp-double-1.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }


 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    double a = (1<<programIndex) * 1.5;
+    double a = (1<< (programIndex % 28)) * 1.5;
    if (programIndex & 1)
        a = -a;
    int exponent;
@@ -12,5 +12,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }

 export void result(uniform float RET[]) {
-    RET[programIndex] = 1+programIndex;
+    RET[programIndex] = 1+(programIndex % 28);
 }
--- a/Show More
+++ b/Show More