43 Commits

Author SHA1 Message Date
Matt Pharr
6f6e28077f Release notes and doxygen bump for 1.1.1 2011-12-15 13:17:08 -08:00
Matt Pharr
0a9a7c939a Fix test runner script to not crash if one of the tests_errors didn't return the expected result. 2011-12-15 12:38:41 -08:00
Matt Pharr
f30a5dea79 Linux build fixes 2011-12-15 12:23:26 -08:00
Matt Pharr
018b547c40 Fix language builtin assert() (which was broken by 8d1b77b). 2011-12-15 12:10:27 -08:00
Matt Pharr
e82a720223 Fix various warnings / build issues on Windows 2011-12-15 12:06:38 -08:00
Matt Pharr
8d1b77b235 Have assertion macro and FATAL() text ask user to file a bug, provide URL to do so.
Switch to Assert() from assert() to make it clear it's not the C stdlib one we're
using any more.
2011-12-15 11:11:16 -08:00
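
As an illustration only, a minimal sketch of an assertion macro of this kind might look like the following; the message text, issue-tracker wording, and exact behavior of ispc's Assert() are assumptions here, not the actual definition.

    #include <cstdio>
    #include <cstdlib>

    // Hypothetical sketch of a compiler-internal Assert() that asks the user to
    // file a bug; the wording and URL are placeholders, not ispc's actual text.
    #define Assert(expr)                                                        \
        do {                                                                    \
            if (!(expr)) {                                                      \
                fprintf(stderr,                                                 \
                        "%s:%d: Assertion failed: \"%s\".\n"                    \
                        "This is probably a bug in the compiler; please file "  \
                        "a report at the project's issue tracker.\n",           \
                        __FILE__, __LINE__, #expr);                             \
                abort();                                                        \
            }                                                                   \
        } while (0)
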
Matt Pharr
b8987faeee Do assignment lvalue error checking in type checking
Added some tests related to this.
Also improved source file position reporting in error reporting.
2011-12-15 11:09:23 -08:00
Matt Pharr
17fdab2793 Issue errors if array dimensions are negative or too large to fit in 32 bits. 2011-12-15 06:00:42 -08:00
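
A minimal sketch of the kind of validation this adds (hypothetical function and message text, not the actual ispc code):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical sketch: reject negative array dimensions and dimensions
    // that don't fit in a signed 32-bit count.
    bool checkArrayDimension(int64_t size) {
        if (size < 0) {
            fprintf(stderr, "Error: array dimension must be non-negative.\n");
            return false;
        }
        if (size > INT32_MAX) {
            fprintf(stderr, "Error: array dimension is too large to represent "
                    "in 32 bits.\n");
            return false;
        }
        return true;
    }
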
Matt Pharr
1fa6520cb6 Improvements to constant int parsing.
Accept 'u' and 'l' suffixes to force the constants to be corresponding types.
Just carry around a single 64-bit int value in yylval rather than having both
32- and 64-bit variants.
2011-12-15 06:00:42 -08:00
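
As a rough illustration (hypothetical helper, not the actual flex/bison code), accepting the suffixes and carrying a single 64-bit value might look like this:

    #include <cstdint>
    #include <cstdlib>

    // Hypothetical sketch of integer-constant parsing: the numeric value is
    // always held as 64 bits, and trailing 'u'/'l' suffixes set flags that
    // later determine the constant's type.
    struct IntConstant {
        uint64_t value;     // single 64-bit payload (no separate 32-bit variant)
        bool isUnsigned;    // 'u' or 'U' suffix seen
        bool is64Bit;       // 'l', 'L', or "ll" suffix seen
    };

    IntConstant parseIntConstant(const char *text) {
        char *suffix = nullptr;
        IntConstant c;
        c.value = strtoull(text, &suffix, 0);   // base 0: decimal, octal, or hex
        c.isUnsigned = c.is64Bit = false;
        for (; *suffix; ++suffix) {
            if (*suffix == 'u' || *suffix == 'U') c.isUnsigned = true;
            if (*suffix == 'l' || *suffix == 'L') c.is64Bit = true;
        }
        return c;
    }
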
Matt Pharr
b6af5c16c6 Remove old / unused warnings. 2011-12-15 06:00:41 -08:00
Matt Pharr
10ebe88abf Directly emit code for the mask checks at the start of complex functions.
Previously, we used an IfStmt to wrap complex functions with the equivalent
of a "cif" to check to see if the mask was all on, all off, or mixed at the
start of executing non-trivial functions.  This had the unintended side
effect of suggesting to other parts of the compiler that the entire function
was under varying control flow (which in turn led to some small code
quality issues.)

Now, we emit the equivalent code directly.
2011-12-15 06:00:41 -08:00
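
Conceptually, the check now emitted directly at the top of a non-trivial function behaves like the sketch below (hypothetical names; the real code emits LLVM IR through FunctionEmitContext rather than C++):

    #include <cstdio>

    // Hypothetical sketch of the function-entry mask dispatch: all-off skips
    // the body, all-on takes a fast path with no per-lane masking, and a
    // mixed mask falls back to the general masked path. Emitting this check
    // directly avoids making the whole body look like it is under varying
    // control flow.
    enum class MaskState { AllOn, AllOff, Mixed };

    static void bodyAssumingFullMask() { puts("fast path: no masking needed"); }
    static void bodyWithMasking()      { puts("general path: masked stores etc."); }

    void functionEntry(MaskState mask) {
        if (mask == MaskState::AllOff)
            return;                         // no active program instances
        else if (mask == MaskState::AllOn)
            bodyAssumingFullMask();
        else
            bodyWithMasking();
    }
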
Matt Pharr
c0b41ad6f5 Fix bug in mask selection for references.
We should always use the full mask when storing to a reference, since we
don't in general know what it refers to (and thence the appropriate mask
to use for its target).
2011-12-15 06:00:41 -08:00
Matt Pharr
9920b30318 Fix bug that led to incorrect code with return statements.
The conceptual error was the assumption that not being under varying
control flow implied that the mask was all on; this is not the case
if some of the instances have executed a return earlier in the function's
execution.  The error in practice would be that the mask would be
assumed to be all-on for things like memory writes, so there would
be unintended side-effects for the instances that had returned.
2011-12-15 06:00:31 -08:00
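
The fix can be illustrated with a small bitmask sketch (hypothetical names): the mask used for side effects must also exclude lanes that have already returned, even when no varying control flow is active.

    #include <cassert>
    #include <cstdint>

    // Hypothetical bitmask illustration: one bit per program instance.
    struct LaneMasks {
        uint32_t internalMask;    // varying-control-flow mask (all on here)
        uint32_t returnedLanes;   // lanes that executed 'return' earlier
    };

    // Mask that should guard memory writes and other side effects.
    uint32_t storeMask(const LaneMasks &m) {
        // The buggy assumption was "no varying control flow => all-on mask";
        // lanes that already returned must still be excluded.
        return m.internalMask & ~m.returnedLanes;
    }

    int main() {
        LaneMasks m = { 0xFFu, 0x0Fu };     // 8 lanes; the low 4 already returned
        assert(storeMask(m) == 0xF0u);      // only still-active lanes may store
        return 0;
    }
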
Matt Pharr
07f218137a Actually typecheck the arguments to functions called through function pointers.
(Somehow this wasn't being done before.)
Errors are now issued if too few arguments are used when calling through
a function pointer, too many arguments are used, or if any of them can't be
type converted to the parameter type.
2011-12-14 12:22:49 -08:00
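
A simplified sketch of the checks this adds (hypothetical Type stand-in; ispc's real type-conversion machinery is richer than this):

    #include <cstdio>
    #include <string>
    #include <vector>

    // Hypothetical sketch of typechecking a call through a function pointer:
    // the argument count must match, and each argument must be convertible
    // to the corresponding parameter type.
    struct Type { std::string name; };

    static bool canConvert(const Type &from, const Type &to) {
        return from.name == to.name;        // stand-in for real conversion rules
    }

    bool checkIndirectCall(const std::vector<Type> &params,
                           const std::vector<Type> &args) {
        if (args.size() != params.size()) {
            fprintf(stderr, "Error: %zu argument(s) passed, %zu expected.\n",
                    args.size(), params.size());
            return false;
        }
        for (size_t i = 0; i < args.size(); ++i) {
            if (!canConvert(args[i], params[i])) {
                fprintf(stderr, "Error: argument %zu can't be converted to "
                        "parameter type \"%s\".\n", i + 1, params[i].name.c_str());
                return false;
            }
        }
        return true;
    }
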
Matt Pharr
89a5248f4f Print better error messages when function overload resolution fails. 2011-12-14 11:41:34 -08:00
Matt Pharr
891919074e Partial fix of a malformed program crasher.
Starts to address issue #135, but then a later assertion hits.
2011-12-14 11:41:02 -08:00
Matt Pharr
4adf527a4d Fix numerous typos in documentation (goodness) 2011-12-14 10:26:35 -08:00
Matt Pharr
533b539780 Add additional examples to better explain execution model to documentation. 2011-12-14 10:23:19 -08:00
Matt Pharr
6f26ae9801 Fix bugs with offsetting for varying values with gathers/scatters.
Fixes issue #134.
2011-12-12 14:13:46 -08:00
Matt Pharr
ddcdfff3ae Fix run_tests.py to print all output from tests (if any) 2011-12-12 14:13:01 -08:00
Matt Pharr
5b48354d9a Fix crashes from malformed programs. 2011-12-12 13:47:46 -08:00
Matt Pharr
46bfef3fce Add option to turn off codegen improvements when mask 'all on' is statically known. 2011-12-11 16:16:36 -08:00
Matt Pharr
20536bb339 Fix mandelbrot_tasks example 2011-12-11 15:21:11 -08:00
Matt Pharr
f6605ee465 Small cleanup: allocate storage for the full mask in the FunctionEmitContext constructor 2011-12-10 13:33:28 -08:00
Matt Pharr
034507a35b Update examples: bulk task launch in stencil/mandelbrot, use foreach more. 2011-12-10 11:11:30 -08:00
Matt Pharr
0b2febcec0 Update volume rendering workload: use AVX, remove reduce_equal() path.
Both of these changes gave a performance benefit!
2011-12-09 17:40:50 -08:00
Matt Pharr
d2fa735ef1 Provide ISPC_POINTER_SIZE predefined macro (32 or 64) 2011-12-09 16:37:42 -08:00
Matt Pharr
20f34b67da Fix typo in documentation 2011-12-09 16:31:03 -08:00
Matt Pharr
03f3db1e89 Fix bugs in ForeachStmt::TypeCheck() and Optimize() methods.
Specifically, we weren't storing the results passed back from when we called
those methods of the start and end exprs.  This manifested itself as overloaded
functions there not resolving properly.
2011-12-08 15:29:20 -08:00
Matt Pharr
9805b0742d Switch to avx-x2 for the stencil workload 2011-12-08 14:36:09 -08:00
Matt Pharr
6000c696b2 Small fixes to optimization disabling code. 2011-12-08 14:35:57 -08:00
Matt Pharr
5a2edf723b Update with latest performance numbers. 2011-12-08 14:35:22 -08:00
Matt Pharr
aec7da740a Fix malformed program crashes. 2011-12-08 14:35:12 -08:00
Matt Pharr
a79bc75b72 Add a number of symbol names to the list of symbols made internal after loading builtins.
Fixes issue #131; because they weren't being marked as internal before, when
compiling to multiple targets these would lead to multiply-defined symbols.
2011-12-07 08:30:38 -08:00
Matt Pharr
eaaebf7928 Small documentation cleanups 2011-12-06 16:52:02 -08:00
Matt Pharr
198aa9620e Fix bug with mask used for gather/scatter code generation.
We should always use the full mask for this, never the internal mask.
Added tests for this.
2011-12-06 15:51:56 -08:00
Matt Pharr
27c53a3c25 Try 3 on warning about no output file specified 2011-12-06 14:44:41 -08:00
Matt Pharr
bd70182369 Add some additional tests 2011-12-06 14:26:52 -08:00
Matt Pharr
04df63d955 Update run_tests.py to work on Windows. Removed JIT-based testing path entirely. 2011-12-06 13:46:20 -08:00
Matt Pharr
d59131d670 Fix warning to not print "Warning" twice 2011-12-06 09:03:44 -08:00
Matt Pharr
9475e13d81 Issue a warning if no output file is specified. 2011-12-06 08:21:34 -08:00
Matt Pharr
765d86076f Basic support for AVX2 when building with LLVM3.1svn
For now this target just uses the same builtins-*.ll files as the
regular AVX1 target.  Once the gather intrinsic is available from
LLVM, we'll want to have custom target files that call out to that
for gathers. (The integer min/max intrinsics should be wired up to
the __{min,max}_varying_{int,uint}*() builtins at that point as
well.)
2011-12-06 08:20:53 -08:00
Matt Pharr
e2b6ed3db8 Fix build for LLVM2.9 and 3.1svn 2011-12-06 08:08:41 -08:00
63 changed files with 1520 additions and 1657 deletions

View File

@@ -5,20 +5,32 @@
ARCH_OS = $(shell uname)
ARCH_TYPE = $(shell arch)
ifeq ($(shell llvm-config --version), 3.1svn)
LLVM_LIBS=-lLLVMAsmParser -lLLVMInstrumentation -lLLVMLinker \
-lLLVMArchive -lLLVMBitReader -lLLVMDebugInfo -lLLVMJIT -lLLVMipo \
-lLLVMBitWriter -lLLVMTableGen -lLLVMCBackendInfo \
-lLLVMX86Disassembler -lLLVMX86CodeGen -lLLVMSelectionDAG \
-lLLVMAsmPrinter -lLLVMX86AsmParser -lLLVMX86Desc -lLLVMX86Info \
-lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCDisassembler -lLLVMMCParser \
-lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMTransformUtils \
-lLLVMipa -lLLVMAnalysis -lLLVMMCJIT -lLLVMRuntimeDyld \
-lLLVMExecutionEngine -lLLVMTarget -lLLVMMC -lLLVMObject -lLLVMCore \
-lLLVMSupport
else
LLVM_LIBS=$(shell llvm-config --libs)
endif
CLANG=clang
CLANG_LIBS = -lclangFrontend -lclangDriver \
-lclangSerialization -lclangParse -lclangSema \
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
ISPC_LIBS=$(CLANG_LIBS) \
$(shell llvm-config --ldflags --libs) \
-lpthread -ldl
ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
-lpthread -ldl
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)
LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
BUILD_DATE=$(shell date +%Y%m%d)
BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
@@ -59,7 +71,7 @@ OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
$(FLEX_SRC:.ll=.o))
default: ispc ispc_test
default: ispc
.PHONY: dirs clean depend doxygen print_llvm_src
.PRECIOUS: objs/builtins-%.cpp
@@ -78,7 +90,7 @@ print_llvm_src:
@echo Using LLVM `llvm-config --version` from `llvm-config --libdir`
clean:
/bin/rm -rf objs ispc ispc_test
/bin/rm -rf objs ispc
doxygen:
/bin/rm -rf docs/doxygen
@@ -88,10 +100,6 @@ ispc: print_llvm_src dirs $(OBJS)
@echo Creating ispc executable
@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
ispc_test: dirs ispc_test.cpp
@echo Creating ispc_test executable
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)
objs/%.o: %.cpp
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<

View File

@@ -8,7 +8,6 @@ REM Both the LLVM binaries and python need to be in the path
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
msbuild ispc_test.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild

View File

@@ -301,7 +301,7 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
}
define <4 x float> @__vec4_add_float(<4 x float> %v0,
<4 x float> %v1) nounwind readnone alwaysinline {
<4 x float> %v1) nounwind readnone alwaysinline {
%v = fadd <4 x float> %v0, %v1
ret <4 x float> %v
}
@@ -325,7 +325,7 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
; helper function for reduce_add_int32
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
<4 x i32> %v1) nounwind readnone alwaysinline {
<4 x i32> %v1) nounwind readnone alwaysinline {
%v = add <4 x i32> %v0, %v1
ret <4 x i32> %v
}

View File

@@ -144,7 +144,7 @@ define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alway
; from %1, and otherwise return the value from %0.
define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
<4 x i32> %mask) nounwind readnone alwaysinline {
<4 x i32> %mask) nounwind readnone alwaysinline {
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
%cleared_old = and <4 x i32> %0, %notmask
%masked_new = and <4 x i32> %1, %mask
@@ -153,7 +153,7 @@ define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
}
define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
<4 x i32> %mask) nounwind readnone alwaysinline {
<4 x i32> %mask) nounwind readnone alwaysinline {
%v0 = bitcast <4 x float> %0 to <4 x i32>
%v1 = bitcast <4 x float> %1 to <4 x i32>
%r = call <4 x i32> @__vselect_i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %mask)

View File

@@ -252,7 +252,7 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
; helper function for reduce_add_int32
define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
<4 x i32> %v1) nounwind readnone alwaysinline {
<4 x i32> %v1) nounwind readnone alwaysinline {
%v = add <4 x i32> %v0, %v1
ret <4 x i32> %v
}

View File

@@ -257,7 +257,7 @@ static void
lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
#if 0
// FIXME: handle globals?
assert(module->global_empty());
Assert(module->global_empty());
#endif
llvm::Module::iterator iter;
@@ -287,11 +287,11 @@ lCheckModuleIntrinsics(llvm::Module *module) {
// check the llvm.x86.* intrinsics for now...
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
assert(id != 0);
Assert(id != 0);
LLVM_TYPE_CONST llvm::Type *intrinsicType =
llvm::Intrinsic::getType(*g->ctx, id);
intrinsicType = llvm::PointerType::get(intrinsicType, 0);
assert(func->getType() == intrinsicType);
Assert(func->getType() == intrinsicType);
}
}
}
@@ -311,8 +311,12 @@ lCheckModuleIntrinsics(llvm::Module *module) {
static void
lSetInternalFunctions(llvm::Module *module) {
const char *names[] = {
"__add_float",
"__add_int32",
"__add_uniform_double",
"__add_uniform_int32",
"__add_uniform_int64",
"__add_varying_double",
"__add_varying_int32",
"__add_varying_int64",
"__aos_to_soa3_float",
@@ -543,6 +547,10 @@ lSetInternalFunctions(llvm::Module *module) {
"__svml_pow",
"__undef_uniform",
"__undef_varying",
"__vec4_add_float",
"__vec4_add_int32",
"__vselect_float",
"__vselect_i32",
};
int count = sizeof(names) / sizeof(names[0]);
@@ -583,9 +591,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
// linking together modules with incompatible target triples..
llvm::Triple mTriple(m->module->getTargetTriple());
llvm::Triple bcTriple(bcModule->getTargetTriple());
assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
mTriple.getArch() == bcTriple.getArch());
assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
Assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
mTriple.getVendor() == bcTriple.getVendor());
bcModule->setTargetTriple(mTriple.str());
@@ -631,7 +639,7 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
llvm::Function *func = module->getFunction(name);
assert(func != NULL); // it should be declared already...
Assert(func != NULL); // it should be declared already...
func->addFnAttr(llvm::Attribute::AlwaysInline);
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
@@ -718,6 +726,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
}
break;
case Target::AVX:
case Target::AVX2:
switch (g->target.vectorWidth) {
case 8:
extern unsigned char builtins_bitcode_avx[];

View File

@@ -715,7 +715,7 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
%eltvec`'i = insertelement <$1 x $3> %eltvec`'eval(i-1), $3 %red`'eval(i-1), i32 i')
; make the atomic call, passing it the final reduced value
ifelse(`LLVM_VERSION', `LLVM_2_9',`
ifelse(LLVM_VERSION, `LLVM_2_9',`
%final0 = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %red`'eval($1-1))', `
%final0 = atomicrmw $2 $3 * %ptr, $3 %red`'eval($1-1) seq_cst')
@@ -747,7 +747,7 @@ ifelse(`LLVM_VERSION', `LLVM_2_9',`
define(`global_atomic_uniform', `
ifelse(`LLVM_VERSION', `LLVM_2_9',`
ifelse(LLVM_VERSION, `LLVM_2_9',`
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
@@ -771,7 +771,7 @@ define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
;; $2: llvm type of the vector elements (e.g. i32)
;; $3: ispc type of the elements (e.g. int32)
ifelse(`LLVM_VERSION', `LLVM_2_9',`
ifelse(LLVM_VERSION, `LLVM_2_9',`
declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)')
@@ -784,7 +784,7 @@ define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
per_lane($1, <$1 x i32> %mask, `
%val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
ifelse(`LLVM_VERSION', `LLVM_2_9',`
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', `
%r_LANE_ID = atomicrmw xchg $2 * %ptr, $2 %val_LANE_ID seq_cst')
%rp_LANE_ID = getelementptr $2 * %rptr32, i32 LANE
@@ -796,7 +796,7 @@ ifelse(`LLVM_VERSION', `LLVM_2_9',`
define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
<$1 x i32> %mask) nounwind alwaysinline {
ifelse(`LLVM_VERSION', `LLVM_2_9',`
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', `
%r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst')
ret $2 %r
@@ -812,7 +812,7 @@ ifelse(`LLVM_VERSION', `LLVM_2_9',`
define(`global_atomic_exchange', `
ifelse(`LLVM_VERSION', `LLVM_2_9',`
ifelse(LLVM_VERSION, `LLVM_2_9',`
declare $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)')
define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
@@ -823,7 +823,7 @@ define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp,
per_lane($1, <$1 x i32> %mask, `
%cmp_LANE_ID = extractelement <$1 x $2> %cmp, i32 LANE
%val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE
ifelse(`LLVM_VERSION', `LLVM_2_9',`
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r_LANE_ID = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp_LANE_ID,
$2 %val_LANE_ID)', `
%r_LANE_ID = cmpxchg $2 * %ptr, $2 %cmp_LANE_ID, $2 %val_LANE_ID seq_cst')
@@ -836,7 +836,7 @@ ifelse(`LLVM_VERSION', `LLVM_2_9',`
define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
$2 %val, <$1 x i32> %mask) nounwind alwaysinline {
ifelse(`LLVM_VERSION', `LLVM_2_9',`
ifelse(LLVM_VERSION, `LLVM_2_9',`
%r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', `
%r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst')
ret $2 %r
@@ -1784,7 +1784,7 @@ define void
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetching
ifelse(`LLVM_VERSION', `LLVM_2_9',
ifelse(LLVM_VERSION, `LLVM_2_9',
`
declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality)

ctx.cpp (299 changed lines)
View File

@@ -89,7 +89,7 @@ struct CFInfo {
private:
CFInfo(CFType t, bool uniformIf, llvm::Value *sm) {
assert(t == If);
Assert(t == If);
type = t;
isUniform = uniformIf;
savedBreakTarget = savedContinueTarget = NULL;
@@ -99,7 +99,7 @@ private:
CFInfo(CFType t, bool iu, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
llvm::Value *lm) {
assert(t == Loop);
Assert(t == Loop);
type = t;
isUniform = iu;
savedBreakTarget = bt;
@@ -112,7 +112,7 @@ private:
CFInfo(CFType t, llvm::BasicBlock *bt, llvm::BasicBlock *ct,
llvm::Value *sb, llvm::Value *sc, llvm::Value *sm,
llvm::Value *lm) {
assert(t == Foreach);
Assert(t == Foreach);
type = t;
isUniform = false;
savedBreakTarget = bt;
@@ -171,8 +171,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
internalMaskPointer = AllocaInst(LLVMTypes::MaskType, "internal_mask_memory");
StoreInst(LLVMMaskAllOn, internalMaskPointer);
functionMaskValue = LLVMMaskAllOn;
fullMaskPointer = NULL;
fullMaskPointer = AllocaInst(LLVMTypes::MaskType, "full_mask_memory");
StoreInst(LLVMMaskAllOn, fullMaskPointer);
loopMask = NULL;
breakLanesPtr = continueLanesPtr = NULL;
@@ -194,6 +197,47 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
returnValuePtr = AllocaInst(ftype, "return_value_memory");
}
if (g->opt.disableMaskAllOnOptimizations) {
// This is really disgusting. We want to be able to fool the
// compiler to not be able to reason that the mask is all on, but
// we don't want to pay too much of a price at the start of each
// function to do so.
//
// Therefore: first, we declare a module-static __all_on_mask
// variable that will hold an "all on" mask value. At the start of
// each function, we'll load its value and call SetInternalMaskAnd
// with the result to set the current internal execution mask.
// (This is a no-op at runtime.)
//
// Then, to fool the optimizer that maybe the value of
// __all_on_mask can't be guaranteed to be "all on", we emit a
// dummy function that sets __all_on_mask be "all off". (That
// function is never actually called.)
llvm::Value *globalAllOnMaskPtr =
m->module->getNamedGlobal("__all_on_mask");
if (globalAllOnMaskPtr == NULL) {
globalAllOnMaskPtr =
new llvm::GlobalVariable(*m->module, LLVMTypes::MaskType, false,
llvm::GlobalValue::InternalLinkage,
LLVMMaskAllOn, "__all_on_mask");
char buf[256];
sprintf(buf, "__off_all_on_mask_%s", g->target.GetISAString());
llvm::Constant *offFunc =
m->module->getOrInsertFunction(buf, LLVMTypes::VoidType,
NULL);
Assert(llvm::isa<llvm::Function>(offFunc));
llvm::BasicBlock *offBB =
llvm::BasicBlock::Create(*g->ctx, "entry",
(llvm::Function *)offFunc, 0);
new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
llvm::ReturnInst::Create(*g->ctx, offBB);
}
llvm::Value *allOnMask = LoadInst(globalAllOnMaskPtr, "all_on_mask");
SetInternalMaskAnd(LLVMMaskAllOn, allOnMask);
}
if (m->diBuilder) {
/* If debugging is enabled, tell the debug information emission
code about this new function */
@@ -216,7 +260,7 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
llvm::DIFile file = funcStartPos.GetDIFile();
Symbol *programIndexSymbol = m->symbolTable->LookupVariable("programIndex");
assert(programIndexSymbol && programIndexSymbol->storagePtr);
Assert(programIndexSymbol && programIndexSymbol->storagePtr);
m->diBuilder->createGlobalVariable(programIndexSymbol->name,
file,
funcStartPos.first_line,
@@ -225,7 +269,7 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
programIndexSymbol->storagePtr);
Symbol *programCountSymbol = m->symbolTable->LookupVariable("programCount");
assert(programCountSymbol);
Assert(programCountSymbol);
m->diBuilder->createGlobalVariable(programCountSymbol->name,
file,
funcStartPos.first_line,
@@ -237,8 +281,8 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
FunctionEmitContext::~FunctionEmitContext() {
assert(controlFlowInfo.size() == 0);
assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
Assert(controlFlowInfo.size() == 0);
Assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
}
@@ -268,17 +312,15 @@ FunctionEmitContext::GetFunctionMask() {
llvm::Value *
FunctionEmitContext::GetInternalMask() {
if (VaryingCFDepth() == 0)
return LLVMMaskAllOn;
else
return LoadInst(internalMaskPointer, "load_mask");
return LoadInst(internalMaskPointer, "load_mask");
}
llvm::Value *
FunctionEmitContext::GetFullMask() {
llvm::Value *internalMask = GetInternalMask();
if (internalMask == LLVMMaskAllOn && functionMaskValue == LLVMMaskAllOn)
if (internalMask == LLVMMaskAllOn && functionMaskValue == LLVMMaskAllOn &&
!g->opt.disableMaskAllOnOptimizations)
return LLVMMaskAllOn;
else
return BinaryOperator(llvm::Instruction::And, GetInternalMask(),
@@ -286,16 +328,17 @@ FunctionEmitContext::GetFullMask() {
}
void
FunctionEmitContext::SetMaskPointer(llvm::Value *p) {
fullMaskPointer = p;
llvm::Value *
FunctionEmitContext::GetFullMaskPointer() {
return fullMaskPointer;
}
void
FunctionEmitContext::SetFunctionMask(llvm::Value *value) {
functionMaskValue = value;
StoreInst(GetFullMask(), fullMaskPointer);
if (bblock != NULL)
StoreInst(GetFullMask(), fullMaskPointer);
}
@@ -333,7 +376,7 @@ FunctionEmitContext::SetInternalMaskAndNot(llvm::Value *oldMask, llvm::Value *te
void
FunctionEmitContext::BranchIfMaskAny(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) {
assert(bblock != NULL);
Assert(bblock != NULL);
llvm::Value *any = Any(GetFullMask());
BranchInst(btrue, bfalse, any);
// It's illegal to add any additional instructions to the basic block
@@ -344,7 +387,7 @@ FunctionEmitContext::BranchIfMaskAny(llvm::BasicBlock *btrue, llvm::BasicBlock *
void
FunctionEmitContext::BranchIfMaskAll(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) {
assert(bblock != NULL);
Assert(bblock != NULL);
llvm::Value *all = All(GetFullMask());
BranchInst(btrue, bfalse, all);
// It's illegal to add any additional instructions to the basic block
@@ -355,7 +398,7 @@ FunctionEmitContext::BranchIfMaskAll(llvm::BasicBlock *btrue, llvm::BasicBlock *
void
FunctionEmitContext::BranchIfMaskNone(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) {
assert(bblock != NULL);
Assert(bblock != NULL);
// switch sense of true/false bblocks
BranchIfMaskAny(bfalse, btrue);
// It's illegal to add any additional instructions to the basic block
@@ -379,7 +422,7 @@ FunctionEmitContext::StartVaryingIf(llvm::Value *oldMask) {
void
FunctionEmitContext::EndIf() {
// Make sure we match up with a Start{Uniform,Varying}If().
assert(controlFlowInfo.size() > 0 && controlFlowInfo.back()->IsIf());
Assert(controlFlowInfo.size() > 0 && controlFlowInfo.back()->IsIf());
CFInfo *ci = controlFlowInfo.back();
controlFlowInfo.pop_back();
@@ -458,7 +501,7 @@ FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct,
void
FunctionEmitContext::EndLoop() {
assert(controlFlowInfo.size() && controlFlowInfo.back()->IsLoop());
Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsLoop());
CFInfo *ci = controlFlowInfo.back();
controlFlowInfo.pop_back();
@@ -501,7 +544,7 @@ FunctionEmitContext::StartForeach(llvm::BasicBlock *ct) {
void
FunctionEmitContext::EndForeach() {
assert(controlFlowInfo.size() && controlFlowInfo.back()->IsForeach());
Assert(controlFlowInfo.size() && controlFlowInfo.back()->IsForeach());
CFInfo *ci = controlFlowInfo.back();
controlFlowInfo.pop_back();
@@ -555,7 +598,7 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
// Otherwise we need to update the mask of the lanes that have
// executed a 'break' statement:
// breakLanes = breakLanes | mask
assert(breakLanesPtr != NULL);
Assert(breakLanesPtr != NULL);
llvm::Value *mask = GetInternalMask();
llvm::Value *breakMask = LoadInst(breakLanesPtr,
"break_mask");
@@ -605,7 +648,7 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
else {
// Otherwise update the stored value of which lanes have 'continue'd.
// continueLanes = continueLanes | mask
assert(continueLanesPtr);
Assert(continueLanesPtr);
llvm::Value *mask = GetInternalMask();
llvm::Value *continueMask =
LoadInst(continueLanesPtr, "continue_mask");
@@ -632,7 +675,7 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
*/
bool
FunctionEmitContext::ifsInLoopAllUniform() const {
assert(controlFlowInfo.size() > 0);
Assert(controlFlowInfo.size() > 0);
// Go backwards through controlFlowInfo, since we add new nested scopes
// to the back. Stop once we come to the first enclosing loop.
int i = controlFlowInfo.size() - 1;
@@ -642,7 +685,7 @@ FunctionEmitContext::ifsInLoopAllUniform() const {
return false;
--i;
}
assert(i >= 0); // else we didn't find a loop!
Assert(i >= 0); // else we didn't find a loop!
return true;
}
@@ -650,7 +693,7 @@ FunctionEmitContext::ifsInLoopAllUniform() const {
void
FunctionEmitContext::jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target) {
llvm::Value *allDone = NULL;
assert(continueLanesPtr != NULL);
Assert(continueLanesPtr != NULL);
if (breakLanesPtr == NULL) {
// In a foreach loop, break and return are illegal, and
// breakLanesPtr is NULL. In this case, the mask is guaranteed to
@@ -752,13 +795,19 @@ FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
expr = TypeConvertExpr(expr, returnType, "return statement");
if (expr != NULL) {
llvm::Value *retVal = expr->GetValue(this);
if (retVal != NULL)
// Use a masked store to store the value of the expression
// in the return value memory; this preserves the return
// values from other lanes that may have executed return
// statements previously.
StoreInst(retVal, returnValuePtr, GetInternalMask(),
PointerType::GetUniform(returnType));
if (retVal != NULL) {
if (returnType->IsUniformType() ||
dynamic_cast<const ReferenceType *>(returnType) != NULL)
StoreInst(retVal, returnValuePtr);
else {
// Use a masked store to store the value of the expression
// in the return value memory; this preserves the return
// values from other lanes that may have executed return
// statements previously.
StoreInst(retVal, returnValuePtr, GetInternalMask(),
PointerType::GetUniform(returnType));
}
}
}
}
@@ -827,7 +876,7 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
std::vector<Symbol *> mm;
m->symbolTable->LookupFunction("__movmsk", &mm);
// There should be one with signed int signature, one unsigned int.
assert(mm.size() == 2);
Assert(mm.size() == 2);
// We can actually call either one, since both are i32s as far as
// LLVM's type system is concerned...
llvm::Function *fmm = mm[0]->function;
@@ -876,7 +925,7 @@ FunctionEmitContext::CreateBasicBlock(const char *name) {
llvm::Value *
FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
if (b == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -922,7 +971,7 @@ lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) {
void
FunctionEmitContext::AddInstrumentationPoint(const char *note) {
assert(note != NULL);
Assert(note != NULL);
if (!g->emitInstrumentation)
return;
@@ -990,7 +1039,7 @@ FunctionEmitContext::StartScope() {
void
FunctionEmitContext::EndScope() {
if (m->diBuilder != NULL) {
assert(debugScopes.size() > 0);
Assert(debugScopes.size() > 0);
debugScopes.pop_back();
}
}
@@ -998,7 +1047,7 @@ FunctionEmitContext::EndScope() {
llvm::DIScope
FunctionEmitContext::GetDIScope() const {
assert(debugScopes.size() > 0);
Assert(debugScopes.size() > 0);
return debugScopes.back();
}
@@ -1059,7 +1108,7 @@ lArrayVectorWidth(LLVM_TYPE_CONST llvm::Type *t) {
// to things like FunctionEmitContext::BinaryOperator() as operands.
LLVM_TYPE_CONST llvm::VectorType *vectorElementType =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
assert((vectorElementType != NULL &&
Assert((vectorElementType != NULL &&
(int)vectorElementType->getNumElements() == g->target.vectorWidth));
return (int)arrayType->getNumElements();
@@ -1071,11 +1120,11 @@ FunctionEmitContext::BinaryOperator(llvm::Instruction::BinaryOps inst,
llvm::Value *v0, llvm::Value *v1,
const char *name) {
if (v0 == NULL || v1 == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
assert(v0->getType() == v1->getType());
Assert(v0->getType() == v1->getType());
LLVM_TYPE_CONST llvm::Type *type = v0->getType();
int arraySize = lArrayVectorWidth(type);
if (arraySize == 0) {
@@ -1103,7 +1152,7 @@ FunctionEmitContext::BinaryOperator(llvm::Instruction::BinaryOps inst,
llvm::Value *
FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
if (v == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -1139,12 +1188,12 @@ static LLVM_TYPE_CONST llvm::Type *
lGetMatchingBoolVectorType(LLVM_TYPE_CONST llvm::Type *type) {
LLVM_TYPE_CONST llvm::ArrayType *arrayType =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
assert(arrayType != NULL);
Assert(arrayType != NULL);
LLVM_TYPE_CONST llvm::VectorType *vectorElementType =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
assert(vectorElementType != NULL);
assert((int)vectorElementType->getNumElements() == g->target.vectorWidth);
Assert(vectorElementType != NULL);
Assert((int)vectorElementType->getNumElements() == g->target.vectorWidth);
LLVM_TYPE_CONST llvm::Type *base =
llvm::VectorType::get(LLVMTypes::BoolType, g->target.vectorWidth);
@@ -1158,11 +1207,11 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
llvm::Value *v0, llvm::Value *v1,
const char *name) {
if (v0 == NULL || v1 == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
assert(v0->getType() == v1->getType());
Assert(v0->getType() == v1->getType());
LLVM_TYPE_CONST llvm::Type *type = v0->getType();
int arraySize = lArrayVectorWidth(type);
if (arraySize == 0) {
@@ -1189,7 +1238,7 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
llvm::Value *
FunctionEmitContext::SmearUniform(llvm::Value *value, const char *name) {
if (value == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -1224,7 +1273,7 @@ FunctionEmitContext::BitCastInst(llvm::Value *value,
LLVM_TYPE_CONST llvm::Type *type,
const char *name) {
if (value == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -1238,7 +1287,7 @@ FunctionEmitContext::BitCastInst(llvm::Value *value,
llvm::Value *
FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {
if (value == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -1259,7 +1308,7 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value,
LLVM_TYPE_CONST llvm::Type *toType,
const char *name) {
if (value == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -1272,7 +1321,7 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value,
else if (fromType->getScalarSizeInBits() > toType->getScalarSizeInBits())
return TruncInst(value, toType, "ptr_to_int");
else {
assert(fromType->getScalarSizeInBits() <
Assert(fromType->getScalarSizeInBits() <
toType->getScalarSizeInBits());
return ZExtInst(value, toType, "ptr_to_int");
}
@@ -1290,7 +1339,7 @@ FunctionEmitContext::IntToPtrInst(llvm::Value *value,
LLVM_TYPE_CONST llvm::Type *toType,
const char *name) {
if (value == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -1303,7 +1352,7 @@ FunctionEmitContext::IntToPtrInst(llvm::Value *value,
else if (fromType->getScalarSizeInBits() > toType->getScalarSizeInBits())
return TruncInst(value, toType, "int_to_ptr");
else {
assert(fromType->getScalarSizeInBits() <
Assert(fromType->getScalarSizeInBits() <
toType->getScalarSizeInBits());
return ZExtInst(value, toType, "int_to_ptr");
}
@@ -1320,7 +1369,7 @@ llvm::Instruction *
FunctionEmitContext::TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
const char *name) {
if (value == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -1337,7 +1386,7 @@ llvm::Instruction *
FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
LLVM_TYPE_CONST llvm::Type *type, const char *name) {
if (value == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -1354,7 +1403,7 @@ llvm::Instruction *
FunctionEmitContext::FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
const char *name) {
if (value == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -1371,7 +1420,7 @@ llvm::Instruction *
FunctionEmitContext::SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
const char *name) {
if (value == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -1388,7 +1437,7 @@ llvm::Instruction *
FunctionEmitContext::ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
const char *name) {
if (value == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -1461,7 +1510,7 @@ FunctionEmitContext::applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
// index must be varying for this method to be called.
bool baseIsUniform =
(llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(basePtr->getType()));
assert(baseIsUniform == false || indexIsVarying == true);
Assert(baseIsUniform == false || indexIsVarying == true);
llvm::Value *varyingPtr = baseIsUniform ?
SmearUniform(basePtr, "ptr_smear") : basePtr;
@@ -1474,13 +1523,13 @@ llvm::Value *
FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
const Type *ptrType, const char *name) {
if (basePtr == NULL || index == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
if (dynamic_cast<const ReferenceType *>(ptrType) != NULL)
ptrType = PointerType::GetUniform(ptrType->GetReferenceTarget());
assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
Assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
bool indexIsVaryingType =
llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index->getType());
@@ -1512,13 +1561,13 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
llvm::Value *index1, const Type *ptrType,
const char *name) {
if (basePtr == NULL || index0 == NULL || index1 == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
if (dynamic_cast<const ReferenceType *>(ptrType) != NULL)
ptrType = PointerType::GetUniform(ptrType->GetReferenceTarget());
assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
Assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
bool index0IsVaryingType =
llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index0->getType());
@@ -1551,7 +1600,7 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
// out the type of ptr0.
const Type *baseType = ptrType->GetBaseType();
const SequentialType *st = dynamic_cast<const SequentialType *>(baseType);
assert(st != NULL);
Assert(st != NULL);
bool ptr0IsUniform =
llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(ptr0->getType());
@@ -1586,7 +1635,7 @@ FunctionEmitContext::AddElementOffset(llvm::Value *basePtr, int elementNum,
if (dynamic_cast<const ReferenceType *>(ptrType) != NULL)
ptrType = PointerType::GetUniform(ptrType->GetReferenceTarget());
assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
Assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
// Otherwise do the math to find the offset and add it to the given
// varying pointers
@@ -1598,14 +1647,14 @@ FunctionEmitContext::AddElementOffset(llvm::Value *basePtr, int elementNum,
// us the offset in bytes to the given element of the structure
offset = g->target.StructOffset(st->LLVMType(g->ctx), elementNum);
else {
// Otherwise we should have a vector here and the offset is given
// by the element number times the size of the element type of the
// vector.
const VectorType *vt =
dynamic_cast<const VectorType *>(ptrType->GetBaseType());
assert(vt != NULL);
// Otherwise we should have a vector or array here and the offset
// is given by the element number times the size of the element
// type of the vector.
const SequentialType *st =
dynamic_cast<const SequentialType *>(ptrType->GetBaseType());
Assert(st != NULL);
llvm::Value *size =
g->target.SizeOf(vt->GetElementType()->LLVMType(g->ctx));
g->target.SizeOf(st->GetElementType()->LLVMType(g->ctx));
llvm::Value *scale = (g->target.is32Bit || g->opt.force32BitAddressing) ?
LLVMInt32(elementNum) : LLVMInt64(elementNum);
offset = BinaryOperator(llvm::Instruction::Mul, size, scale);
@@ -1627,13 +1676,13 @@ FunctionEmitContext::AddElementOffset(llvm::Value *basePtr, int elementNum,
llvm::Value *
FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
if (ptr == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
LLVM_TYPE_CONST llvm::PointerType *pt =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(ptr->getType());
assert(pt != NULL);
Assert(pt != NULL);
// FIXME: it's not clear to me that we generate unaligned vector loads
// of varying stuff out of the front-end any more. (Only by the
@@ -1654,16 +1703,16 @@ llvm::Value *
FunctionEmitContext::LoadInst(llvm::Value *ptr, llvm::Value *mask,
const Type *ptrType, const char *name) {
if (ptr == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
assert(ptrType != NULL && mask != NULL);
Assert(ptrType != NULL && mask != NULL);
if (dynamic_cast<const ReferenceType *>(ptrType) != NULL)
ptrType = PointerType::GetUniform(ptrType->GetReferenceTarget());
assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
Assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
if (ptrType->IsUniformType()) {
// FIXME: same issue as above load inst regarding alignment...
@@ -1691,7 +1740,7 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, llvm::Value *mask,
else {
// Otherwise we should have a varying ptr and it's time for a
// gather.
return gather(ptr, ptrType, mask, name);
return gather(ptr, ptrType, GetFullMask(), name);
}
}
@@ -1700,7 +1749,7 @@ llvm::Value *
FunctionEmitContext::gather(llvm::Value *ptr, const Type *ptrType,
llvm::Value *mask, const char *name) {
// We should have a varying lvalue if we get here...
assert(ptrType->IsVaryingType() &&
Assert(ptrType->IsVaryingType() &&
ptr->getType() == LLVMTypes::VoidPointerVectorType);
const Type *returnType = ptrType->GetBaseType()->GetAsVaryingType();
@@ -1749,13 +1798,13 @@ FunctionEmitContext::gather(llvm::Value *ptr, const Type *ptrType,
funcName = g->target.is32Bit ? "__pseudo_gather32_16" :
"__pseudo_gather64_16";
else {
assert(llvmReturnType == LLVMTypes::Int8VectorType);
Assert(llvmReturnType == LLVMTypes::Int8VectorType);
funcName = g->target.is32Bit ? "__pseudo_gather32_8" :
"__pseudo_gather64_8";
}
llvm::Function *gatherFunc = m->module->getFunction(funcName);
assert(gatherFunc != NULL);
Assert(gatherFunc != NULL);
llvm::Value *call = CallInst(gatherFunc, NULL, ptr, mask, name);
@@ -1804,12 +1853,17 @@ llvm::Value *
FunctionEmitContext::AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
const char *name, int align,
bool atEntryBlock) {
if (llvmType == NULL) {
Assert(m->errorCount > 0);
return NULL;
}
llvm::AllocaInst *inst = NULL;
if (atEntryBlock) {
// We usually insert it right before the jump instruction at the
// end of allocaBlock
llvm::Instruction *retInst = allocaBlock->getTerminator();
assert(retInst);
Assert(retInst);
inst = new llvm::AllocaInst(llvmType, name ? name : "", retInst);
}
else
@@ -1845,12 +1899,12 @@ void
FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
const Type *ptrType, llvm::Value *mask) {
if (value == NULL || ptr == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return;
}
assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
assert(ptrType->IsUniformType());
Assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
Assert(ptrType->IsUniformType());
const Type *valueType = ptrType->GetBaseType();
const CollectionType *collectionType =
@@ -1872,7 +1926,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
// We must have a regular atomic, enumerator, or pointer type at this
// point.
assert(dynamic_cast<const AtomicType *>(valueType) != NULL ||
Assert(dynamic_cast<const AtomicType *>(valueType) != NULL ||
dynamic_cast<const EnumType *>(valueType) != NULL ||
dynamic_cast<const PointerType *>(valueType) != NULL);
valueType = valueType->GetAsNonConstType();
@@ -1918,7 +1972,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
ptr = BitCastInst(ptr, LLVMTypes::Int8VectorPointerType,
"ptr_to_int8vecptr");
}
assert(maskedStoreFunc != NULL);
Assert(maskedStoreFunc != NULL);
std::vector<llvm::Value *> args;
args.push_back(ptr);
@@ -1938,13 +1992,13 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
void
FunctionEmitContext::scatter(llvm::Value *value, llvm::Value *ptr,
const Type *ptrType, llvm::Value *mask) {
assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
assert(ptrType->IsVaryingType());
Assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
Assert(ptrType->IsVaryingType());
const Type *valueType = ptrType->GetBaseType();
// I think this should be impossible
assert(dynamic_cast<const ArrayType *>(valueType) == NULL);
Assert(dynamic_cast<const ArrayType *>(valueType) == NULL);
const CollectionType *collectionType = dynamic_cast<const CollectionType *>(valueType);
if (collectionType != NULL) {
@@ -1963,7 +2017,7 @@ FunctionEmitContext::scatter(llvm::Value *value, llvm::Value *ptr,
const PointerType *pt = dynamic_cast<const PointerType *>(valueType);
// And everything should be a pointer or atomic from here on out...
assert(pt != NULL ||
Assert(pt != NULL ||
dynamic_cast<const AtomicType *>(valueType) != NULL);
LLVM_TYPE_CONST llvm::Type *type = value->getType();
@@ -1991,7 +2045,7 @@ FunctionEmitContext::scatter(llvm::Value *value, llvm::Value *ptr,
"__pseudo_scatter64_8";
llvm::Function *scatterFunc = m->module->getFunction(funcName);
assert(scatterFunc != NULL);
Assert(scatterFunc != NULL);
AddInstrumentationPoint("scatter");
@@ -2008,7 +2062,7 @@ void
FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
if (value == NULL || ptr == NULL) {
// may happen due to error elsewhere
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return;
}
@@ -2032,7 +2086,7 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr,
llvm::Value *mask, const Type *ptrType) {
if (value == NULL || ptr == NULL) {
// may happen due to error elsewhere
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return;
}
@@ -2044,7 +2098,7 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr,
if (ptrType->GetBaseType()->IsUniformType())
// the easy case
StoreInst(value, ptr);
else if (mask == LLVMMaskAllOn)
else if (mask == LLVMMaskAllOn && !g->opt.disableMaskAllOnOptimizations)
// Otherwise it is a masked store unless we can determine that the
// mask is all on... (Unclear if this check is actually useful.)
StoreInst(value, ptr);
@@ -2052,10 +2106,10 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr,
maskedStore(value, ptr, ptrType, mask);
}
else {
assert(ptrType->IsVaryingType());
Assert(ptrType->IsVaryingType());
// We have a varying ptr (an array of pointers), so it's time to
// scatter
scatter(value, ptr, ptrType, mask);
scatter(value, ptr, ptrType, GetFullMask());
}
}
@@ -2072,7 +2126,7 @@ FunctionEmitContext::BranchInst(llvm::BasicBlock *trueBlock,
llvm::BasicBlock *falseBlock,
llvm::Value *test) {
if (test == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return;
}
@@ -2085,7 +2139,7 @@ FunctionEmitContext::BranchInst(llvm::BasicBlock *trueBlock,
llvm::Value *
FunctionEmitContext::ExtractInst(llvm::Value *v, int elt, const char *name) {
if (v == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -2105,7 +2159,7 @@ llvm::Value *
FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
const char *name) {
if (v == NULL || eltVal == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -2138,7 +2192,7 @@ llvm::Instruction *
FunctionEmitContext::SelectInst(llvm::Value *test, llvm::Value *val0,
llvm::Value *val1, const char *name) {
if (test == NULL || val0 == NULL || val1 == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -2169,7 +2223,7 @@ lCalleeArgCount(llvm::Value *callee, const FunctionType *funcType) {
ft = llvm::dyn_cast<LLVM_TYPE_CONST llvm::FunctionType>(pt->getElementType());
}
assert(ft != NULL);
Assert(ft != NULL);
return ft->getNumParams();
}
@@ -2179,7 +2233,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
const std::vector<llvm::Value *> &args,
const char *name) {
if (func == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
@@ -2188,7 +2242,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
// isn't the case for things like intrinsics, builtins, and extern "C"
// functions from the application. Add the mask if it's needed.
unsigned int calleeArgCount = lCalleeArgCount(func, funcType);
assert(argVals.size() + 1 == calleeArgCount ||
Assert(argVals.size() + 1 == calleeArgCount ||
argVals.size() == calleeArgCount);
if (argVals.size() + 1 == calleeArgCount)
argVals.push_back(GetFullMask());
@@ -2259,7 +2313,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
llvm::Value *currentMask = LoadInst(maskPtr);
llvm::Function *cttz =
m->module->getFunction("__count_trailing_zeros_i32");
assert(cttz != NULL);
Assert(cttz != NULL);
llvm::Value *firstLane = CallInst(cttz, NULL, LaneMask(currentMask),
"first_lane");
@@ -2306,12 +2360,12 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
// Now, do a masked store into the memory allocated to
// accumulate the result using the call mask.
if (callResult != NULL) {
assert(resultPtr != NULL);
Assert(resultPtr != NULL);
StoreInst(callResult, resultPtr, callMask,
PointerType::GetUniform(returnType));
}
else
assert(resultPtr == NULL);
Assert(resultPtr == NULL);
// Update the mask to turn off the program instances for which
// we just called the function.
@@ -2371,7 +2425,7 @@ FunctionEmitContext::ReturnInst() {
rinst = llvm::ReturnInst::Create(*g->ctx, retVal, bblock);
}
else {
assert(function->GetReturnType() == AtomicType::Void);
Assert(function->GetReturnType() == AtomicType::Void);
rinst = llvm::ReturnInst::Create(*g->ctx, bblock);
}
@@ -2386,25 +2440,25 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
std::vector<llvm::Value *> &argVals,
llvm::Value *launchCount) {
if (callee == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return NULL;
}
launchedTasks = true;
assert(llvm::isa<llvm::Function>(callee));
Assert(llvm::isa<llvm::Function>(callee));
LLVM_TYPE_CONST llvm::Type *argType =
(llvm::dyn_cast<llvm::Function>(callee))->arg_begin()->getType();
assert(llvm::PointerType::classof(argType));
Assert(llvm::PointerType::classof(argType));
LLVM_TYPE_CONST llvm::PointerType *pt =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(argType);
assert(llvm::StructType::classof(pt->getElementType()));
Assert(llvm::StructType::classof(pt->getElementType()));
LLVM_TYPE_CONST llvm::StructType *argStructType =
static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
assert(argStructType->getNumElements() == argVals.size() + 1);
Assert(argStructType->getNumElements() == argVals.size() + 1);
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
assert(falloc != NULL);
Assert(falloc != NULL);
llvm::Value *structSize = g->target.SizeOf(argStructType);
if (structSize->getType() != LLVMTypes::Int64Type)
// ISPCAlloc expects the size as an uint64_t, but on 32-bit
@@ -2439,7 +2493,7 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
// argument block we just filled in
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
assert(flaunch != NULL);
Assert(flaunch != NULL);
std::vector<llvm::Value *> args;
args.push_back(launchGroupHandlePtr);
args.push_back(fptr);
@@ -2488,12 +2542,13 @@ FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,
const Type *ptrType) {
// This should only be called for varying pointers
const PointerType *pt = dynamic_cast<const PointerType *>(ptrType);
assert(pt && pt->IsVaryingType());
Assert(pt && pt->IsVaryingType());
const Type *baseType = ptrType->GetBaseType();
assert(dynamic_cast<const AtomicType *>(baseType) != NULL ||
dynamic_cast<const EnumType *>(baseType) != NULL ||
dynamic_cast<const PointerType *>(baseType));
if (dynamic_cast<const AtomicType *>(baseType) == NULL &&
dynamic_cast<const EnumType *>(baseType) == NULL &&
dynamic_cast<const PointerType *>(baseType) == NULL)
return ptr;
if (baseType->IsUniformType())
return ptr;

ctx.h (6 changed lines)
View File

@@ -98,9 +98,9 @@ public:
the function entry mask and the internal mask. */
llvm::Value *GetFullMask();
/** Provides the alloca'd pointer to memory to store the full function
mask. This is only used to wire up the __mask builtin variable. */
void SetMaskPointer(llvm::Value *p);
/** Returns a pointer to storage in memory that stores the current full
mask. */
llvm::Value *GetFullMaskPointer();
/** Provides the value of the mask at function entry */
void SetFunctionMask(llvm::Value *val);

View File

@@ -216,7 +216,7 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
return NULL;
Symbol *declSym = GetSymbol();
assert(declSym != NULL);
Assert(declSym != NULL);
// Get the symbol for the function from the symbol table. (It should
// already have been added to the symbol table by AddGlobal() by the
@@ -232,11 +232,11 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
Declarator *d = this;
while (d != NULL && d->kind != DK_FUNCTION)
d = d->child;
assert(d != NULL);
Assert(d != NULL);
for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
Declaration *pdecl = d->functionParams[i];
assert(pdecl->declarators.size() == 1);
Assert(pdecl->declarators.size() == 1);
funArgs->push_back(pdecl->declarators[0]->GetSymbol());
}
@@ -263,8 +263,8 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
case DK_BASE:
// All of the type qualifiers should be in the DeclSpecs for the
// base declarator
assert(typeQualifiers == 0);
assert(child == NULL);
Assert(typeQualifiers == 0);
Assert(child == NULL);
return type;
case DK_POINTER:
@@ -376,7 +376,7 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
// it lives down to the base declarator.
Declarator *decl = d->declarators[0];
while (decl->child != NULL) {
assert(decl->initExpr == NULL);
Assert(decl->initExpr == NULL);
decl = decl->child;
}
@@ -485,7 +485,7 @@ Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
std::vector<VariableDeclaration>
Declaration::GetVariableDeclarations() const {
assert(declSpecs->storageClass != SC_TYPEDEF);
Assert(declSpecs->storageClass != SC_TYPEDEF);
std::vector<VariableDeclaration> vars;
for (unsigned int i = 0; i < declarators.size(); ++i) {

View File

@@ -1,3 +1,33 @@
=== v1.1.1 === (15 December 2011)
This release doesn't include any significant new functionality, but does
include small improvements in generated code and a number of bug fixes.
The one user-visible language change is that integer constants may be
specified with 'u' and 'l' suffixes, like in C. For example, "1024llu"
defines the constant with unsigned 64-bit type.
More informative and useful error messages are printed when function
overload resolution fails.
Masking is avoided in additional cases when the mask can be
statically-determined to be all on.
A number of small bugs have been fixed:
- Under some circumstances, incorrect masks were used when assigning a
value to a reference and when doing gathers/scatters.
- Incorrect code could be generated in some cases when some instances
returned part way through a function but others continued executing.
- Type checking wasn't being performed for calls through function pointers;
now an error is issued if the arguments don't match up, etc.
- Incorrect code was being generated for gather/scatter to structs that had
elements with varying short-vector types.
- Typechecking wasn't being performed for "foreach" statements; this led to
problems like function overload resolution not being performed if an
overloaded function call was used to determine the iteration range.
- A number of symbols would be multiply-defined when compiling to multiple
targets and using the sse2-x2 target as one of them (issue #131).
=== v1.1.0 === (5 December 2011)
This is a major new release of the compiler, with significant additions to

View File

@@ -273,10 +273,10 @@ Then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
``foo_avx.o``, and ``foo.o``.[#]_ Link all of these into your executable, and
when you call a function in ``foo.ispc`` from your application code,
``ispc`` will determine which instruction sets are supported by the CPU the
code is running on and will call the most appropraite version of the
code is running on and will call the most appropriate version of the
function available.
.. [#] Similarly, if you choose to generate assembly langauage output or
.. [#] Similarly, if you choose to generate assembly language output or
LLVM bitcode output, multiple versions of those files will be created.
In general, the version of the function that runs will be the one in the

View File

@@ -26,9 +26,9 @@ The main goals behind ``ispc`` are to:
units without the extremely low-programmer-productivity activity of directly
writing intrinsics.
* Explore opportunities from close-coupling between C/C++ application code
and SPMD ``ispc`` code running on the same processor--lightweight funcion
calls betwen the two languages, sharing data directly via pointers without
copying or reformating, etc.
and SPMD ``ispc`` code running on the same processor--lightweight function
calls between the two languages, sharing data directly via pointers without
copying or reformatting, etc.
**We are very interested in your feedback and comments about ispc and
in hearing your experiences using the system. We are especially interested
@@ -146,8 +146,6 @@ Contents:
+ `Restructuring Existing Programs to Use ISPC`_
+ `Understanding How to Interoperate With the Application's Data`_
* `Related Languages`_
* `Disclaimer and Legal Information`_
* `Optimization Notice`_
@@ -251,7 +249,7 @@ of the value.
The first thing to notice in this program is the presence of the ``export``
keyword in the function definition; this indicates that the function should
be made available to be called from application code. The ``uniform``
qualifiers on the parameters to ``simple`` indicate that the correpsonding
qualifiers on the parameters to ``simple`` indicate that the corresponding
variables are non-vector quantities--this concept is discussed in detail in the
`"uniform" and "varying" Qualifiers`_ section.
@@ -323,7 +321,7 @@ When the executable ``simple`` runs, it generates the expected output:
...
For a slightly more complex example of using ``ispc``, see the `Mandelbrot
set example`_ page on the ``ispc`` website for a walkthrough of an ``ispc``
set example`_ page on the ``ispc`` website for a walk-through of an ``ispc``
implementation of that algorithm. After reading through that example, you
may want to examine the source code of the various examples in the
``examples/`` directory of the ``ispc`` distribution.
@@ -374,7 +372,7 @@ Optimizations are on by default; they can be turned off with ``-O0``:
On Mac\* and Linux\*, there is basic support for generating debugging
symbols; this is enabled with the ``-g`` command-line flag. Using ``-g``
causes optimizations to be disabled; to compile with debugging symbols and
optimizaion, ``-O1`` should be provided as well as the ``-g`` flag.
optimization, ``-O1`` should be provided as well as the ``-g`` flag.
The ``-h`` flag can also be used to direct ``ispc`` to generate a C/C++
header file that includes C/C++ declarations of the C-callable ``ispc``
@@ -402,7 +400,7 @@ which sets the target architecture, ``--cpu``, which sets the target CPU,
and ``--target``, which sets the target instruction set.
By default, the ``ispc`` compiler generates code for the 64-bit x86-64
architecture (i.e. ``--arch=x86-64`.) To compile to a 32-bit x86 target,
architecture (i.e. ``--arch=x86-64``.) To compile to a 32-bit x86 target,
supply ``--arch=x86`` on the command line:
::
@@ -473,6 +471,9 @@ preprocessor runs:
* - ISPC_TARGET_{SSE2,SSE4,AVX}
- 1
- One of these will be set, depending on the compilation target.
* - ISPC_POINTER_SIZE
- 32 or 64
- Number of bits used to represent a pointer for the target architecture.
* - ISPC_MAJOR_VERSION
- 1
- Major version of the ``ispc`` compiler/language
@@ -525,8 +526,8 @@ Basic Concepts: Program Instances and Gangs of Program Instances
Upon entry to a ``ispc`` function called from C/C++ code, the execution
model switches from the application's serial model to ``ispc``'s execution
model. Conceptually, a number of ``ispc`` *program instances* start
running in concurrently. The group of running program instances is a
called *gang* (harkening to "gang scheduling", since ``ispc`` provides
running concurrently. The group of running program instances is a
called a *gang* (harkening to "gang scheduling", since ``ispc`` provides
certain guarantees about the control flow coherence of program instances
running in a gang, detailed in `Gang Convergence Guarantees`_.) An
``ispc`` program instance is thus similar to a CUDA* "thread" or an OpenCL*
@@ -609,7 +610,7 @@ side-effects.
Upon entry to an ``ispc`` function called by the application, the execution
mask is "all on" and the program counter points at the first statement in
the function. The following two statments describe the required behavior
the function. The following two statements describe the required behavior
of the program counter and the execution mask over the course of execution
of an ``ispc`` function.
@@ -730,7 +731,7 @@ program instances is *maximally converged*. Maximal convergence means that
if two program instances follow the same control path, they are guaranteed
to execute each program statement concurrently. If two program instances
follow diverging control paths, it is guaranteed that they will reconverge
as soon as possible (if they do later reconverge). [#]_
as soon as possible in the function (if they do later reconverge). [#]_
.. [#] This is another significant difference between the ``ispc``
execution model and the one implemented by OpenCL* and CUDA*, which
@@ -754,9 +755,25 @@ It is guaranteed that all program instances that were running before the
for the gang of program instances, rather than the concept of a unique
program counter for each program instance.)
Another implication of this property is that it is illegal to execute a
function with an 8-wide gang by running it two times, with a 4-wide gang
representing half of the original 8-wide gang each time.
Another implication of this property is that it would be illegal for the
``ispc`` implementation to execute a function with an 8-wide gang by
running it two times, with a 4-wide gang representing half of the original
8-wide gang each time.
It also follows that given the following program:
::
if (programIndex == 0) {
while (true) // infinite loop
;
}
print("hello, world\n");
the program will loop infinitely and the ``print`` statement will never be
executed. (A different execution model that allowed gang divergence might
execute the ``print`` statement since not all program instances were caught
in the infinite loop in the example above.)
The way that "varying" function pointers are handled in ``ispc`` is also
affected by this guarantee: if a function pointer is ``varying``, then it
@@ -802,7 +819,7 @@ of control flow, will say that control flow based on ``varying``
expressions is "varying" control flow.)
Consider for example an image filtering operation where the program loops
over pixels adjacent to the given (x,y) coordiantes:
over pixels adjacent to the given (x,y) coordinates:
::
@@ -902,7 +919,7 @@ for all program instances in the gang, it's possible that the "true" clause
executed with an "all off" mask and ``b`` was modified there.
If it is important that code never be executed with an "all off" execution
mask, then the ``cif`` statment (documented in the `"Coherent" Control Flow
mask, then the ``cif`` statement (documented in the `"Coherent" Control Flow
Statements: "cif" and Friends`_ section) can be used in place of a regular
``if``, as it guarantees this property.
@@ -973,6 +990,20 @@ which of them will write their value of ``value`` to ``array[index]``.
array[index] = value;
}
As another example, if the values of the array indices ``i`` and ``j`` have
the same values for some of the program instances, and an assignment like
the following is performed:
::
int i = ..., j = ...;
uniform int array[...] = { ... };
array[i] = array[j];
then the program's behavior is undefined, since there is no sequence point
between the reads and writes to the same location.
While this rule that says that program instances can safely depend on
side-effects from by other program instances in their gang eliminates a
class of synchronization requirements imposed by some other SPMD languages,
@@ -1014,7 +1045,7 @@ completed.
The ISPC Language
=================
``ispc`` is an extended verion of the C programming language, providing a
``ispc`` is an extended version of the C programming language, providing a
number of new features that make it easy to write high-performance SPMD
programs for the CPU. Note that between not only the few small syntactic
differences between ``ispc`` and C code but more importantly ``ispc``'s
@@ -1035,12 +1066,12 @@ This subsection summarizes the differences between ``ispc`` and C; if you
are already familiar with C, you may find it most effective to focus on
this subsection and just focus on the topics in the remainder of section
that introduce new language features. You may also find it helpful to
comapre the ``ispc`` and C++ implementations of various algorithms in the
compare the ``ispc`` and C++ implementations of various algorithms in the
``ispc`` ``examples/`` directory to get a sense of the close relationship
between ``ispc`` and C.
Specifically, C89 is used as the baseline for comparison in this subsection
(this is also the verion of C described in the Second Edition of Kernighan
(this is also the version of C described in the Second Edition of Kernighan
and Ritchie's book). (``ispc`` adopts some features from C99 and from C++,
which will be highlighted in the below.)
@@ -1068,7 +1099,7 @@ in C:
statement itself (e.g. ``for (int i = 0; ...``)
* The ``inline`` qualifier to indicate that a function should be inlined
* Function overloading by parameter type
* Hexidecimal floating-point constants
* Hexadecimal floating-point constants
``ispc`` also adds a number of new features that aren't in C89, C99, or
C++:
@@ -1127,11 +1158,11 @@ The following reserved words from C89 are also reserved in ``ispc``:
Lexical Structure
-----------------
Tokens in ``ispc`` are delimted by white-space and comments. The
Tokens in ``ispc`` are delimited by white-space and comments. The
white-space characters are the usual set of spaces, tabs, and carriage
returns/line feeds. Comments can be delinated with ``//``, which starts a
returns/line feeds. Comments can be delineated with ``//``, which starts a
comment that continues to the end of the line, or the start of a comment
can be delinated with ``/*`` and the end with ``*/``. Like C/C++,
can be delineated with ``/*`` and the end with ``*/``. Like C/C++,
comments can't be nested.
Identifiers in ``ispc`` are sequences of characters that start with an
@@ -1139,9 +1170,9 @@ underscore or an upper-case or lower-case letter, and then followed by
zero or more letters, numbers, or underscores. Identifiers that start with
two underscores are reserved for use by the compiler.
Integer numeric constants can be specified in base 10, hexidecimal, or
Integer numeric constants can be specified in base 10, hexadecimal, or
binary. (Octal integer constants aren't supported). Base 10 constants are
given by a sequence of one or more digits from 0 to 9. Hexidecimal
given by a sequence of one or more digits from 0 to 9. Hexadecimal
constants are denoted by a leading ``0x`` and then one or more digits from
0-9, a-f, or A-F. Finally, binary constants are denoted by a leading
``0b`` and then a sequence of 1s and 0s.
@@ -1163,11 +1194,11 @@ The second option is scientific notation, where a base value is specified
as the first form of a floating-point constant but is then followed by an
"e" or "E", then a plus sign or a minus sign, and then an exponent.
Finally, floating-point constants may be specified as hexidecimal
Finally, floating-point constants may be specified as hexadecimal
constants; this form can ensure a perfectly bit-accurate representation of
a particular floating-point number. These are specified with an "0x"
prefix, followed by a zero or a one, a period, and then the remainder of
the mantissa in hexidecimal form, with digits from 0-9, a-f, or A-F. The
the mantissa in hexadecimal form, with digits from 0-9, a-f, or A-F. The
start of the exponent is denoted by a "p", which is then followed by an
optional plus or minus sign and then digits from 0 to 9. For example:
@@ -1204,7 +1235,7 @@ to specify special characters. These sequences all start with an initial
* - ``\n``
- newline
* - ``\r``
- carriabe return
- carriage return
* - ``\t``
- horizontal tab
* - ``\v``
@@ -1212,7 +1243,7 @@ to specify special characters. These sequences all start with an initial
* - ``\`` followed by one or more digits from 0-7
- ASCII character in octal notation
* - ``\x``, followed by one or more digits from 0-9, a-f, A-F
- ASCII character in hexidecimal notation
- ASCII character in hexadecimal notation
``ispc`` doesn't support a string data type; string constants can be passed
as the first argument to the ``print()`` statement, however. ``ispc`` also
@@ -1367,7 +1398,7 @@ store are:
uniform float bar[10];
The first declaration corresponds to 10 gang-wide ``float`` values in
memory, while the second declaration corresonds to 10 ``float`` values.
memory, while the second declaration corresponds to 10 ``float`` values.
Defining New Names For Types
@@ -1531,7 +1562,7 @@ instance in the gang has its own unique pointer value)
(The rationale for this limitation is that references must be represented
as either a uniform pointer or a varying pointer internally. While
choosing a varying pointer would provide maximum flexibilty and eliminate
choosing a varying pointer would provide maximum flexibility and eliminate
this restriction, it would reduce performance in the common case where a
uniform pointer is all that's needed. As a work-around, a varying pointer
can be used in cases where a varying lvalue reference would be desired.)
@@ -1554,7 +1585,7 @@ and then a brace-delimited list of enumerators with optional values:
Each ``enum`` declaration defines a new type; an attempt to implicitly
convert between enumerations of different types gives a compile-time error,
but enuemrations of different types can be explicitly cast to one other.
but enumerations of different types can be explicitly cast to one other.
::
@@ -1564,7 +1595,7 @@ Enumerators are implicitly converted to integer types, however, so they can
be directly passed to routines that take integer parameters and can be used
in expressions including integers, for example. However, the integer
result of such an expression must be explicitly cast back to the enumerant
type if it to be assigned to a variable with the enuemrant type.
type if it to be assigned to a variable with the enumerant type.
::
@@ -1815,7 +1846,7 @@ Structures can also be initialized by providing element values in braces:
....
Color d = { 0.5, .75, 1.0 }; // r = 0.5, ...
Arrays of structures and arrays inside structures can be initialzed with
Arrays of structures and arrays inside structures can be initialized with
the expected syntax:
::
@@ -1849,7 +1880,7 @@ Structure member access and array indexing also work as in C.
return foo.f[4] - foo.i;
The address-of operator, pointer derefernce operator, and pointer member
The address-of operator, pointer dereference operator, and pointer member
operator also work as expected.
::
@@ -1894,7 +1925,7 @@ Basic Iteration Statements: "for", "while", and "do"
``ispc`` supports ``for``, ``while``, and ``do`` loops, with the same
specification as in C. Like C++, variables can be declared in the ``for``
statment itself:
statement itself:
::
@@ -1978,7 +2009,7 @@ nested inside a ``foreach`` loop.) ``continue`` statements are legal in
a program instance that executes a ``continue`` statement effectively
skips over the rest of the loop body for the current iteration.
As a specific example, consdier the following ``foreach`` statement:
As a specific example, consider the following ``foreach`` statement:
::
@@ -2076,7 +2107,7 @@ some computation on an array of data.
}
Here, we've written a loop that explicitly loops over the data in chunks of
``programCount`` elements. In each loop iteraton, the running program
``programCount`` elements. In each loop iteration, the running program
instances effectively collude amongst themselves using ``programIndex`` to
determine which elements to work on in a way that ensures that all of the
data elements will be processed. In this particular case, a ``foreach``
@@ -2282,7 +2313,7 @@ distributions.
If you are implementing your own task system, the remainder of this section
discusses the requirements for these calls. You will also likely want to
review the example task systems in ``examples/tasksys.cpp`` for reference.
If you are not implmenting your own task system, you can skip reading the
If you are not implementing your own task system, you can skip reading the
remainder of this section.
Here are the declarations of the three functions that must be provided to
@@ -2302,7 +2333,7 @@ implementation can efficiently wait for completion on just the tasks
launched from a single function.
The first time one of ``ISPCLaunch()`` or ``ISPCAlloc()`` is called in an
``ispc`` functon, the ``void *`` pointed to by the ``handlePtr`` parameter
``ispc`` function, the ``void *`` pointed to by the ``handlePtr`` parameter
will be ``NULL``. The implementations of these function should then
initialize ``*handlePtr`` to a unique handle value of some sort. (For
example, it might allocate a small structure to record which tasks were
@@ -2318,14 +2349,14 @@ than a pointer to it, as in the other functions.
The ``ISPCAlloc()`` function is used to allocate small blocks of memory to
store parameters passed to tasks. It should return a pointer to memory
with the given aize and alignment. Note that there is no explicit
with the given size and alignment. Note that there is no explicit
``ISPCFree()`` call; instead, all memory allocated within an ``ispc``
function should be freed when ``ISPCSync()`` is called.
``ISPCLaunch()`` is called to launch to launch one or more asynchronous
tasks. Each ``launch`` statement in ``ispc`` code causes a call to
``ISPCLaunch()`` to be emitted in the generated code. The three parameters
after the handle pointer to thie function are relatively straightforward;
after the handle pointer to the function are relatively straightforward;
the ``void *f`` parameter holds a pointer to a function to call to run the
work for this task, ``data`` holds a pointer to data to pass to this
function, and ``count`` is the number of instances of this function to
@@ -2340,7 +2371,7 @@ The signature of the provided function pointer ``f`` is
int taskIndex, int taskCount)
When this function pointer is called by one of the hardware threads managed
bythe task system, the ``data`` pointer passed to ``ISPCLaunch()`` should
by the task system, the ``data`` pointer passed to ``ISPCLaunch()`` should
be passed to it for its first parameter; ``threadCount`` gives the total
number of hardware threads that have been spawned to run tasks and
``threadIndex`` should be an integer index between zero and ``threadCount``
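To make this contract concrete, below is a minimal single-threaded sketch of the three entry points in C++. It mirrors the serial implementation used by the test harness shown later on this page (tasks are run immediately inside ``ISPCLaunch()``); the ``TaskHandle`` type, the ``TaskFuncType`` name, and the exact task-function parameter order are illustrative assumptions inferred from that harness, not a drop-in replacement for ``examples/tasksys.cpp``.

::

    #include <stdint.h>
    #include <stdlib.h>
    #include <vector>

    struct TaskHandle {                        // hypothetical per-function bookkeeping
        std::vector<void *> allocations;       // memory handed out by ISPCAlloc()
    };

    // Assumed task-function signature, matching how the serial test harness
    // later on this page invokes launched tasks.
    typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
                                 int taskIndex, int taskCount);

    extern "C" void ISPCLaunch(void **handlePtr, void *f, void *data, int32_t count) {
        if (*handlePtr == NULL)                // first launch/alloc from this function
            *handlePtr = new TaskHandle;
        TaskFuncType func = (TaskFuncType)f;
        for (int i = 0; i < count; ++i)        // serial: run each task instance now
            func(data, /*threadIndex=*/0, /*threadCount=*/1, i, count);
    }

    extern "C" void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment) {
        if (*handlePtr == NULL)
            *handlePtr = new TaskHandle;
        TaskHandle *h = (TaskHandle *)*handlePtr;
        void *mem = malloc((size_t)size + alignment);   // over-allocate, then align
        h->allocations.push_back(mem);
        uintptr_t aligned = ((uintptr_t)mem + alignment - 1) & ~(uintptr_t)(alignment - 1);
        return (void *)aligned;
    }

    extern "C" void ISPCSync(void *handle) {
        // All tasks already ran inline, so only the cleanup obligation remains:
        // free everything allocated via ISPCAlloc() since the function started.
        TaskHandle *h = (TaskHandle *)handle;
        if (h == NULL)
            return;
        for (size_t i = 0; i < h->allocations.size(); ++i)
            free(h->allocations[i]);
        delete h;
    }

A real task system would instead enqueue the ``(f, data, count)`` triple in ``ISPCLaunch()``, run the tasks on worker threads, and have ``ISPCSync()`` block until the tasks launched from that particular function have completed.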
@@ -2659,7 +2690,7 @@ generates the following output on a four-wide compilation target:
When a varying variable is printed, the values for program instances that
aren't currently executing are printed inside double parenthesis,
indicating inactive program instances. The elements for inactive program
instances may have garabge values, though in some circumstances it can be
instances may have garbage values, though in some circumstances it can be
useful to see their values.
Assertions
@@ -2879,7 +2910,7 @@ If called when none of the program instances are running,
There are also a number of functions to compute "scan"s of values across
the program instances. For example, the ``exclusive_scan_add()`` function
computes, for each program instance, the sum of the given value over all of
the preceeding program instances. (The scans currently available in
the preceding program instances. (The scans currently available in
``ispc`` are all so-called "exclusive" scans, meaning that the value
computed for a given element does not include the value provided for that
element.) In C code, an exclusive add scan over an array might be
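written roughly as follows (a hedged sketch in plain C/C++; the exact snippet in the full documentation may differ, and the function name is only illustrative)::

    // Exclusive add scan: out[i] holds the sum of in[0..i-1], so out[0] == 0
    // and the value computed for element i does not include in[i] itself.
    void exclusive_scan_add_serial(const float *in, float *out, int count) {
        float sum = 0.f;
        for (int i = 0; i < count; ++i) {
            out[i] = sum;
            sum += in[i];
        }
    }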
@@ -3175,7 +3206,7 @@ rather than one per program instance.
uniform int32 newval)
Be careful that you use the atomic function that you mean to; consider the
folloiwng code:
following code:
::
@@ -3532,7 +3563,7 @@ Restructuring Existing Programs to Use ISPC
``ispc`` is designed to enable you to incorporate
SPMD parallelism into existing code with minimal modification; features
like the ability to share memory and data structures betwen C/C++ and
like the ability to share memory and data structures between C/C++ and
``ispc`` code and the ability to directly call back and forth between
``ispc`` and C/C++ are motivated by this. These features also make it
easy to incrementally transform a program to use ``ispc``; the most
@@ -3708,12 +3739,6 @@ elements to work with and then proceeds with the computation.
}
Related Languages
=================
TODO: rsl, C*, IVL
Disclaimer and Legal Information
================================

View File

@@ -22,8 +22,8 @@ also included in the ``examples/`` directory.)
- ``ispc``, 1 core
- ``ispc``, 4 cores
* - `AOBench`_ (512 x 512 resolution)
- 3.99x
- 19.32x
- 6.19x
- 28.06x
* - `Binomial Options`_ (128k options)
- 7.94x
- 33.43x
@@ -31,23 +31,23 @@ also included in the ``examples/`` directory.)
- 8.45x
- 32.48x
* - `Deferred Shading`_ (1280p)
- n/a
- 5.02x
- 23.06x
* - `Mandelbrot Set`_
- 6.21x
- 19.90x
- 20.28x
* - `Perlin Noise Function`_
- 5.37x
- n/a
* - `Ray Tracer`_ (Sponza dataset)
- 3.99x
- 19.32x
- 4.31x
- 20.29x
* - `3D Stencil`_
- 3.76x
- 13.79x
- 4.05x
- 15.53x
* - `Volume Rendering`_
- 3.11x
- 15.80x
- 3.60x
- 17.53x
.. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench

View File

@@ -64,7 +64,7 @@ on each one:
Depending on the specifics of the computation being performed, the code
generated for this function could likely be improved by modifying the code
so that the loop only goes as far through the data as is possible to pack
an entire gang of program instances with computation each time thorugh the
an entire gang of program instances with computation each time through the
loop. Doing so enables the ``ispc`` compiler to generate more efficient
code for cases where it knows that the execution mask is "all on". Then,
an ``if`` statement at the end handles processing the ragged extra bits of
@@ -153,7 +153,7 @@ processed, and so forth.
Performance benefit can come from using ``foreach_tiled`` in that it
essentially optimizes for the benefit of iterating over *compact* regions
of the domian (while ``foreach`` iterates over the domain in a way that
of the domain (while ``foreach`` iterates over the domain in a way that
generally allows linear memory access.) There are two benefits from
processing compact regions of the domain.
@@ -215,7 +215,7 @@ Use "uniform" Whenever Appropriate
----------------------------------
For any variable that will always have the same value across all of the
program instances in a gang, declare the variable with the ``unfiorm``
program instances in a gang, declare the variable with the ``uniform``
qualifier. Doing so enables the ``ispc`` compiler to emit better code in
many different ways.
@@ -229,7 +229,7 @@ number of iterations:
If this is written with ``i`` as a ``varying`` variable, as above, there's
additional overhead in the code generated for the loop as the compiler
emits instructions to handle the possibilty of not all program instances
emits instructions to handle the possibility of not all program instances
following the same control flow path (as might be the case if the loop
limit, 10, was itself a ``varying`` value.)
@@ -568,7 +568,7 @@ mask of all lanes currently executing (assuming a four-wide gang size
target machine).
For a fuller example of the utility of this functionality, see
``examples/aobench_instrumented`` in the ``ispc`` distribution. Ths
``examples/aobench_instrumented`` in the ``ispc`` distribution. This
example includes an implementation of the ``ISPCInstrument()`` function
that collects aggregate data about the program's execution behavior.

View File

@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER = 1.1.0
PROJECT_NUMBER = 1.1.1
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.

View File

@@ -49,17 +49,16 @@ mandel(float c_re, float c_im, int count) {
}
/* Task to compute the Mandelbrot iterations for a span of scanlines from
[ystart,yend).
/* Task to compute the Mandelbrot iterations for a single scanline.
*/
task void
mandelbrot_scanlines(uniform int ybase, uniform int span,
uniform float x0, uniform float dx,
uniform float y0, uniform float dy,
uniform int width, uniform int maxIterations,
uniform int output[]) {
uniform int ystart = ybase + taskIndex * span;
uniform int yend = ystart + span;
mandelbrot_scanline(uniform float x0, uniform float dx,
uniform float y0, uniform float dy,
uniform int width, uniform int height,
uniform int span,
uniform int maxIterations, uniform int output[]) {
uniform int ystart = taskIndex * span;
uniform int yend = min((taskIndex+1) * span, (unsigned int)height);
foreach (yi = ystart ... yend, xi = 0 ... width) {
float x = x0 + xi * dx;
@@ -71,20 +70,6 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
}
task void
mandelbrot_chunk(uniform float x0, uniform float dx,
uniform float y0, uniform float dy,
uniform int width, uniform int height,
uniform int maxIterations, uniform int output[]) {
uniform int ystart = taskIndex * (height/taskCount);
uniform int yend = (taskIndex+1) * (height/taskCount);
uniform int span = 1;
launch[(yend-ystart)/span] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
width, maxIterations, output) >;
}
export void
mandelbrot_ispc(uniform float x0, uniform float y0,
uniform float x1, uniform float y1,
@@ -92,7 +77,8 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
uniform int maxIterations, uniform int output[]) {
uniform float dx = (x1 - x0) / width;
uniform float dy = (y1 - y0) / height;
uniform int span = 4;
launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height,
maxIterations, output) >;
launch[height/span] < mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
maxIterations, output) >;
}

View File

@@ -8,7 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
OBJS=objs/stencil.o objs/stencil_serial.o $(TASK_OBJ) objs/stencil_ispc.o \
objs/stencil_ispc_sse2.o objs/stencil_ispc_sse4.o \

View File

@@ -41,27 +41,23 @@ stencil_step(uniform int x0, uniform int x1,
uniform const float Ain[], uniform float Aout[]) {
const uniform int Nxy = Nx * Ny;
for (uniform int z = z0; z < z1; ++z) {
for (uniform int y = y0; y < y1; ++y) {
foreach (x = x0 ... x1) {
int index = (z * Nxy) + (y * Nx) + x;
foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
float div = coef[0] * A_cur(0, 0, 0) +
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
A_cur(0, 0, +3) + A_cur(0, 0, -3));
float div = coef[0] * A_cur(0, 0, 0) +
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
A_cur(0, 0, +3) + A_cur(0, 0, -3));
A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
vsq[index] * div;
}
}
A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
vsq[index] * div;
}
}
@@ -69,11 +65,12 @@ stencil_step(uniform int x0, uniform int x1,
static task void
stencil_step_task(uniform int x0, uniform int x1,
uniform int y0, uniform int y1,
uniform int z0, uniform int z1,
uniform int z0,
uniform int Nx, uniform int Ny, uniform int Nz,
uniform const float coef[4], uniform const float vsq[],
uniform const float Ain[], uniform float Aout[]) {
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Ain, Aout);
stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,
Nx, Ny, Nz, coef, vsq, Ain, Aout);
}
@@ -89,17 +86,14 @@ loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
{
for (uniform int t = t0; t < t1; ++t) {
// Parallelize across cores as well: each task will work on a slice
// of "dz" in the z extent of the volume. (dz=1 seems to work
// better than any larger values.)
uniform int dz = 1;
for (uniform int z = z0; z < z1; z += dz) {
if ((t & 1) == 0)
launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd) >;
else
launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven) >;
}
// of 1 in the z extent of the volume.
if ((t & 1) == 0)
launch[z1-z0] < stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd) >;
else
launch[z1-z0] < stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven) >;
// We need to wait for all of the launched tasks to finish before
// starting the next iteration.
sync;

View File

@@ -8,10 +8,10 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64
ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \
objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o
objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o objs/volume_ispc_avx.o
default: volume
@@ -34,5 +34,5 @@ objs/%.o: ../%.cpp
objs/volume.o: objs/volume_ispc.h
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o: %.ispc
objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h

View File

@@ -124,24 +124,13 @@ static inline float D(int x, int y, int z, uniform int nVoxels[3],
}
static inline float Du(uniform int x, uniform int y, uniform int z,
uniform int nVoxels[3], uniform float density[]) {
x = clamp(x, 0, nVoxels[0]-1);
y = clamp(y, 0, nVoxels[1]-1);
z = clamp(z, 0, nVoxels[2]-1);
return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
}
static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
return (p - pMin) / (pMax - pMin);
}
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
uniform float density[], uniform int nVoxels[3],
uniform bool &checkForSameVoxel) {
uniform float density[], uniform int nVoxels[3]) {
if (!Inside(Pobj, pMin, pMax))
return 0;
// Compute voxel coordinates and offsets for _Pobj_
@@ -153,39 +142,14 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
// Trilinearly interpolate density values to compute local density
float d00, d10, d01, d11;
uniform int uvx, uvy, uvz;
if (checkForSameVoxel && reduce_equal(vx, &uvx) && reduce_equal(vy, &uvy) &&
reduce_equal(vz, &uvz)) {
// If all of the program instances are inside the same voxel, then
// we'll call the 'uniform' variant of the voxel density lookup
// function, thus doing a single load for each value rather than a
// gather.
d00 = Lerp(dx, Du(uvx, uvy, uvz, nVoxels, density),
Du(uvx+1, uvy, uvz, nVoxels, density));
d10 = Lerp(dx, Du(uvx, uvy+1, uvz, nVoxels, density),
Du(uvx+1, uvy+1, uvz, nVoxels, density));
d01 = Lerp(dx, Du(uvx, uvy, uvz+1, nVoxels, density),
Du(uvx+1, uvy, uvz+1, nVoxels, density));
d11 = Lerp(dx, Du(uvx, uvy+1, uvz+1, nVoxels, density),
Du(uvx+1, uvy+1, uvz+1, nVoxels, density));
}
else {
// Otherwise, we have to do an actual gather in the more general
// D() function. Once the reduce_equal tests above fail, we stop
// checking in subsequent steps, since it's unlikely that this will
// be true in the future once they've diverged into different
// voxels.
checkForSameVoxel = false;
d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
D(vx+1, vy, vz, nVoxels, density));
d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
D(vx+1, vy+1, vz, nVoxels, density));
d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
D(vx+1, vy, vz+1, nVoxels, density));
d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
D(vx+1, vy+1, vz+1, nVoxels, density));
}
float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
D(vx+1, vy, vz, nVoxels, density));
float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
D(vx+1, vy+1, vz, nVoxels, density));
float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
D(vx+1, vy, vz+1, nVoxels, density));
float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
D(vx+1, vy+1, vz+1, nVoxels, density));
float d0 = Lerp(dy, d00, d10);
float d1 = Lerp(dy, d01, d11);
return Lerp(dz, d0, d1);
@@ -221,10 +185,8 @@ transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
uniform bool checkForSameVoxel = true;
while (t < rayT1) {
tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels,
checkForSameVoxel);
tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
pos = pos + dirStep;
t += stepT;
}
@@ -268,9 +230,8 @@ raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
float t = rayT0;
float3 pos = ray.origin + ray.dir * rayT0;
float3 dirStep = ray.dir * stepT;
uniform bool checkForSameVoxel = true;
cwhile (t < rayT1) {
float d = Density(pos, pMin, pMax, density, nVoxels, checkForSameVoxel);
float d = Density(pos, pMin, pMax, density, nVoxels);
// terminate once attenuation is high
float atten = exp(-tau);

View File

@@ -156,18 +156,18 @@
<ItemGroup>
<CustomBuild Include="volume.ispc">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

528
expr.cpp

File diff suppressed because it is too large.

5
expr.h
View File

@@ -634,13 +634,14 @@ public:
being done just given type information without the parameter
argument expressions being available. It returns true on success.
*/
bool ResolveOverloads(const std::vector<const Type *> &argTypes,
bool ResolveOverloads(SourcePos argPos,
const std::vector<const Type *> &argTypes,
const std::vector<bool> *argCouldBeNULL = NULL);
Symbol *GetMatchingFunction();
private:
bool tryResolve(int (*matchFunc)(const Type *, const Type *),
const std::vector<const Type *> &argTypes,
SourcePos argPos, const std::vector<const Type *> &argTypes,
const std::vector<bool> *argCouldBeNULL);
/** Name of the function that is being called. */

111
func.cpp
View File

@@ -72,7 +72,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
code = c;
maskSymbol = m->symbolTable->LookupVariable("__mask");
assert(maskSymbol != NULL);
Assert(maskSymbol != NULL);
if (code != NULL) {
if (g->debugPrint) {
@@ -109,7 +109,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
}
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
assert(type != NULL);
Assert(type != NULL);
for (unsigned int i = 0; i < args.size(); ++i)
if (dynamic_cast<const ReferenceType *>(args[i]->type) == NULL)
@@ -117,13 +117,13 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
if (type->isTask) {
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
assert(threadIndexSym);
Assert(threadIndexSym);
threadCountSym = m->symbolTable->LookupVariable("threadCount");
assert(threadCountSym);
Assert(threadCountSym);
taskIndexSym = m->symbolTable->LookupVariable("taskIndex");
assert(taskIndexSym);
Assert(taskIndexSym);
taskCountSym = m->symbolTable->LookupVariable("taskCount");
assert(taskCountSym);
Assert(taskCountSym);
}
else
threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL;
@@ -133,7 +133,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
const Type *
Function::GetReturnType() const {
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
assert(type != NULL);
Assert(type != NULL);
return type->GetReturnType();
}
@@ -141,7 +141,7 @@ Function::GetReturnType() const {
const FunctionType *
Function::GetType() const {
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
assert(type != NULL);
Assert(type != NULL);
return type;
}
@@ -157,9 +157,9 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
// We expect the argument structure to come in as a pointer to a
// structure. Confirm and figure out its type here.
const llvm::Type *structArgType = structArgPtr->getType();
assert(llvm::isa<llvm::PointerType>(structArgType));
Assert(llvm::isa<llvm::PointerType>(structArgType));
const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(structArgType);
assert(llvm::isa<llvm::StructType>(pt->getElementType()));
Assert(llvm::isa<llvm::StructType>(pt->getElementType()));
const llvm::StructType *argStructType =
llvm::dyn_cast<const llvm::StructType>(pt->getElementType());
@@ -189,10 +189,9 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
void
Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
SourcePos firstStmtPos) {
llvm::Value *maskPtr = ctx->AllocaInst(LLVMTypes::MaskType, "mask_memory");
ctx->StoreInst(LLVMMaskAllOn, maskPtr);
maskSymbol->storagePtr = maskPtr;
ctx->SetMaskPointer(maskPtr);
// Connect the __mask builtin to the location in memory that stores its
// value
maskSymbol->storagePtr = ctx->GetFullMaskPointer();
// add debugging info for __mask, programIndex, ...
maskSymbol->pos = firstStmtPos;
@@ -202,7 +201,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
#endif
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
assert(type != NULL);
Assert(type != NULL);
if (type->isTask == true) {
// For tasks, there should always be three parameters: the
// pointer to the structure that holds all of the arguments, the
@@ -267,38 +266,74 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
else {
// Otherwise use the mask to set the entry mask value
argIter->setName("__mask");
assert(argIter->getType() == LLVMTypes::MaskType);
Assert(argIter->getType() == LLVMTypes::MaskType);
ctx->SetFunctionMask(argIter);
assert(++argIter == function->arg_end());
Assert(++argIter == function->arg_end());
}
}
// Finally, we can generate code for the function
if (code != NULL) {
ctx->SetDebugPos(code->pos);
ctx->AddInstrumentationPoint("function entry");
int costEstimate = code->EstimateCost();
Debug(code->pos, "Estimated cost for function \"%s\" = %d\n",
sym->name.c_str(), costEstimate);
// If the body of the function is non-trivial, then we wrap the
// entire thing inside code that tests to see if the mask is all
// on, all off, or mixed. If this is a simple function, then this
// isn't worth the code bloat / overhead.
bool checkMask = (type->isTask == true) ||
((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
Debug(code->pos, "Estimated cost for function \"%s\" = %d\n",
sym->name.c_str(), costEstimate);
// If the body of the function is non-trivial, then we wrap the
// entire thing around a varying "cif (true)" test in order to reap
// the side-effect benefit of checking to see if the execution mask
// is all on and thence having a specialized code path for that
// case. If this is a simple function, then this isn't worth the
// code bloat / overhead.
if (checkMask) {
bool allTrue[ISPC_MAX_NVEC];
for (int i = 0; i < g->target.vectorWidth; ++i)
allTrue[i] = true;
Expr *trueExpr = new ConstExpr(AtomicType::VaryingBool, allTrue,
code->pos);
code = new IfStmt(trueExpr, code, NULL, true, code->pos);
}
if (checkMask && g->opt.disableCoherentControlFlow == false) {
llvm::Value *mask = ctx->GetFunctionMask();
llvm::Value *allOn = ctx->All(mask);
llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on");
llvm::BasicBlock *bbNotAll = ctx->CreateBasicBlock("not_all_on");
ctx->SetDebugPos(code->pos);
ctx->AddInstrumentationPoint("function entry");
code->EmitCode(ctx);
ctx->BranchInst(bbAllOn, bbNotAll, allOn);
// all on: we've determined dynamically that the mask is all
// on. Set the current mask to "all on" explicitly so that
// codegen for this path can be improved with this knowledge in
// hand...
ctx->SetCurrentBasicBlock(bbAllOn);
if (!g->opt.disableMaskAllOnOptimizations)
ctx->SetFunctionMask(LLVMMaskAllOn);
code->EmitCode(ctx);
if (ctx->GetCurrentBasicBlock())
ctx->ReturnInst();
// not all on: figure out if no instances are running, or if
// some of them are
ctx->SetCurrentBasicBlock(bbNotAll);
ctx->SetFunctionMask(mask);
llvm::BasicBlock *bbNoneOn = ctx->CreateBasicBlock("none_on");
llvm::BasicBlock *bbSomeOn = ctx->CreateBasicBlock("some_on");
llvm::Value *anyOn = ctx->Any(mask);
ctx->BranchInst(bbSomeOn, bbNoneOn, anyOn);
// Everyone is off; get out of here.
ctx->SetCurrentBasicBlock(bbNoneOn);
ctx->ReturnInst();
// some on: reset the mask to the value it had at function
// entry and emit the code. Resetting the mask here is
// important, due to the "all on" setting of it for the path
// above
ctx->SetCurrentBasicBlock(bbSomeOn);
ctx->SetFunctionMask(mask);
code->EmitCode(ctx);
if (ctx->GetCurrentBasicBlock())
ctx->ReturnInst();
}
else
// No check, just emit the code
code->EmitCode(ctx);
}
if (ctx->GetCurrentBasicBlock()) {
@@ -337,7 +372,7 @@ Function::GenerateIR() {
return;
llvm::Function *function = sym->function;
assert(function != NULL);
Assert(function != NULL);
// But if that function has a definition, we don't want to redefine it.
if (function->empty() == false) {
@@ -376,7 +411,7 @@ Function::GenerateIR() {
// it without a mask parameter and without name mangling so that
// the application can call it
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
assert(type != NULL);
Assert(type != NULL);
if (type->isExported) {
if (!type->isTask) {
LLVM_TYPE_CONST llvm::FunctionType *ftype =

View File

@@ -161,7 +161,21 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->vectorWidth = 16;
t->attributes = "+avx,+popcnt,+cmov";
}
#endif // LLVM 3.0
#endif // LLVM 3.0+
#if defined(LLVM_3_1svn)
else if (!strcasecmp(isa, "avx2")) {
t->isa = Target::AVX2;
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->attributes = "+avx2,+popcnt,+cmov";
}
else if (!strcasecmp(isa, "avx2-x2")) {
t->isa = Target::AVX2;
t->nativeVectorWidth = 16;
t->vectorWidth = 16;
t->attributes = "+avx2,+popcnt,+cmov";
}
#endif // LLVM 3.1
else {
fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n",
isa, SupportedTargetISAs());
@@ -201,9 +215,12 @@ Target::SupportedTargetArchs() {
const char *
Target::SupportedTargetISAs() {
return "sse2, sse2-x2, sse4, sse4-x2"
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
#ifndef LLVM_2_9
", avx, avx-x2"
#endif
#endif // !LLVM_2_9
#ifdef LLVM_3_1svn
", avx2, avx2-x2"
#endif // LLVM_3_1svn
;
}
@@ -241,11 +258,19 @@ Target::GetTargetMachine() const {
llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ :
llvm::Reloc::Default;
#if defined(LLVM_3_0svn) || defined(LLVM_3_1svn) || defined(LLVM_3_0)
#if defined(LLVM_3_1svn)
std::string featuresString = attributes;
llvm::TargetOptions options;
if (g->opt.fastMath == true)
options.UnsafeFPMath = 1;
llvm::TargetMachine *targetMachine =
target->createTargetMachine(triple, cpu, featuresString, options,
relocModel);
#elif defined(LLVM_3_0)
std::string featuresString = attributes;
llvm::TargetMachine *targetMachine =
target->createTargetMachine(triple, cpu, featuresString, relocModel);
#else
#else // LLVM 2.9
#ifdef ISPC_IS_APPLE
relocModel = llvm::Reloc::PIC_;
#endif // ISPC_IS_APPLE
@@ -255,8 +280,9 @@ Target::GetTargetMachine() const {
#ifndef ISPC_IS_WINDOWS
targetMachine->setRelocationModel(relocModel);
#endif // !ISPC_IS_WINDOWS
#endif
assert(targetMachine != NULL);
#endif // LLVM_2_9
Assert(targetMachine != NULL);
targetMachine->setAsmVerbosityDefault(true);
return targetMachine;
@@ -272,7 +298,8 @@ Target::GetISAString() const {
return "sse4";
case Target::AVX:
return "avx";
break;
case Target::AVX2:
return "avx2";
default:
FATAL("Unhandled target in GetISAString()");
}
@@ -283,10 +310,10 @@ Target::GetISAString() const {
llvm::Value *
Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type) {
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
assert(td != NULL);
Assert(td != NULL);
uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
if (is32Bit || g->opt.force32BitAddressing)
return LLVMInt32(byteSize);
return LLVMInt32((int32_t)byteSize);
else
return LLVMInt64(byteSize);
}
@@ -295,16 +322,16 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type) {
llvm::Value *
Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element) {
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
assert(td != NULL);
Assert(td != NULL);
LLVM_TYPE_CONST llvm::StructType *structType =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
assert(structType != NULL);
Assert(structType != NULL);
const llvm::StructLayout *sl = td->getStructLayout(structType);
assert(sl != NULL);
Assert(sl != NULL);
uint64_t offset = sl->getElementOffset(element);
if (is32Bit || g->opt.force32BitAddressing)
return LLVMInt32(offset);
return LLVMInt32((int32_t)offset);
else
return LLVMInt64(offset);
}
@@ -320,6 +347,7 @@ Opt::Opt() {
force32BitAddressing = true;
unrollLoops = true;
disableAsserts = false;
disableMaskAllOnOptimizations = false;
disableHandlePseudoMemoryOps = false;
disableBlendedMaskedStores = false;
disableCoherentControlFlow = false;
@@ -328,7 +356,6 @@ Opt::Opt() {
disableMaskedStoreToStore = false;
disableGatherScatterFlattening = false;
disableUniformMemoryOptimizations = false;
disableMaskedStoreOptimizations = false;
}
///////////////////////////////////////////////////////////////////////////
@@ -362,7 +389,13 @@ Globals::Globals() {
// SourcePos
SourcePos::SourcePos(const char *n, int fl, int fc, int ll, int lc) {
name = n ? n : m->module->getModuleIdentifier().c_str();
name = n;
if (name == NULL) {
if (m != NULL)
name = m->module->getModuleIdentifier().c_str();
else
name = "(unknown)";
}
first_line = fl;
first_column = fc;
last_line = ll != 0 ? ll : fl;

32
ispc.h
View File

@@ -50,11 +50,22 @@
#define ISPC_IS_APPLE
#endif
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <string>
#define Assert(expr) \
((void)((expr) ? 0 : __Assert (#expr, __FILE__, __LINE__)))
#define __Assert(expr, file, line) \
((void)fprintf(stderr, "%s:%u: Assertion failed: \"%s\"\n" \
"***\n*** Please file a bug report at " \
"https://github.com/ispc/ispc/issues\n*** (Including as much " \
"information as you can about how to reproduce this error).\n" \
"*** You have apparently encountered a bug in the compiler that " \
"we'd like to fix!\n***\n", file, line, expr), abort(), 0)
/** @def ISPC_MAX_NVEC maximum vector size of any of the compilation
targets.
*/
@@ -182,7 +193,7 @@ struct Target {
flexible/performant of them will appear last in the enumerant. Note
also that __best_available_isa() needs to be updated if ISAs are
added or the enumerant values are reordered. */
enum ISA { SSE2, SSE4, AVX, NUM_ISAS };
enum ISA { SSE2, SSE4, AVX, AVX2, NUM_ISAS };
/** Instruction set being compiled to. */
ISA isa;
@@ -247,10 +258,15 @@ struct Opt {
*/
bool force32BitAddressing;
/** Indicates whether assert() statements should be ignored (for
/** Indicates whether Assert() statements should be ignored (for
performance in the generated code). */
bool disableAsserts;
/** If enabled, disables the various optimizations that kick in when
the execution mask can be determined to be "all on" at compile
time. */
bool disableMaskAllOnOptimizations;
/** If enabled, the various __pseudo* memory ops (gather/scatter,
masked load/store) are left in their __pseudo* form, for better
understanding of the structure of generated code when reading
@@ -302,14 +318,6 @@ struct Opt {
than gathers/scatters. This is likely only useful for measuring
the impact of this optimization. */
bool disableUniformMemoryOptimizations;
/** Disables optimizations for masked stores: masked stores with the
mask all on are transformed to regular stores, and masked stores
with the mask are all off are removed (which in turn can allow
eliminating additional dead code related to computing the value
stored). This is likely only useful for measuring the impact of
this optimization. */
bool disableMaskedStoreOptimizations;
};
/** @brief This structure collects together a number of global variables.

View File

@@ -3,8 +3,6 @@ Microsoft Visual Studio Solution File, Format Version 11.00
# Visual Studio 2010
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ispc", "ispc.vcxproj", "{9861F490-F516-480C-B63C-D62A77AFA9D5}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ispc_test", "ispc_test.vcxproj", "{92547BA8-BE86-4E78-8799-1D72A70E5831}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
@@ -15,9 +13,6 @@ Global
{9861F490-F516-480C-B63C-D62A77AFA9D5}.Debug|Win32.Build.0 = Debug|Win32
{9861F490-F516-480C-B63C-D62A77AFA9D5}.Release|Win32.ActiveCfg = Release|Win32
{9861F490-F516-480C-B63C-D62A77AFA9D5}.Release|Win32.Build.0 = Release|Win32
{92547BA8-BE86-4E78-8799-1D72A70E5831}.Debug|Win32.ActiveCfg = Debug|Win32
{92547BA8-BE86-4E78-8799-1D72A70E5831}.Debug|Win32.Build.0 = Debug|Win32
{92547BA8-BE86-4E78-8799-1D72A70E5831}.Release|Win32.ActiveCfg = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

View File

@@ -1,379 +0,0 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define _CRT_SECURE_NO_WARNINGS
#if defined(_WIN32) || defined(_WIN64)
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#ifdef ISPC_IS_WINDOWS
#define NOMINMAX
#include <windows.h>
#endif
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <memory.h>
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif
#ifdef ISPC_HAVE_SVML
#include <xmmintrin.h>
extern "C" {
extern __m128 __svml_sinf4(__m128);
extern __m128 __svml_cosf4(__m128);
extern __m128 __svml_sincosf4(__m128 *,__m128);
extern __m128 __svml_tanf4(__m128);
extern __m128 __svml_atanf4(__m128);
extern __m128 __svml_atan2f4(__m128, __m128);
extern __m128 __svml_expf4(__m128);
extern __m128 __svml_logf4(__m128);
extern __m128 __svml_powf4(__m128, __m128);
}
#endif
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
#include <llvm/DerivedTypes.h>
#include <llvm/Instructions.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
#include <llvm/Support/TargetRegistry.h>
#include <llvm/Support/TargetSelect.h>
#else
#include <llvm/Target/TargetRegistry.h>
#include <llvm/Target/TargetSelect.h>
#endif
#include <llvm/ExecutionEngine/JIT.h>
#include <llvm/Target/TargetOptions.h>
#include <llvm/Target/TargetData.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/PassManager.h>
#include <llvm/Support/CFG.h>
#include <llvm/Analysis/Verifier.h>
#include <llvm/Assembly/PrintModulePass.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Bitcode/ReaderWriter.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/system_error.h>
bool shouldFail = false;
extern "C" {
void ISPCLaunch(void **, void *, void *, int32_t);
void ISPCSync(void *);
void *ISPCAlloc(void **, int64_t size, int32_t alignment);
}
void ISPCLaunch(void **handle, void *func, void *data, int32_t count) {
*handle = (void *)0xdeadbeef;
typedef void (*TaskFuncType)(void *, int, int, int, int);
TaskFuncType tft = (TaskFuncType)(func);
for (int i = 0; i < count; ++i)
tft(data, 0, 1, i, count);
}
void ISPCSync(void *) {
}
void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
*handle = (void *)0xdeadbeef;
// leak time!
#ifdef ISPC_IS_WINDOWS
return _aligned_malloc((size_t)size, alignment);
#endif
#ifdef ISPC_IS_LINUX
return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
void *mem = malloc(size + (alignment-1) + sizeof(void*));
char *amem = ((char*)mem) + sizeof(void*);
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
(alignment - 1)));
((void**)amem)[-1] = mem;
return amem;
#endif
}
static void usage(int ret) {
fprintf(stderr, "usage: ispc_test\n");
fprintf(stderr, "\t[-h/--help]\tprint help\n");
fprintf(stderr, "\t[-f]\t\tindicates that test is expected to fail\n");
fprintf(stderr, "\t<files>\n");
exit(ret);
}
static void svml_missing() {
fprintf(stderr, "Program called unavailable SVML function!\n");
exit(1);
}
// On Windows, sin() is an overloaded function, so we need an unambiguous
// function we can take the address of when wiring up the external references
// below.
double Sin(double x) { return sin(x); }
double Cos(double x) { return cos(x); }
double Tan(double x) { return tan(x); }
double Atan(double x) { return atan(x); }
double Atan2(double y, double x) { return atan2(y, x); }
double Pow(double a, double b) { return pow(a, b); }
double Exp(double x) { return exp(x); }
double Log(double x) { return log(x); }
static bool lRunTest(const char *fn) {
llvm::LLVMContext *ctx = new llvm::LLVMContext;
llvm::OwningPtr<llvm::MemoryBuffer> buf;
llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
if (err) {
fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.message().c_str());
delete ctx;
return false;
}
std::string bcErr;
llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
if (!module) {
fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
delete ctx;
return false;
}
std::string eeError;
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::EngineBuilder engineBuilder(module);
engineBuilder.setErrorStr(&eeError);
engineBuilder.setEngineKind(llvm::EngineKind::JIT);
#if 0
std::vector<std::string> attributes;
if (target != NULL && !strcmp(target, "avx"))
attributes.push_back("+avx");
engineBuilder.setMAttrs(attributes);
engineBuilder.setUseMCJIT(true);
#endif
llvm::ExecutionEngine *ee = engineBuilder.create();
#else
llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError);
#endif
if (!ee) {
fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str());
return false;
}
llvm::Function *func;
#define DO_FUNC(FUNC ,FUNCNAME) \
if ((func = module->getFunction(FUNCNAME)) != NULL) \
ee->addGlobalMapping(func, (void *)FUNC)
DO_FUNC(ISPCLaunch, "ISPCLaunch");
DO_FUNC(ISPCSync, "ISPCSync");
DO_FUNC(ISPCAlloc, "ISPCAlloc");
DO_FUNC(putchar, "putchar");
DO_FUNC(printf, "printf");
DO_FUNC(fflush, "fflush");
DO_FUNC(sinf, "sinf");
DO_FUNC(cosf, "cosf");
DO_FUNC(tanf, "tanf");
DO_FUNC(atanf, "atanf");
DO_FUNC(atan2f, "atan2f");
DO_FUNC(powf, "powf");
DO_FUNC(expf, "expf");
DO_FUNC(logf, "logf");
DO_FUNC(Sin, "sin");
DO_FUNC(Cos, "cos");
DO_FUNC(Tan, "tan");
DO_FUNC(Atan, "atan");
DO_FUNC(Atan2, "atan2");
DO_FUNC(Pow, "pow");
DO_FUNC(Exp, "exp");
DO_FUNC(Log, "log");
DO_FUNC(memset, "memset");
#ifdef ISPC_IS_APPLE
DO_FUNC(memset_pattern4, "memset_pattern4");
DO_FUNC(memset_pattern8, "memset_pattern8");
DO_FUNC(memset_pattern16, "memset_pattern16");
#endif
#ifdef ISPC_HAVE_SVML
#define DO_SVML(FUNC ,FUNCNAME) \
if ((func = module->getFunction(FUNCNAME)) != NULL) \
ee->addGlobalMapping(func, (void *)FUNC)
#else
#define DO_SVML(FUNC, FUNCNAME) \
if ((func = module->getFunction(FUNCNAME)) != NULL) \
ee->addGlobalMapping(func, (void *)svml_missing)
#endif
DO_SVML(__svml_sinf4, "__svml_sinf4");
DO_SVML(__svml_cosf4, "__svml_cosf4");
DO_SVML(__svml_sincosf4, "__svml_sincosf4");
DO_SVML(__svml_tanf4, "__svml_tanf4");
DO_SVML(__svml_atanf4, "__svml_atanf4");
DO_SVML(__svml_atan2f4, "__svml_atan2f4");
DO_SVML(__svml_expf4, "__svml_expf4");
DO_SVML(__svml_logf4, "__svml_logf4");
DO_SVML(__svml_powf4, "__svml_powf4");
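// module->getFunction() only returns a declaration if the test's bitcode
// actually references that symbol, and addGlobalMapping() then points the
// JIT at the given host address instead of letting it search the process
// for the symbol.  When SVML isn't built in, the SVML entry points are
// wired to svml_missing() so a test that calls them fails with a clear
// message instead of an unresolved-symbol crash.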
// figure out the vector width in the compiled code
func = module->getFunction("width");
if (!func) {
fprintf(stderr, "No width() function found!\n");
return false;
}
int width;
{
typedef int (*PFN)();
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
width = pfn();
assert(width == 4 || width == 8 || width == 12 || width == 16);
}
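// getPointerToFunction() JIT-compiles width() on demand and returns a
// host-callable address; the value it reports (presumably the gang size the
// test was compiled for) bounds the per-lane comparison loops below.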
// find the value that returns the desired result
func = module->getFunction("result");
bool foundResult = (func != NULL);
float result[16];
for (int i = 0; i < 16; ++i)
result[i] = 0;
if (foundResult) {
typedef void (*PFN)(float *);
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
pfn(result);
}
else
fprintf(stderr, "Warning: no result() function found.\n");
// try to find a function to run
float returned[16];
for (int i = 0; i < 16; ++i)
returned[i] = 0;
float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
int vint2[16] = { 5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
if ((func = module->getFunction("f_v")) != NULL) {
typedef void (*PFN)(float *);
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
pfn(returned);
}
else if ((func = module->getFunction("f_f")) != NULL) {
typedef void (*PFN)(float *, float *);
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
llvm::verifyFunction(*func);
pfn(returned, vfloat);
}
else if ((func = module->getFunction("f_fu")) != NULL) {
typedef void (*PFN)(float *, float *, float fu);
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
llvm::verifyFunction(*func);
pfn(returned, vfloat, 5.);
}
else if ((func = module->getFunction("f_fi")) != NULL) {
typedef void (*PFN)(float *, float *, int *);
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
pfn(returned, vfloat, vint);
}
else if ((func = module->getFunction("f_du")) != NULL) {
typedef void (*PFN)(float *, double *, double);
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
pfn(returned, vdouble, 5.);
}
else if ((func = module->getFunction("f_duf")) != NULL) {
typedef void (*PFN)(float *, double *, float);
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
pfn(returned, vdouble, 5.f);
}
else if ((func = module->getFunction("f_di")) != NULL) {
typedef void (*PFN)(float *, double *, int *);
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
pfn(returned, vdouble, vint2);
}
else {
fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn);
return false;
}
// see if we got the right result
bool resultsMatch = true;
if (foundResult) {
for (int i = 0; i < width; ++i)
if (returned[i] != result[i]) {
resultsMatch = false;
fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
fn, i, returned[i], returned[i], result[i], result[i]);
}
}
else {
for (int i = 0; i < width; ++i)
fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
fn, i, returned[i], returned[i]);
}
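// The %a conversions print the lane values as exact hexadecimal floats
// (1.5 prints as 0x1.8p+0, for example), so even a one-ulp difference
// between returned and expected values shows up in the failure output.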
if (foundResult && shouldFail && resultsMatch)
fprintf(stderr, "Test %s unexpectedly passed\n", fn);
delete ee;
delete ctx;
return foundResult && resultsMatch;
}
int main(int argc, char *argv[]) {
llvm::InitializeNativeTarget();
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
LLVMLinkInJIT();
#endif
const char *filename = NULL;
for (int i = 1; i < argc; ++i) {
if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
usage(0);
if (!strcmp(argv[i], "-f"))
shouldFail = true;
else
filename = argv[i];
}
return (lRunTest(filename) == true) ? 0 : 1;
}


@@ -1,90 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="ispc_test.cpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{92547BA8-BE86-4E78-8799-1D72A70E5831}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>ispc_test</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

lex.ll

@@ -148,65 +148,48 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
return TOKEN_IDENTIFIER;
}
{INT_NUMBER} {
char *endPtr = NULL;
int64_t val;
{INT_NUMBER}+(u|U|l|L)*? {
int ls = 0, us = 0;
if (yytext[0] == '0' && yytext[1] == 'b')
val = lParseBinary(yytext+2, *yylloc);
yylval->intVal = lParseBinary(yytext+2, *yylloc);
else {
char *endPtr = NULL;
#ifdef ISPC_IS_WINDOWS
val = _strtoi64(yytext, &endPtr, 0);
yylval->intVal = _strtoi64(yytext, &endPtr, 0);
#else
// FIXME: should use strtouq and then issue an error if we can't
// fit into 64 bits...
val = strtoull(yytext, &endPtr, 0);
yylval->intVal = strtoull(yytext, &endPtr, 0);
#endif
for (; *endPtr; endPtr++) {
if (*endPtr == 'l' || *endPtr == 'L')
ls++;
else if (*endPtr == 'u' || *endPtr == 'U')
us++;
}
if (ls >= 2)
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
else if (ls == 1)
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
}
// See if we can fit this into a 32-bit integer...
if ((val & 0xffffffff) == val) {
yylval->int32Val = (int32_t)val;
return TOKEN_INT32_CONSTANT;
}
else {
yylval->int64Val = val;
return TOKEN_INT64_CONSTANT;
}
if ((yylval->intVal & 0xffffffff) == yylval->intVal)
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
else
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
}
{INT_NUMBER}[uU] {
char *endPtr = NULL;
uint64_t val;
if (yytext[0] == '0' && yytext[1] == 'b')
val = lParseBinary(yytext+2, *yylloc);
else {
#ifdef ISPC_IS_WINDOWS
val = _strtoui64(yytext, &endPtr, 0);
#else
val = strtoull(yytext, &endPtr, 0);
#endif
}
if ((val & 0xffffffff) == val) {
// we can represent it in a 32-bit value
yylval->int32Val = (int32_t)val;
return TOKEN_UINT32_CONSTANT;
}
else {
yylval->int64Val = val;
return TOKEN_UINT64_CONSTANT;
}
}
{FLOAT_NUMBER} {
yylval->floatVal = atof(yytext);
yylval->floatVal = (float)atof(yytext);
return TOKEN_FLOAT_CONSTANT;
}
{HEX_FLOAT_NUMBER} {
yylval->floatVal = lParseHexFloat(yytext);
yylval->floatVal = (float)lParseHexFloat(yytext);
return TOKEN_FLOAT_CONSTANT;
}
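Restated outside of flex, the suffix handling in the integer-constant rule above amounts to the following standalone C++ sketch (the enum and function names are invented for illustration and do not appear in ispc; the 0b binary case, which the lexer routes through lParseBinary(), is omitted):
#include <cstdint>
#include <cstdlib>
enum IntTokenKind { INT32_TOK, UINT32_TOK, INT64_TOK, UINT64_TOK };
// Parse the digits, count trailing 'l'/'u' suffixes, and pick the token:
// two or more 'l's force 64 bit, a single 'l' forces 32 bit, and otherwise
// the smallest width that holds the value wins; any 'u' makes it unsigned.
static IntTokenKind ClassifyIntConstant(const char *text, uint64_t *value) {
    char *endPtr = NULL;
    *value = strtoull(text, &endPtr, 0);
    int ls = 0, us = 0;
    for (; *endPtr; ++endPtr) {
        if (*endPtr == 'l' || *endPtr == 'L') ++ls;
        else if (*endPtr == 'u' || *endPtr == 'U') ++us;
    }
    if (ls >= 2) return us ? UINT64_TOK : INT64_TOK;
    if (ls == 1) return us ? UINT32_TOK : INT32_TOK;
    if ((*value & 0xffffffffull) == *value)
        return us ? UINT32_TOK : INT32_TOK;
    return us ? UINT64_TOK : INT64_TOK;
}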
@@ -291,7 +274,7 @@ lParseBinary(const char *ptr, SourcePos pos) {
while (*ptr != '\0') {
/* if this hits, the regexp for 0b... constants is broken */
assert(*ptr == '0' || *ptr == '1');
Assert(*ptr == '0' || *ptr == '1');
if ((val & (((int64_t)1)<<63)) && warned == false) {
// We're about to shift out a set bit
@@ -346,7 +329,7 @@ static void lHandleCppHash(SourcePos *pos) {
char *ptr, *src;
// Advance past the opening stuff on the line.
assert(yytext[0] == '#');
Assert(yytext[0] == '#');
if (yytext[1] == ' ')
// On Linux/OSX, the preprocessor gives us lines like
// # 1234 "foo.c"
@@ -354,7 +337,7 @@ static void lHandleCppHash(SourcePos *pos) {
else {
// On windows, cl.exe's preprocessor gives us lines of the form:
// #line 1234 "foo.c"
assert(!strncmp(yytext+1, "line ", 5));
Assert(!strncmp(yytext+1, "line ", 5));
ptr = yytext + 6;
}
@@ -364,13 +347,13 @@ static void lHandleCppHash(SourcePos *pos) {
pos->last_column = 1;
// Make sure that the character after the integer is a space and that
// then we have open quotes
assert(src != ptr && src[0] == ' ' && src[1] == '"');
Assert(src != ptr && src[0] == ' ' && src[1] == '"');
src += 2;
// And the filename is everything up until the closing quotes
std::string filename;
while (*src != '"') {
assert(*src && *src != '\n');
Assert(*src && *src != '\n');
filename.push_back(*src);
++src;
}
@@ -471,13 +454,13 @@ ipow2(int exponent) {
*/
static double
lParseHexFloat(const char *ptr) {
assert(ptr != NULL);
Assert(ptr != NULL);
assert(ptr[0] == '0' && ptr[1] == 'x');
Assert(ptr[0] == '0' && ptr[1] == 'x');
ptr += 2;
// Start initializing the mantissa
assert(*ptr == '0' || *ptr == '1');
Assert(*ptr == '0' || *ptr == '1');
double mantissa = (*ptr == '1') ? 1. : 0.;
++ptr;
@@ -497,7 +480,7 @@ lParseHexFloat(const char *ptr) {
else if (*ptr >= 'a' && *ptr <= 'f')
digit = 10 + *ptr - 'a';
else {
assert(*ptr >= 'A' && *ptr <= 'F');
Assert(*ptr >= 'A' && *ptr <= 'F');
digit = 10 + *ptr - 'A';
}
@@ -510,7 +493,7 @@ lParseHexFloat(const char *ptr) {
else
// If there's not a '.', then we better be going straight to the
// exponent
assert(*ptr == 'p');
Assert(*ptr == 'p');
++ptr; // skip the 'p'
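As a cross-check on lParseHexFloat() above: C99's strtod() accepts the same hexadecimal float notation, so it can supply reference values (a standalone sketch, not part of the patch):
#include <cstdio>
#include <cstdlib>
int main() {
    // 0x1.8p+1 = (1 + 8/16) * 2^1 = 3
    printf("%g\n", strtod("0x1.8p+1", NULL));   // prints 3
    return 0;
}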


@@ -424,7 +424,7 @@ LLVMBoolVector(bool b) {
v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0,
false /*unsigned*/);
else {
assert(LLVMTypes::BoolVectorType->getElementType() ==
Assert(LLVMTypes::BoolVectorType->getElementType() ==
llvm::Type::getInt1Ty(*g->ctx));
v = b ? LLVMTrue : LLVMFalse;
}
@@ -445,7 +445,7 @@ LLVMBoolVector(const bool *bvec) {
v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0,
false /*unsigned*/);
else {
assert(LLVMTypes::BoolVectorType->getElementType() ==
Assert(LLVMTypes::BoolVectorType->getElementType() ==
llvm::Type::getInt1Ty(*g->ctx));
v = bvec[i] ? LLVMTrue : LLVMFalse;
}


@@ -37,6 +37,7 @@
#include "ispc.h"
#include "module.h"
#include "util.h"
#include <stdio.h>
#include <stdlib.h>
#include <llvm/Support/PrettyStackTrace.h>
@@ -91,15 +92,15 @@ static void usage(int ret) {
printf(" fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
printf(" fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
#if 0
printf(" disable-handle-pseudo-memory-ops\n");
printf(" disable-all-on-optimizations\n");
printf(" disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
printf(" disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
printf(" disable-gather-scatter-optimizations\tDisable improvements to gather/scatter\n");
printf(" disable-blending-removal\t\tDisable eliminating blend at same scope\n");
printf(" disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
printf(" disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
printf(" disable-gather-scatter-optimizations\tDisable improvements to gather/scatter\n");
printf(" disable-handle-pseudo-memory-ops\n");
printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
printf(" disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
#endif
#ifndef ISPC_IS_WINDOWS
printf(" [--pic]\t\t\t\tGenerate position-independent code\n");
@@ -166,10 +167,12 @@ int main(int Argc, char *Argv[]) {
char *argv[128];
lGetAllArgs(Argc, Argv, argc, argv);
#if 0
// Use LLVM's little utility function to print out nice stack traces if
// we crash
llvm::sys::PrintStackTraceOnErrorSignal();
llvm::PrettyStackTraceProgram X(argc, argv);
#endif
// initialize available LLVM targets
LLVMInitializeX86TargetInfo();
@@ -203,7 +206,7 @@ int main(int Argc, char *Argv[]) {
if (atoi(argv[i] + 13) == 64)
g->opt.force32BitAddressing = false;
else if (atoi(argv[i] + 13) == 32)
g->opt.force32BitAddressing = 32;
g->opt.force32BitAddressing = true;
else {
fprintf(stderr, "Addressing width \"%s\" invalid--only 32 and "
"64 are allowed.\n", argv[i]+13);
@@ -270,6 +273,8 @@ int main(int Argc, char *Argv[]) {
// These are only used for performance tests of specific
// optimizations
else if (!strcmp(opt, "disable-all-on-optimizations"))
g->opt.disableMaskAllOnOptimizations = true;
else if (!strcmp(opt, "disable-handle-pseudo-memory-ops"))
g->opt.disableHandlePseudoMemoryOps = true;
else if (!strcmp(opt, "disable-blended-masked-stores"))
@@ -286,8 +291,6 @@ int main(int Argc, char *Argv[]) {
g->opt.disableGatherScatterFlattening = true;
else if (!strcmp(opt, "disable-uniform-memory-optimizations"))
g->opt.disableUniformMemoryOptimizations = true;
else if (!strcmp(opt, "disable-masked-store-optimizations"))
g->opt.disableMaskedStoreOptimizations = true;
else
usage(1);
}
@@ -354,6 +357,11 @@ int main(int Argc, char *Argv[]) {
if (debugSet && !optSet)
g->opt.level = 0;
if (outFileName == NULL && headerFileName == NULL)
Warning(SourcePos(), "No output file or header file name specified. "
"Program will be compiled and warnings/errors will "
"be issued, but no output will be generated.");
return Module::CompileAndOutput(file, arch, cpu, target, generatePIC,
ot, outFileName, headerFileName);
}


@@ -49,7 +49,6 @@
#include "llvmutil.h"
#include <stdio.h>
#include <assert.h>
#include <stdarg.h>
#include <ctype.h>
#include <sys/types.h>
@@ -150,8 +149,10 @@ extern void yy_delete_buffer(YY_BUFFER_STATE);
int
Module::CompileFile() {
#ifndef LLVM_3_1svn
if (g->opt.fastMath == true)
llvm::UnsafeFPMath = true;
#endif // !LLVM_3_1svn
// FIXME: it'd be nice to do this in the Module constructor, but this
// function ends up calling into routines that expect the global
@@ -222,7 +223,7 @@ Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
if (sym == NULL || sym->type == NULL) {
// But if these are NULL and there haven't been any previous
// errors, something surprising is going on
assert(errorCount > 0);
Assert(errorCount > 0);
return;
}
@@ -389,7 +390,7 @@ void
Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
const FunctionType *functionType =
dynamic_cast<const FunctionType *>(funSym->type);
assert(functionType != NULL);
Assert(functionType != NULL);
// If a global variable with the same name has already been declared
// issue an error.
@@ -416,7 +417,7 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
// allowed.
const FunctionType *ofType =
dynamic_cast<const FunctionType *>(overloadFunc->type);
assert(ofType != NULL);
Assert(ofType != NULL);
if (ofType->GetNumParameters() == functionType->GetNumParameters()) {
int i;
for (i = 0; i < functionType->GetNumParameters(); ++i) {
@@ -571,7 +572,7 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
// Finally, we know all is good and we can add the function to the
// symbol table
bool ok = symbolTable->AddFunction(funSym);
assert(ok);
Assert(ok);
}
@@ -729,7 +730,7 @@ static void
lVisitNode(const StructType *structType,
std::map<const StructType *, StructDAGNode *> &structToNode,
std::vector<const StructType *> &sortedTypes) {
assert(structToNode.find(structType) != structToNode.end());
Assert(structToNode.find(structType) != structToNode.end());
// Get the node that encodes the structs that this one is immediately
// dependent on.
StructDAGNode *node = structToNode[structType];
@@ -793,7 +794,7 @@ lEmitStructDecls(std::vector<const StructType *> &structTypes, FILE *file) {
if (hasIncomingEdges.find(structType) == hasIncomingEdges.end())
lVisitNode(structType, structToNode, sortedTypes);
}
assert(sortedTypes.size() == structTypes.size());
Assert(sortedTypes.size() == structTypes.size());
// And finally we can emit the struct declarations by going through the
// sorted ones in order.
@@ -828,10 +829,10 @@ lEmitEnumDecls(const std::vector<const EnumType *> &enumTypes, FILE *file) {
// Print the individual enumerators
for (int j = 0; j < enumTypes[i]->GetEnumeratorCount(); ++j) {
const Symbol *e = enumTypes[i]->GetEnumerator(j);
assert(e->constValue != NULL);
Assert(e->constValue != NULL);
unsigned int enumValue;
int count = e->constValue->AsUInt32(&enumValue);
assert(count == 1);
Assert(count == 1);
// Always print an initializer to set the value. We could be
// 'clever' here and detect whether the implicit value given by
@@ -897,7 +898,7 @@ lAddTypeIfNew(const Type *type, std::vector<const T *> *exportedTypes) {
return;
const T *castType = dynamic_cast<const T *>(type);
assert(castType != NULL);
Assert(castType != NULL);
exportedTypes->push_back(castType);
}
@@ -934,7 +935,7 @@ lGetExportedTypes(const Type *type,
else if (dynamic_cast<const VectorType *>(type) != NULL)
lAddTypeIfNew(type, exportedVectorTypes);
else
assert(dynamic_cast<const AtomicType *>(type) != NULL);
Assert(dynamic_cast<const AtomicType *>(type) != NULL);
}
@@ -965,7 +966,7 @@ lPrintFunctionDeclarations(FILE *file, const std::vector<Symbol *> &funcs) {
fprintf(file, "#ifdef __cplusplus\nextern \"C\" {\n#endif // __cplusplus\n");
for (unsigned int i = 0; i < funcs.size(); ++i) {
const FunctionType *ftype = dynamic_cast<const FunctionType *>(funcs[i]->type);
assert(ftype);
Assert(ftype);
std::string decl = ftype->GetCDeclaration(funcs[i]->name);
fprintf(file, " extern %s;\n", decl.c_str());
}
@@ -990,7 +991,7 @@ lPrintExternGlobals(FILE *file, const std::vector<Symbol *> &externGlobals) {
static bool
lIsExported(const Symbol *sym) {
const FunctionType *ft = dynamic_cast<const FunctionType *>(sym->type);
assert(ft);
Assert(ft);
return ft->isExported;
}
@@ -998,7 +999,7 @@ lIsExported(const Symbol *sym) {
static bool
lIsExternC(const Symbol *sym) {
const FunctionType *ft = dynamic_cast<const FunctionType *>(sym->type);
assert(ft);
Assert(ft);
return ft->isExternC;
}
@@ -1167,10 +1168,18 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
case Target::AVX:
opts.addMacroDef("ISPC_TARGET_AVX");
break;
case Target::AVX2:
opts.addMacroDef("ISPC_TARGET_AVX2");
break;
default:
FATAL("Unhandled target ISA in preprocessor symbol definition");
}
if (g->target.is32Bit)
opts.addMacroDef("ISPC_POINTER_SIZE=32");
else
opts.addMacroDef("ISPC_POINTER_SIZE=64");
opts.addMacroDef("ISPC_MAJOR_VERSION=1");
opts.addMacroDef("ISPC_MINOR_VERSION=1");
@@ -1317,7 +1326,7 @@ lExtractAndRewriteGlobals(llvm::Module *module,
Symbol *sym =
m->symbolTable->LookupVariable(gv->getName().str().c_str());
assert(sym != NULL);
Assert(sym != NULL);
globals->push_back(RewriteGlobalInfo(gv, init, sym->pos));
}
}
@@ -1366,9 +1375,9 @@ lAddExtractedGlobals(llvm::Module *module,
if (globals[j].size() > 0) {
// There should be the same number of globals in the other
// vectors, in the same order.
assert(globals[firstActive].size() == globals[j].size());
Assert(globals[firstActive].size() == globals[j].size());
llvm::GlobalVariable *gv2 = globals[j][i].gv;
assert(gv2->getName() == gv->getName());
Assert(gv2->getName() == gv->getName());
// It is possible that the types may not match, though--for
// example, this happens with varying globals if we compile
@@ -1422,7 +1431,7 @@ lCreateDispatchFunction(llvm::Module *module, llvm::Function *setISAFunc,
// Grab the type of the function as well.
if (ftype != NULL)
assert(ftype == funcs.func[i]->getFunctionType());
Assert(ftype == funcs.func[i]->getFunctionType());
else
ftype = funcs.func[i]->getFunctionType();
@@ -1510,7 +1519,7 @@ lCreateDispatchFunction(llvm::Module *module, llvm::Function *setISAFunc,
// or some such, but we don't want to start imposing too much of a
// runtime library requirement either...
llvm::Function *abortFunc = module->getFunction("abort");
assert(abortFunc);
Assert(abortFunc);
llvm::CallInst::Create(abortFunc, "", bblock);
// Return an undef value from the function here; we won't get to this
@@ -1542,10 +1551,10 @@ lCreateDispatchModule(std::map<std::string, FunctionTargetVariants> &functions)
// Get pointers to things we need below
llvm::Function *setFunc = module->getFunction("__set_system_isa");
assert(setFunc != NULL);
Assert(setFunc != NULL);
llvm::Value *systemBestISAPtr =
module->getGlobalVariable("__system_best_isa", true);
assert(systemBestISAPtr != NULL);
Assert(systemBestISAPtr != NULL);
// For each exported function, create the dispatch function
std::map<std::string, FunctionTargetVariants>::iterator iter;
@@ -1591,7 +1600,7 @@ Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu,
else {
// The user supplied multiple targets
std::vector<std::string> targets = lExtractTargets(target);
assert(targets.size() > 1);
Assert(targets.size() > 1);
if (outFileName != NULL && strcmp(outFileName, "-") == 0) {
Error(SourcePos(), "Multi-target compilation can't generate output "
@@ -1668,7 +1677,7 @@ Module::CompileAndOutput(const char *srcFile, const char *arch, const char *cpu,
int i = 1;
while (i < Target::NUM_ISAS && firstTargetMachine == NULL)
firstTargetMachine = targetMachines[i++];
assert(firstTargetMachine != NULL);
Assert(firstTargetMachine != NULL);
if (outFileName != NULL) {
if (outputType == Bitcode)

opt.cpp

@@ -152,19 +152,19 @@ lGetSourcePosFromMetadata(const llvm::Instruction *inst, SourcePos *pos) {
// All of these asserts are things that FunctionEmitContext::addGSMetadata() is
// expected to have done in its operation
llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(filename->getOperand(0));
assert(str);
Assert(str);
llvm::ConstantInt *first_lnum =
llvm::dyn_cast<llvm::ConstantInt>(first_line->getOperand(0));
assert(first_lnum);
Assert(first_lnum);
llvm::ConstantInt *first_colnum =
llvm::dyn_cast<llvm::ConstantInt>(first_column->getOperand(0));
assert(first_column);
Assert(first_column);
llvm::ConstantInt *last_lnum =
llvm::dyn_cast<llvm::ConstantInt>(last_line->getOperand(0));
assert(last_lnum);
Assert(last_lnum);
llvm::ConstantInt *last_colnum =
llvm::dyn_cast<llvm::ConstantInt>(last_column->getOperand(0));
assert(last_column);
Assert(last_column);
*pos = SourcePos(str->getString().data(), (int)first_lnum->getZExtValue(),
(int)first_colnum->getZExtValue(), (int)last_lnum->getZExtValue(),
@@ -250,7 +250,7 @@ Optimize(llvm::Module *module, int optLevel) {
optPM.add(llvm::createReassociatePass());
optPM.add(llvm::createConstantPropagationPass());
if (!g->opt.disableMaskedStoreOptimizations) {
if (!g->opt.disableMaskAllOnOptimizations) {
optPM.add(CreateIntrinsicsOptPass());
optPM.add(CreateMaskedStoreOptPass());
}
@@ -287,7 +287,7 @@ Optimize(llvm::Module *module, int optLevel) {
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createTailCallEliminationPass());
if (!g->opt.disableMaskedStoreOptimizations) {
if (!g->opt.disableMaskAllOnOptimizations) {
optPM.add(CreateIntrinsicsOptPass());
optPM.add(CreateMaskedStoreOptPass());
}
@@ -334,12 +334,16 @@ Optimize(llvm::Module *module, int optLevel) {
builder.DisableUnrollLoops = true;
builder.populateFunctionPassManager(funcPM);
builder.populateModulePassManager(optPM);
optPM.add(CreateIsCompileTimeConstantPass(true));
optPM.add(CreateIsCompileTimeConstantPass(false));
optPM.add(CreateIntrinsicsOptPass());
builder.populateLTOPassManager(optPM, true /* internalize */,
true /* inline once again */);
optPM.add(CreateIsCompileTimeConstantPass(true));
optPM.add(CreateIntrinsicsOptPass());
builder.populateModulePassManager(optPM);
#endif
optPM.add(CreateMakeInternalFuncsStaticPass());
@@ -436,7 +440,7 @@ IntrinsicsOpt::IntrinsicsOpt()
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::Function *avxMovmsk =
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256);
assert(avxMovmsk != NULL);
Assert(avxMovmsk != NULL);
maskInstructions.push_back(avxMovmsk);
#endif
@@ -482,7 +486,7 @@ lGetMask(llvm::Value *factor) {
else {
// Otherwise get it as an int
llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
assert(ci != NULL); // vs return -1 if NULL?
Assert(ci != NULL); // vs return -1 if NULL?
intMaskValue = ci->getValue();
}
// Is the high-bit set? If so, OR in the appropriate bit in
@@ -505,7 +509,7 @@ lGetMask(llvm::Value *factor) {
factor = c;
}
// else we should be able to handle it above...
assert(!llvm::isa<llvm::Constant>(factor));
Assert(!llvm::isa<llvm::Constant>(factor));
#endif
return -1;
}
@@ -545,8 +549,8 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskstore_ps_256);
llvm::Function *avxMaskedStore64 =
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskstore_pd_256);
assert(avxMaskedLoad32 != NULL && avxMaskedStore32 != NULL);
assert(avxMaskedLoad64 != NULL && avxMaskedStore64 != NULL);
Assert(avxMaskedLoad32 != NULL && avxMaskedStore32 != NULL);
Assert(avxMaskedLoad64 != NULL && avxMaskedStore64 != NULL);
#endif
bool modifiedAny = false;
@@ -627,7 +631,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
if (mask == 0) {
// nothing being loaded, replace with undef value
llvm::Type *returnType = callInst->getType();
assert(llvm::isa<llvm::VectorType>(returnType));
Assert(llvm::isa<llvm::VectorType>(returnType));
llvm::Value *undefValue = llvm::UndefValue::get(returnType);
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
iter, undefValue);
@@ -637,7 +641,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
else if (mask == 0xff) {
// all lanes active; replace with a regular load
llvm::Type *returnType = callInst->getType();
assert(llvm::isa<llvm::VectorType>(returnType));
Assert(llvm::isa<llvm::VectorType>(returnType));
// cast the i8 * to the appropriate type
llvm::Value *castPtr =
new llvm::BitCastInst(callInst->getArgOperand(0),
@@ -751,7 +755,7 @@ llvm::RegisterPass<GatherScatterFlattenOpt> gsf("gs-flatten", "Gather/Scatter Fl
static int64_t
lGetIntValue(llvm::Value *offset) {
llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
assert(intOffset && (intOffset->getBitWidth() == 32 ||
Assert(intOffset && (intOffset->getBitWidth() == 32 ||
intOffset->getBitWidth() == 64));
return intOffset->getSExtValue();
}
@@ -776,15 +780,15 @@ lFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
while (ie != NULL) {
int64_t iOffset = lGetIntValue(ie->getOperand(2));
assert(iOffset >= 0 && iOffset < vectorWidth);
assert(elements[iOffset] == NULL);
Assert(iOffset >= 0 && iOffset < vectorWidth);
Assert(elements[iOffset] == NULL);
elements[iOffset] = ie->getOperand(1);
llvm::Value *insertBase = ie->getOperand(0);
ie = llvm::dyn_cast<llvm::InsertElementInst>(insertBase);
if (ie == NULL)
assert(llvm::isa<llvm::UndefValue>(insertBase));
Assert(llvm::isa<llvm::UndefValue>(insertBase));
}
}
@@ -950,7 +954,7 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets) {
if (elementBase == NULL)
return NULL;
assert(delta[i] != NULL);
Assert(delta[i] != NULL);
if (base == NULL)
// The first time we've found a base pointer
base = elementBase;
@@ -960,10 +964,14 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets) {
return NULL;
}
assert(base != NULL);
Assert(base != NULL);
#ifdef LLVM_2_9
*offsets = llvm::ConstantVector::get(delta);
#else
llvm::ArrayRef<llvm::Constant *> deltas(&delta[0],
&delta[elements.size()]);
*offsets = llvm::ConstantVector::get(deltas);
#endif
return base;
}
@@ -1023,7 +1031,7 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
};
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
for (int i = 0; i < numGSFuncs; ++i)
assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL &&
Assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL &&
gsFuncs[i].baseOffsets32Func != NULL);
bool modifiedAny = false;
@@ -1169,7 +1177,7 @@ struct MSInfo {
MSInfo(const char *name, const int a)
: align(a) {
func = m->module->getFunction(name);
assert(func != NULL);
Assert(func != NULL);
}
llvm::Function *func;
const int align;
@@ -1313,7 +1321,7 @@ struct LMSInfo {
pseudoFunc = m->module->getFunction(pname);
blendFunc = m->module->getFunction(bname);
maskedStoreFunc = m->module->getFunction(msname);
assert(pseudoFunc != NULL && blendFunc != NULL &&
Assert(pseudoFunc != NULL && blendFunc != NULL &&
maskedStoreFunc != NULL);
}
llvm::Function *pseudoFunc;
@@ -1447,7 +1455,7 @@ lValuesAreEqual(llvm::Value *v0, llvm::Value *v1,
if (v0 == v1)
return true;
assert(seenPhi0.size() == seenPhi1.size());
Assert(seenPhi0.size() == seenPhi1.size());
for (unsigned int i = 0; i < seenPhi0.size(); ++i)
if (v0 == seenPhi0[i] && v1 == seenPhi1[i])
return true;
@@ -1477,7 +1485,7 @@ lValuesAreEqual(llvm::Value *v0, llvm::Value *v1,
// then we're good.
bool anyFailure = false;
for (unsigned int i = 0; i < numIncoming; ++i) {
assert(phi0->getIncomingBlock(i) == phi1->getIncomingBlock(i));
Assert(phi0->getIncomingBlock(i) == phi1->getIncomingBlock(i));
if (!lValuesAreEqual(phi0->getIncomingValue(i),
phi1->getIncomingValue(i), seenPhi0, seenPhi1)) {
anyFailure = true;
@@ -1531,7 +1539,7 @@ lVectorValuesAllEqual(llvm::Value *v, int vectorLength,
// probably to just ignore undef elements and return true if
// all of the other ones are equal, but it'd be nice to have
// some test cases to verify this.
assert(elements[i] != NULL && elements[i+1] != NULL);
Assert(elements[i] != NULL && elements[i+1] != NULL);
std::vector<llvm::PHINode *> seenPhi0;
std::vector<llvm::PHINode *> seenPhi1;
@@ -1565,7 +1573,7 @@ lVectorValuesAllEqual(llvm::Value *v, int vectorLength,
return true;
}
assert(!llvm::isa<llvm::Constant>(v));
Assert(!llvm::isa<llvm::Constant>(v));
if (llvm::isa<llvm::CallInst>(v) || llvm::isa<llvm::LoadInst>(v) ||
!llvm::isa<llvm::Instruction>(v))
@@ -1610,7 +1618,7 @@ lVectorIsLinearConstantInts(llvm::ConstantVector *cv, int vectorLength,
// Flatten the vector out into the elements array
llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
cv->getVectorElements(elements);
assert((int)elements.size() == vectorLength);
Assert((int)elements.size() == vectorLength);
llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[0]);
if (ci == NULL)
@@ -1665,7 +1673,8 @@ lCheckMulForLinear(llvm::Value *op0, llvm::Value *op1, int vectorLength,
// Check to see if the other operand is a linear vector with stride
// given by stride/splatVal.
return lVectorIsLinear(op1, vectorLength, stride / splatVal, seenPhis);
return lVectorIsLinear(op1, vectorLength, (int)(stride / splatVal),
seenPhis);
}
@@ -1784,7 +1793,7 @@ struct GatherImpInfo {
loadBroadcastFunc = m->module->getFunction(lbName);
loadMaskedFunc = m->module->getFunction(lmName);
assert(pseudoFunc != NULL && loadBroadcastFunc != NULL &&
Assert(pseudoFunc != NULL && loadBroadcastFunc != NULL &&
loadMaskedFunc != NULL);
}
llvm::Function *pseudoFunc;
@@ -1801,7 +1810,7 @@ struct ScatterImpInfo {
pseudoFunc = m->module->getFunction(pName);
maskedStoreFunc = m->module->getFunction(msName);
vecPtrType = vpt;
assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
Assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
}
llvm::Function *pseudoFunc;
llvm::Function *maskedStoreFunc;
@@ -1880,7 +1889,7 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
SourcePos pos;
bool ok = lGetSourcePosFromMetadata(callInst, &pos);
assert(ok);
Assert(ok);
llvm::Value *base = callInst->getArgOperand(0);
llvm::Value *offsets = callInst->getArgOperand(1);
@@ -2058,7 +2067,7 @@ struct LowerGSInfo {
: isGather(ig) {
pseudoFunc = m->module->getFunction(pName);
actualFunc = m->module->getFunction(aName);
assert(pseudoFunc != NULL && actualFunc != NULL);
Assert(pseudoFunc != NULL && actualFunc != NULL);
}
llvm::Function *pseudoFunc;
llvm::Function *actualFunc;
@@ -2135,7 +2144,7 @@ LowerGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// instruction so that we can issue PerformanceWarning()s below.
SourcePos pos;
bool ok = lGetSourcePosFromMetadata(callInst, &pos);
assert(ok);
Assert(ok);
callInst->setCalledFunction(info->actualFunc);
if (info->isGather)
@@ -2217,9 +2226,11 @@ IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// not a __is_compile_time_constant_* function
continue;
// This optimization pass can be disabled with the (poorly named)
// disableGatherScatterFlattening option.
if (g->opt.disableGatherScatterFlattening) {
// This optimization pass can be disabled with both the (poorly
// named) disableGatherScatterFlattening option and
// disableMaskAllOnOptimizations.
if (g->opt.disableGatherScatterFlattening ||
g->opt.disableMaskAllOnOptimizations) {
llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
modifiedAny = true;
goto restart;


@@ -134,9 +134,8 @@ struct ForeachDimension {
%}
%union {
int32_t int32Val;
double floatVal;
int64_t int64Val;
int64_t intVal;
float floatVal;
std::string *stringVal;
const char *constCharPtr;
@@ -226,7 +225,7 @@ struct ForeachDimension {
%type <stringVal> string_constant
%type <constCharPtr> struct_or_union_name enum_identifier
%type <int32Val> int_constant soa_width_specifier
%type <intVal> int_constant soa_width_specifier
%type <foreachDimension> foreach_dimension_specifier
%type <foreachDimensionList> foreach_dimension_list
@@ -259,16 +258,16 @@ primary_expression
}
}
| TOKEN_INT32_CONSTANT {
$$ = new ConstExpr(AtomicType::UniformConstInt32, yylval.int32Val, @1);
$$ = new ConstExpr(AtomicType::UniformConstInt32, (int32_t)yylval.intVal, @1);
}
| TOKEN_UINT32_CONSTANT {
$$ = new ConstExpr(AtomicType::UniformConstUInt32, (uint32_t)yylval.int32Val, @1);
$$ = new ConstExpr(AtomicType::UniformConstUInt32, (uint32_t)yylval.intVal, @1);
}
| TOKEN_INT64_CONSTANT {
$$ = new ConstExpr(AtomicType::UniformConstInt64, yylval.int64Val, @1);
$$ = new ConstExpr(AtomicType::UniformConstInt64, (int64_t)yylval.intVal, @1);
}
| TOKEN_UINT64_CONSTANT {
$$ = new ConstExpr(AtomicType::UniformConstUInt64, (uint64_t)yylval.int64Val, @1);
$$ = new ConstExpr(AtomicType::UniformConstUInt64, (uint64_t)yylval.intVal, @1);
}
| TOKEN_FLOAT_CONSTANT {
$$ = new ConstExpr(AtomicType::UniformConstFloat, (float)yylval.floatVal, @1);
@@ -328,7 +327,7 @@ argument_expression_list
| argument_expression_list ',' assignment_expression
{
ExprList *argList = dynamic_cast<ExprList *>($1);
assert(argList != NULL);
Assert(argList != NULL);
argList->exprs.push_back($3);
argList->pos = Union(argList->pos, @3);
$$ = argList;
@@ -545,7 +544,7 @@ declaration_specifiers
| soa_width_specifier
{
DeclSpecs *ds = new DeclSpecs;
ds->soaWidth = $1;
ds->soaWidth = (int32_t)$1;
$$ = ds;
}
| soa_width_specifier declaration_specifiers
@@ -555,7 +554,7 @@ declaration_specifiers
if (ds->soaWidth != 0)
Error(@1, "soa<> qualifier supplied multiple times in declaration.");
else
ds->soaWidth = $1;
ds->soaWidth = (int32_t)$1;
}
$$ = ds;
}
@@ -566,7 +565,7 @@ declaration_specifiers
| type_specifier '<' int_constant '>'
{
DeclSpecs *ds = new DeclSpecs($1);
ds->vectorSize = $3;
ds->vectorSize = (int32_t)$3;
$$ = ds;
}
| type_specifier declaration_specifiers
@@ -630,7 +629,7 @@ type_specifier
: atomic_var_type_specifier { $$ = $1; }
| TOKEN_TYPE_NAME
{ const Type *t = m->symbolTable->LookupType(yytext);
assert(t != NULL);
Assert(t != NULL);
$$ = t;
}
| struct_or_union_specifier { $$ = $1; }
@@ -652,7 +651,7 @@ short_vec_specifier
: atomic_var_type_specifier '<' int_constant '>'
{
Type* vt =
new VectorType($1, $3);
new VectorType($1, (int32_t)$3);
$$ = vt;
}
;
@@ -930,7 +929,7 @@ declarator
;
int_constant
: TOKEN_INT32_CONSTANT { $$ = yylval.int32Val; }
: TOKEN_INT32_CONSTANT { $$ = yylval.intVal; }
;
direct_declarator
@@ -948,10 +947,16 @@ direct_declarator
{
int size;
if ($1 != NULL && lGetConstantInt($3, &size, @3, "Array dimension")) {
Declarator *d = new Declarator(DK_ARRAY, Union(@1, @4));
d->arraySize = size;
d->child = $1;
$$ = d;
if (size < 0) {
Error(@3, "Array dimension must be non-negative.");
$$ = NULL;
}
else {
Declarator *d = new Declarator(DK_ARRAY, Union(@1, @4));
d->arraySize = size;
d->child = $1;
$$ = d;
}
}
else
$$ = NULL;
@@ -1142,10 +1147,16 @@ direct_abstract_declarator
| '[' constant_expression ']'
{
int size;
if (lGetConstantInt($2, &size, @2, "Array dimension")) {
Declarator *d = new Declarator(DK_ARRAY, Union(@1, @3));
d->arraySize = size;
$$ = d;
if ($2 != NULL && lGetConstantInt($2, &size, @2, "Array dimension")) {
if (size < 0) {
Error(@2, "Array dimension must be non-negative.");
$$ = NULL;
}
else {
Declarator *d = new Declarator(DK_ARRAY, Union(@1, @3));
d->arraySize = size;
$$ = d;
}
}
else
$$ = NULL;
@@ -1160,11 +1171,17 @@ direct_abstract_declarator
| direct_abstract_declarator '[' constant_expression ']'
{
int size;
if (lGetConstantInt($3, &size, @3, "Array dimension")) {
Declarator *d = new Declarator(DK_ARRAY, Union(@1, @4));
d->arraySize = size;
d->child = $1;
$$ = d;
if ($3 != NULL && lGetConstantInt($3, &size, @3, "Array dimension")) {
if (size < 0) {
Error(@3, "Array dimension must be non-negative.");
$$ = NULL;
}
else {
Declarator *d = new Declarator(DK_ARRAY, Union(@1, @4));
d->arraySize = size;
d->child = $1;
$$ = d;
}
}
else
$$ = NULL;
@@ -1206,7 +1223,7 @@ initializer_list
$$ = NULL;
else {
ExprList *exprList = dynamic_cast<ExprList *>($1);
assert(exprList);
Assert(exprList);
exprList->exprs.push_back($3);
exprList->pos = Union(exprList->pos, @3);
$$ = exprList;
@@ -1537,7 +1554,7 @@ lAddDeclaration(DeclSpecs *ds, Declarator *decl) {
const FunctionType *ft = dynamic_cast<const FunctionType *>(t);
if (ft != NULL) {
Symbol *funSym = decl->GetSymbol();
assert(funSym != NULL);
Assert(funSym != NULL);
funSym->type = ft;
funSym->storageClass = ds->storageClass;
@@ -1561,19 +1578,21 @@ lAddFunctionParams(Declarator *decl) {
// walk down to the declarator for the function itself
while (decl->kind != DK_FUNCTION && decl->child != NULL)
decl = decl->child;
assert(decl->kind == DK_FUNCTION);
Assert(decl->kind == DK_FUNCTION);
// now loop over its parameters and add them to the symbol table
for (unsigned int i = 0; i < decl->functionParams.size(); ++i) {
Declaration *pdecl = decl->functionParams[i];
if (pdecl == NULL)
if (pdecl == NULL || pdecl->declarators.size() == 0)
// zero size declarators array corresponds to an anonymous
// parameter
continue;
assert(pdecl->declarators.size() == 1);
Assert(pdecl->declarators.size() == 1);
Symbol *sym = pdecl->declarators[0]->GetSymbol();
#ifndef NDEBUG
bool ok = m->symbolTable->AddVariable(sym);
if (ok == false)
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
#else
m->symbolTable->AddVariable(sym);
#endif
@@ -1640,7 +1659,7 @@ lGetStorageClassString(StorageClass sc) {
case SC_EXTERN_C:
return "extern \"C\"";
default:
assert(!"logic error in lGetStorageClassString()");
Assert(!"logic error in lGetStorageClassString()");
return "";
}
}
@@ -1673,6 +1692,10 @@ lGetConstantInt(Expr *expr, int *value, SourcePos pos, const char *usage) {
Error(pos, "%s must be a compile-time integer constant.", usage);
return false;
}
if ((int64_t)((int32_t)ci->getSExtValue()) != ci->getSExtValue()) {
Error(pos, "%s must be representable with a 32-bit integer.", usage);
return false;
}
*value = (int)ci->getZExtValue();
return true;
}
@@ -1720,7 +1743,7 @@ lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
if (enums[i]->constValue != NULL) {
/* Already has a value, so first update nextVal with it. */
int count = enums[i]->constValue->AsUInt32(&nextVal);
assert(count == 1);
Assert(count == 1);
++nextVal;
/* When the source file was being parsed, the ConstExpr for any
@@ -1733,7 +1756,7 @@ lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
false, enums[i]->pos);
castExpr = castExpr->Optimize();
enums[i]->constValue = dynamic_cast<ConstExpr *>(castExpr);
assert(enums[i]->constValue != NULL);
Assert(enums[i]->constValue != NULL);
}
else {
enums[i]->constValue = new ConstExpr(enumType, nextVal++,


@@ -2,9 +2,6 @@
# test-running driver for ispc
# TODO: windows support (mostly should be calling CL.exe rather than gcc
# for static linking?)
from optparse import OptionParser
import multiprocessing
from ctypes import c_int
@@ -23,9 +20,6 @@ import platform
parser = OptionParser()
parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests",
default=False, action="store_true")
parser.add_option("-s", "--static-exe", dest="static_exe",
help="Create and run a regular executable for each test (rather than using the LLVM JIT).",
default=False, action="store_true")
parser.add_option('-t', '--target', dest='target',
help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2)',
default="sse4")
@@ -52,7 +46,6 @@ if (options.random):
# counter
total_tests = 0
finished_tests_counter = multiprocessing.Value(c_int)
# We'd like to use the Lock class from the multiprocessing package to
# serialize accesses to finished_tests_counter. Unfortunately, the version of
@@ -60,7 +53,10 @@ finished_tests_counter = multiprocessing.Value(c_int)
# http://bugs.python.org/issue5261. Therefore, we use the (deprecated but
# still available) mutex class.
#finished_tests_counter_lock = multiprocessing.Lock()
finished_tests_mutex = mutex.mutex()
if not (platform.system() == 'Windows' or
'CYGWIN_NT' in platform.system()):
finished_tests_mutex = mutex.mutex()
finished_tests_counter = multiprocessing.Value(c_int)
# utility routine to print an update on the number of tests that have been
# finished. Should be called with the mutex (or lock) held.
@@ -79,21 +75,127 @@ fnull = open(os.devnull, 'w')
# run the commands in cmd_list
def run_cmds(cmd_list, filename, expect_failure):
output = ""
for cmd in cmd_list:
if expect_failure:
failed = (subprocess.call(cmd, shell = True, stdout = fnull, stderr = fnull) != 0)
else:
failed = (os.system(cmd) != 0)
sp = subprocess.Popen(shlex.split(cmd), stdin=None,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
out = sp.communicate()
output += out[0]
output += out[1]
failed = (sp.returncode != 0)
if failed:
break
surprise = ((expect_failure and not failed) or (not expect_failure and failed))
surprise = ((expect_failure and not failed) or
(not expect_failure and failed))
if surprise == True:
print "Test %s %s " % \
(filename, "unexpectedly passed" if expect_failure else "failed")
print "Test %s %s (return code %d) " % \
(filename, "unexpectedly passed" if expect_failure else "failed",
sp.returncode)
if output != "":
print "%s" % output
return surprise
def run_test(filename):
# is this a test to make sure an error is issued?
error_count = 0
want_error = (filename.find("tests_errors") != -1)
if want_error == True:
ispc_cmd = "ispc --werror --nowrap %s --arch=%s --target=%s" % \
(filename, options.arch, options.target)
sp = subprocess.Popen(shlex.split(ispc_cmd), stdin=None,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
out = sp.communicate()
output = ""
output += out[0]
output += out[1]
got_error = (sp.returncode != 0)
# figure out the error message we're expecting
file = open(filename, 'r')
firstline = file.readline()
firstline = string.replace(firstline, "//", "")
firstline = string.lstrip(firstline)
firstline = string.rstrip(firstline)
file.close()
if (output.find(firstline) == -1):
print "OUT %s" % filename
print "Didnt see expected error message %s from test %s.\nActual output:\n%s" % \
(firstline, filename, output)
error_count += 1
elif got_error == False:
print "Unexpectedly no errors issued from test %s" % filename
error_count += 1
else:
# do we expect this test to fail?
should_fail = (filename.find("failing_") != -1)
# We need to figure out the signature of the test
# function that this test has.
sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3,
"f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }
file = open(filename, 'r')
match = -1
for line in file:
# look for lines with 'export'...
if line.find("export") == -1:
continue
# one of them should have a function with one of the
# declarations in sig2def
for pattern, ident in sig2def.items():
if line.find(pattern) != -1:
match = ident
break
file.close()
if match == -1:
print "Fatal error: unable to find function signature " + \
"in test %s" % filename
error_count += 1
else:
if (platform.system() == 'Windows' or
'CYGWIN_NT' in platform.system()):
obj_name = "%s.obj" % filename
exe_name = "%s.exe" % filename
cc_cmd = "cl /nologo test_static.cpp /DTEST_SIG=%d %s.obj /Fe%s" % \
(match, filename, exe_name)
if should_fail:
cc_cmd += " /DEXPECT_FAILURE"
else:
obj_name = "%s.o" % filename
exe_name = "%s.run" % filename
if options.arch == 'x86':
gcc_arch = '-m32'
else:
gcc_arch = '-m64'
cc_cmd = "g++ %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
(gcc_arch, match, filename, exe_name)
if platform.system() == 'Darwin':
cc_cmd += ' -Wl,-no_pie'
if should_fail:
cc_cmd += " -DEXPECT_FAILURE"
ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
(filename, obj_name, options.arch, options.target)
if options.no_opt:
ispc_cmd += " -O0"
# compile the ispc code, make the executable, and run it...
error_count += run_cmds([ispc_cmd, cc_cmd, exe_name], \
filename, should_fail)
# clean up after running the test
try:
os.unlink(exe_name)
os.unlink(obj_name)
except:
None
return error_count
# pull tests to run from the given queue and run them. Multiple copies of
# this function will be running in parallel across all of the CPU cores of
# the system.
@@ -104,100 +206,7 @@ def run_tasks_from_queue(queue):
if (filename == 'STOP'):
sys.exit(error_count)
# is this a test to make sure an error is issued?
want_error = (filename.find("tests_errors") != -1)
if want_error == True:
ispc_cmd = "ispc --werror --nowrap %s --arch=%s --target=%s" % \
(filename, options.arch, options.target)
sp = subprocess.Popen(shlex.split(ispc_cmd), stdin=None, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output = sp.communicate()[1]
got_error = (sp.returncode != 0)
# figure out the error message we're expecting
file = open(filename, 'r')
firstline = file.readline()
firstline = string.replace(firstline, "//", "")
firstline = string.lstrip(firstline)
firstline = string.rstrip(firstline)
file.close()
if (output.find(firstline) == -1):
print "Didn't see expected error message \"%s\" from test %s.\nActual outout: %s" % \
(firstline, filename, output)
error_count += 1
elif got_error == False:
print "Unexpectedly no errors issued from test %s" % filename
error_count += 1
continue
# do we expect this test to fail?
should_fail = (filename.find("failing_") != -1)
if options.static_exe == True:
# if the user wants us to build a static executable to run for
# this test, we need to figure out the signature of the test
# function that this test has.
sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3,
"f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }
file = open(filename, 'r')
match = -1
for line in file:
# look for lines with 'export'...
if line.find("export") == -1:
continue
# one of them should have a function with one of the
# declarations in sig2def
for pattern, ident in sig2def.items():
if line.find(pattern) != -1:
match = ident
break
file.close()
if match == -1:
print "Fatal error: unable to find function signature in test %s" % filename
error_count += 1
else:
obj_name = "%s.o" % filename
exe_name = "%s.run" % filename
ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
(filename, obj_name, options.arch, options.target)
if options.no_opt:
ispc_cmd += " -O0"
if options.arch == 'x86':
gcc_arch = '-m32'
else:
gcc_arch = '-m64'
gcc_cmd = "g++ %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
(gcc_arch, match, filename, exe_name)
if platform.system() == 'Darwin':
gcc_cmd += ' -Wl,-no_pie'
if should_fail:
gcc_cmd += " -DEXPECT_FAILURE"
# compile the ispc code, make the executable, and run it...
error_count += run_cmds([ispc_cmd, gcc_cmd, exe_name], filename, should_fail)
# clean up after running the test
try:
os.unlink(exe_name)
os.unlink(obj_name)
except:
None
else:
# otherwise we'll use ispc_test + the LLVM JIT to run the test
bitcode_file = "%s.bc" % filename
compile_cmd = "ispc --woff --emit-llvm %s --target=%s -o %s" % \
(filename, options.target, bitcode_file)
if options.no_opt:
compile_cmd += " -O0"
test_cmd = "ispc_test %s" % bitcode_file
error_count += run_cmds([compile_cmd, test_cmd], filename, should_fail)
try:
os.unlink(bitcode_file)
except:
None
error_count += run_test(filename)
# If not for http://bugs.python.org/issue5261 on OSX, we'd like to do this:
#with finished_tests_counter_lock:
@@ -214,34 +223,58 @@ def sigint(signum, frame):
sys.exit(1)
if __name__ == '__main__':
nthreads = multiprocessing.cpu_count()
total_tests = len(files)
print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)
# put each of the test filenames into a queue
q = multiprocessing.Queue()
for fn in files:
q.put(fn)
for x in range(nthreads):
q.put('STOP')
# need to catch sigint so that we can terminate all of the tasks if
# we're interrupted
signal.signal(signal.SIGINT, sigint)
# launch jobs to run tests
for x in range(nthreads):
t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,))
task_threads.append(t)
t.start()
# wait for them to all finish and then return the number that failed
# (i.e. return 0 if all is ok)
error_count = 0
for t in task_threads:
t.join()
error_count += t.exitcode
print
if (platform.system() == 'Windows' or
'CYGWIN_NT' in platform.system()):
# cl.exe gets itself all confused if we have multiple instances of
# it running concurrently and operating on the same .cpp file
# (test_static.cpp), even if we are generating a differently-named
# exe in the end. So run serially. :-(
nthreads = 1
num_done = 0
print "Running %d tests." % (total_tests)
for fn in files:
error_count += run_test(fn)
num_done += 1
progress_str = " Done %d / %d [%s]" % (num_done, total_tests, fn)
# spaces to clear out detritus from previous printing...
for x in range(30):
progress_str += ' '
progress_str += '\r'
sys.stdout.write(progress_str)
sys.stdout.flush()
else:
nthreads = multiprocessing.cpu_count()
print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)
# put each of the test filenames into a queue
q = multiprocessing.Queue()
for fn in files:
q.put(fn)
for x in range(nthreads):
q.put('STOP')
# need to catch sigint so that we can terminate all of the tasks if
# we're interrupted
signal.signal(signal.SIGINT, sigint)
# launch jobs to run tests
for x in range(nthreads):
t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,))
task_threads.append(t)
t.start()
# wait for them to all finish and then return the number that failed
# (i.e. return 0 if all is ok)
error_count = 0
for t in task_threads:
t.join()
error_count += t.exitcode
print
if error_count > 0:
print "%d / %d tests FAILED!" % (error_count, total_tests)
sys.exit(error_count)
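
The parallel path above hands test filenames to the workers through a multiprocessing.Queue, pushes one 'STOP' sentinel per worker, and has each worker report its failure count through its process exit code. A self-contained sketch of that pattern (placeholder filenames and a stubbed-out run_test; not the actual script):

import multiprocessing
import sys

def worker(q):
    errors = 0
    while True:
        item = q.get()
        if item == 'STOP':        # one sentinel per worker ends its loop
            break
        errors += 0               # placeholder for: errors += run_test(item)
    sys.exit(errors)              # exit code carries this worker's failure count

if __name__ == '__main__':
    files = ['a.ispc', 'b.ispc', 'c.ispc']     # placeholder test filenames
    nthreads = multiprocessing.cpu_count()
    q = multiprocessing.Queue()
    for fn in files:
        q.put(fn)
    for _ in range(nthreads):
        q.put('STOP')
    procs = [multiprocessing.Process(target=worker, args=(q,))
             for _ in range(nthreads)]
    for p in procs:
        p.start()
    error_count = 0
    for p in procs:
        p.join()
        error_count += p.exitcode
    print("total failures: %d" % error_count)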


@@ -1,95 +0,0 @@
#!/bin/bash
surprises=0
verbose=false
number=$(ls -1 tests/*.ispc|wc -l)
counter=1
target=sse4
while getopts ":vt:h" opt;do
case $opt in
v) verbose=true
;;
t) target=$OPTARG
;;
h) cat <<EOF
usage: run_tests.sh [-v] [-t target] [filenames]
-v # verbose output
-t # specify compilation target (SSE4 is the default).
[filenames] # (optional) files to run through testing infrastructure
# if none are provided, all in tests/ will be run.
EOF
exit 1
esac
done
ISPC_ARCH=x86-64
if [[ $OS == "Windows_NT" ]]; then
ISPC_ARCH=x86
fi
ISPC_ARGS="--target=$target --arch=$ISPC_ARCH -O2 --woff"
shift $(( $OPTIND - 1 ))
if [[ "$1" > 0 ]]; then
while [[ "$1" > 0 ]]; do
i=$1
shift
echo Running test $i
bc=${i%%ispc}bc
ispc $ISPC_ARGS $i -o $bc --emit-llvm
if [[ $? != 0 ]]; then
surprises=1
echo Test $i FAILED ispc compile
echo
else
ispc_test $bc
if [[ $? != 0 ]]; then
surprises=1
echo Test $i FAILED ispc_test
echo
fi
fi
/bin/rm -f $bc
done
else
echo Running all correctness tests
for i in tests/*.ispc; do
if $verbose; then
echo -en "Running test $counter of $number.\r"
fi
(( counter++ ))
bc=${i%%ispc}bc
ispc $ISPC_ARGS $i -o $bc --emit-llvm
if [[ $? != 0 ]]; then
surprises=1
echo Test $i FAILED ispc compile
echo
else
ispc_test $bc
if [[ $? != 0 ]]; then
surprises=1
echo Test $i FAILED ispc_test
echo
fi
fi
/bin/rm -f $bc
done
echo -e "\nRunning failing tests"
for i in failing_tests/*.ispc; do
(ispc -O2 $i -woff -o - --emit-llvm | ispc_test -) 2>/dev/null 1>/dev/null
if [[ $? == 0 ]]; then
surprises=1
echo Test $i UNEXPECTEDLY PASSED
echo
fi
done
fi
if [[ $surprises == 0 ]]; then
echo No surprises.
fi
exit $surprises


@@ -135,7 +135,7 @@ lPossiblyResolveFunctionOverloads(Expr *expr, const Type *type) {
for (int i = 0; i < funcType->GetNumParameters(); ++i)
paramTypes.push_back(funcType->GetParameterType(i));
if (fse->ResolveOverloads(paramTypes) == false)
if (fse->ResolveOverloads(expr->pos, paramTypes) == false)
return false;
}
return true;
@@ -287,7 +287,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
for (unsigned int i = 0; i < vars.size(); ++i) {
Symbol *sym = vars[i].sym;
assert(sym != NULL);
Assert(sym != NULL);
if (sym->type == NULL)
continue;
Expr *initExpr = vars[i].init;
@@ -324,7 +324,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
LLVM_TYPE_CONST llvm::Type *llvmType = sym->type->LLVMType(g->ctx);
if (llvmType == NULL) {
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
return;
}
@@ -491,7 +491,6 @@ IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool checkCoherence, SourcePos p)
: Stmt(p), test(t), trueStmts(ts), falseStmts(fs),
doAllCheck(checkCoherence &&
!g->opt.disableCoherentControlFlow) {
// have to wait until after type checking to initialize doAnyCheck.
}
@@ -646,12 +645,12 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
lEmitIfStatements(ctx, trueStmts, "if: expr mixed, true statements");
// under varying control flow, returns can't stop instruction
// emission, so this better be non-NULL...
assert(ctx->GetCurrentBasicBlock());
Assert(ctx->GetCurrentBasicBlock());
}
if (falseStmts) {
ctx->SetInternalMaskAndNot(oldMask, test);
lEmitIfStatements(ctx, falseStmts, "if: expr mixed, false statements");
assert(ctx->GetCurrentBasicBlock());
Assert(ctx->GetCurrentBasicBlock());
}
}
@@ -725,7 +724,7 @@ lSafeToRunWithAllLanesOff(Expr *expr) {
const SequentialType *seqType =
dynamic_cast<const SequentialType *>(type);
assert(seqType != NULL);
Assert(seqType != NULL);
int nElements = seqType->GetElementCount();
if (nElements == 0)
// Unsized array, so we can't be sure
@@ -869,7 +868,9 @@ lSafeToRunWithAllLanesOff(Stmt *stmt) {
void
IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
llvm::Value *oldMask = ctx->GetInternalMask();
if (ctx->GetFullMask() == LLVMMaskAllOn) {
if (ctx->GetFullMask() == LLVMMaskAllOn &&
!g->opt.disableCoherentControlFlow &&
!g->opt.disableMaskAllOnOptimizations) {
// We can tell that the mask is all on statically at compile time; just
// emit code for the 'if test with the mask all on' path
llvm::BasicBlock *bDone = ctx->CreateBasicBlock("cif_done");
@@ -921,14 +922,15 @@ IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
//
// where our use of blend for conditional assignments doesn't check
// for the 'all lanes off' case.
bool costIsAcceptable = ((trueStmts ? trueStmts->EstimateCost() : 0) +
(falseStmts ? falseStmts->EstimateCost() : 0)) <
PREDICATE_SAFE_IF_STATEMENT_COST;
if (lSafeToRunWithAllLanesOff(trueStmts) &&
lSafeToRunWithAllLanesOff(falseStmts) &&
(((trueStmts ? trueStmts->EstimateCost() : 0) +
(falseStmts ? falseStmts->EstimateCost() : 0)) <
PREDICATE_SAFE_IF_STATEMENT_COST)) {
(costIsAcceptable || g->opt.disableCoherentControlFlow)) {
ctx->StartVaryingIf(oldMask);
emitMaskedTrueAndFalse(ctx, oldMask, ltest);
assert(ctx->GetCurrentBasicBlock());
Assert(ctx->GetCurrentBasicBlock());
ctx->EndIf();
}
else {
@@ -951,9 +953,12 @@ IfStmt::emitMaskAllOn(FunctionEmitContext *ctx, llvm::Value *ltest,
// compiler see what's going on so that subsequent optimizations for
// code emitted here can operate with the knowledge that the mask is
// definitely all on (until it modifies the mask itself).
ctx->SetInternalMask(LLVMMaskAllOn);
Assert(!g->opt.disableCoherentControlFlow);
if (!g->opt.disableMaskAllOnOptimizations)
ctx->SetInternalMask(LLVMMaskAllOn);
llvm::Value *oldFunctionMask = ctx->GetFunctionMask();
ctx->SetFunctionMask(LLVMMaskAllOn);
if (!g->opt.disableMaskAllOnOptimizations)
ctx->SetFunctionMask(LLVMMaskAllOn);
// First, check the value of the test. If it's all on, then we jump to
// a basic block that will only have code for the true case.
@@ -998,7 +1003,7 @@ IfStmt::emitMaskAllOn(FunctionEmitContext *ctx, llvm::Value *ltest,
emitMaskedTrueAndFalse(ctx, LLVMMaskAllOn, ltest);
// In this case, return/break/continue isn't allowed to jump and end
// emission.
assert(ctx->GetCurrentBasicBlock());
Assert(ctx->GetCurrentBasicBlock());
ctx->EndIf();
ctx->BranchInst(bDone);
@@ -1027,7 +1032,7 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
// Emit statements for true
ctx->SetCurrentBasicBlock(bRunTrue);
lEmitIfStatements(ctx, trueStmts, "if: expr mixed, true statements");
assert(ctx->GetCurrentBasicBlock());
Assert(ctx->GetCurrentBasicBlock());
ctx->BranchInst(bNext);
ctx->SetCurrentBasicBlock(bNext);
}
@@ -1044,7 +1049,7 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
// Emit code for false
ctx->SetCurrentBasicBlock(bRunFalse);
lEmitIfStatements(ctx, falseStmts, "if: expr mixed, false statements");
assert(ctx->GetCurrentBasicBlock());
Assert(ctx->GetCurrentBasicBlock());
ctx->BranchInst(bNext);
ctx->SetCurrentBasicBlock(bNext);
}
@@ -1155,12 +1160,14 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
// IfStmt::emitCoherentTests()), and then emit the code for the
// loop body.
ctx->SetCurrentBasicBlock(bAllOn);
ctx->SetInternalMask(LLVMMaskAllOn);
if (!g->opt.disableMaskAllOnOptimizations)
ctx->SetInternalMask(LLVMMaskAllOn);
llvm::Value *oldFunctionMask = ctx->GetFunctionMask();
ctx->SetFunctionMask(LLVMMaskAllOn);
if (!g->opt.disableMaskAllOnOptimizations)
ctx->SetFunctionMask(LLVMMaskAllOn);
if (bodyStmts)
bodyStmts->EmitCode(ctx);
assert(ctx->GetCurrentBasicBlock());
Assert(ctx->GetCurrentBasicBlock());
ctx->SetFunctionMask(oldFunctionMask);
ctx->BranchInst(btest);
@@ -1168,7 +1175,7 @@ void DoStmt::EmitCode(FunctionEmitContext *ctx) const {
ctx->SetCurrentBasicBlock(bMixed);
if (bodyStmts)
bodyStmts->EmitCode(ctx);
assert(ctx->GetCurrentBasicBlock());
Assert(ctx->GetCurrentBasicBlock());
ctx->BranchInst(btest);
}
else {
@@ -1321,7 +1328,7 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
// it and then jump into the loop test code. (Also start a new scope
// since the initializer may be a declaration statement).
if (init) {
assert(dynamic_cast<StmtList *>(init) == NULL);
Assert(dynamic_cast<StmtList *>(init) == NULL);
ctx->StartScope();
init->EmitCode(ctx);
}
@@ -1349,7 +1356,7 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
if (uniformTest) {
if (doCoherentCheck)
Warning(pos, "Uniform condition supplied to cfor/cwhile statement.");
assert(ltest->getType() == LLVMTypes::BoolType);
Assert(ltest->getType() == LLVMTypes::BoolType);
ctx->BranchInst(bloop, bexit, ltest);
}
else {
@@ -1378,12 +1385,14 @@ ForStmt::EmitCode(FunctionEmitContext *ctx) const {
// the runtime test has passed, make this fact clear for code
// generation at compile time here.)
ctx->SetCurrentBasicBlock(bAllOn);
ctx->SetInternalMask(LLVMMaskAllOn);
if (!g->opt.disableMaskAllOnOptimizations)
ctx->SetInternalMask(LLVMMaskAllOn);
llvm::Value *oldFunctionMask = ctx->GetFunctionMask();
ctx->SetFunctionMask(LLVMMaskAllOn);
if (!g->opt.disableMaskAllOnOptimizations)
ctx->SetFunctionMask(LLVMMaskAllOn);
if (stmts)
stmts->EmitCode(ctx);
assert(ctx->GetCurrentBasicBlock());
Assert(ctx->GetCurrentBasicBlock());
ctx->SetFunctionMask(oldFunctionMask);
ctx->BranchInst(bstep);
@@ -1732,7 +1741,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
ctx->StartScope();
// This should be caught during typechecking
assert(startExprs.size() == dimVariables.size() &&
Assert(startExprs.size() == dimVariables.size() &&
endExprs.size() == dimVariables.size());
int nDims = (int)dimVariables.size();
@@ -1914,7 +1923,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
ctx->SetInternalMask(LLVMMaskAllOn);
ctx->AddInstrumentationPoint("foreach loop body");
stmts->EmitCode(ctx);
assert(ctx->GetCurrentBasicBlock() != NULL);
Assert(ctx->GetCurrentBasicBlock() != NULL);
ctx->BranchInst(bbStep[nDims-1]);
///////////////////////////////////////////////////////////////////////////
@@ -1944,12 +1953,12 @@ ForeachStmt::Optimize() {
bool anyErrors = false;
for (unsigned int i = 0; i < startExprs.size(); ++i) {
if (startExprs[i] != NULL)
startExprs[i]->Optimize();
startExprs[i] = startExprs[i]->Optimize();
anyErrors |= (startExprs[i] == NULL);
}
for (unsigned int i = 0; i < endExprs.size(); ++i) {
if (endExprs[i] != NULL)
endExprs[i]->Optimize();
endExprs[i] = endExprs[i]->Optimize();
anyErrors |= (endExprs[i] == NULL);
}
@@ -1965,20 +1974,21 @@ Stmt *
ForeachStmt::TypeCheck() {
bool anyErrors = false;
for (unsigned int i = 0; i < startExprs.size(); ++i) {
// Typecheck first, to resolve function overloads
if (startExprs[i] != NULL)
startExprs[i] = startExprs[i]->TypeCheck();
if (startExprs[i] != NULL)
startExprs[i] = TypeConvertExpr(startExprs[i],
AtomicType::UniformInt32,
"foreach starting value");
if (startExprs[i] != NULL)
startExprs[i]->TypeCheck();
anyErrors |= (startExprs[i] == NULL);
}
for (unsigned int i = 0; i < endExprs.size(); ++i) {
if (endExprs[i] != NULL)
endExprs[i] = endExprs[i]->TypeCheck();
if (endExprs[i] != NULL)
endExprs[i] = TypeConvertExpr(endExprs[i], AtomicType::UniformInt32,
"foreach ending value");
if (endExprs[i] != NULL)
endExprs[i]->TypeCheck();
anyErrors |= (endExprs[i] == NULL);
}
@@ -2341,7 +2351,7 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
// Now we can emit code to call __do_print()
llvm::Function *printFunc = m->module->getFunction("__do_print");
assert(printFunc);
Assert(printFunc);
llvm::Value *mask = ctx->GetFullMask();
// Set up the rest of the parameters to it
@@ -2404,7 +2414,7 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
llvm::Function *assertFunc =
isUniform ? m->module->getFunction("__do_assert_uniform") :
m->module->getFunction("__do_assert_varying");
assert(assertFunc != NULL);
Assert(assertFunc != NULL);
#ifdef ISPC_IS_WINDOWS
char errorString[2048];
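
The stmt.cpp changes above gate the "mask all on" fast paths behind the disableCoherentControlFlow and disableMaskAllOnOptimizations flags. As a rough model of the run-time behavior a coherent varying "if" implements (semantics only; the hypothetical varying_if below is illustrative Python, not the LLVM IR the compiler actually emits):

# Check for the all-on and all-off cases under the current execution mask
# before falling back to running both branches with disjoint masks.
def varying_if(mask, test, true_branch, false_branch):
    on  = [m and t for m, t in zip(mask, test)]
    off = [m and not t for m, t in zip(mask, test)]
    if not any(off):               # test is true for every active lane
        true_branch(mask)
    elif not any(on):              # test is false for every active lane
        false_branch(mask)
    else:                          # mixed: each branch runs under its own mask
        true_branch(on)
        false_branch(off)

def true_branch(m):
    print("true branch mask:  %s" % m)

def false_branch(m):
    print("false branch mask: %s" % m)

# Lanes 0 and 1 take the true side, lanes 2 and 3 the false side.
varying_if([True, True, True, True], [True, True, False, False],
           true_branch, false_branch)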

stmt.h

@@ -341,7 +341,7 @@ public:
Like print() above, since we don't have strings as first-class types in
the language, we need to do some gymnastics to support it. Like
assert() in C, assert checks the given condition and prints an error
assert() in C, assert() checks the given condition and prints an error
and calls abort() if the condition fails. For varying conditions, the
assert triggers if the condition fails for any of the executing program instances.
*/
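
As a small model of the behavior described here (assuming the usual semantics that a varying assert fires when the condition is false for any program instance that is currently executing; the ispc_assert name is just for illustration):

# A varying assert triggers if the condition fails for any lane that is on
# in the current execution mask; masked-off lanes are ignored.
def ispc_assert(mask, condition):
    for lane in range(len(mask)):
        if mask[lane] and not condition[lane]:
            raise AssertionError("assertion failed for program instance %d" % lane)

# Lane 2's condition is false, but lane 2 is masked off, so nothing fires.
ispc_assert([True, True, False, True], [True, True, False, True])
# Uncommenting the next line would raise: lane 1 is active and its condition fails.
# ispc_assert([True, True, True, True], [True, False, True, True])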

sym.cpp

@@ -72,7 +72,7 @@ SymbolTable::SymbolTable() {
SymbolTable::~SymbolTable() {
// Otherwise we have mismatched push/pop scopes
assert(variables.size() == 1 && functions.size() == 1 &&
Assert(variables.size() == 1 && functions.size() == 1 &&
types.size() == 1);
PopScope();
}
@@ -88,15 +88,15 @@ SymbolTable::PushScope() {
void
SymbolTable::PopScope() {
assert(variables.size() > 1);
Assert(variables.size() > 1);
delete variables.back();
variables.pop_back();
assert(functions.size() > 1);
Assert(functions.size() > 1);
delete functions.back();
functions.pop_back();
assert(types.size() > 1);
Assert(types.size() > 1);
delete types.back();
types.pop_back();
}
@@ -104,7 +104,7 @@ SymbolTable::PopScope() {
bool
SymbolTable::AddVariable(Symbol *symbol) {
assert(symbol != NULL);
Assert(symbol != NULL);
// Check to see if a symbol of the same name has already been declared.
for (int i = (int)variables.size() - 1; i >= 0; --i) {
@@ -154,7 +154,7 @@ SymbolTable::LookupVariable(const char *name) {
bool
SymbolTable::AddFunction(Symbol *symbol) {
const FunctionType *ft = dynamic_cast<const FunctionType *>(symbol->type);
assert(ft != NULL);
Assert(ft != NULL);
if (LookupFunction(symbol->name.c_str(), ft) != NULL)
// A function of the same name and type has already been added to
// the symbol table
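
The SymbolTable being asserted on here is a stack of per-scope maps: scopes are pushed and popped as blocks are entered and left, an addition is rejected if the name already exists in any scope, and lookups walk from the innermost scope outward. A hypothetical Python sketch of just the variable side of that structure (the real class keeps parallel stacks for functions and named types as well):

class SymbolTable(object):
    def __init__(self):
        self.scopes = [{}]                  # the global scope is always present

    def push_scope(self):
        self.scopes.append({})

    def pop_scope(self):
        assert len(self.scopes) > 1         # otherwise push/pop are mismatched
        self.scopes.pop()

    def add_variable(self, name, symbol):
        for scope in self.scopes:           # already declared in some scope?
            if name in scope:
                return False
        self.scopes[-1][name] = symbol
        return True

    def lookup_variable(self, name):
        for scope in reversed(self.scopes): # the innermost declaration wins
            if name in scope:
                return scope[name]
        return None

symtab = SymbolTable()
symtab.add_variable("x", "int x")
symtab.push_scope()
assert symtab.add_variable("x", "float x") == False   # redeclaration rejected
assert symtab.lookup_variable("x") == "int x"
symtab.pop_scope()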


@@ -39,6 +39,11 @@
#define ISPC_IS_APPLE
#endif
#ifdef ISPC_IS_WINDOWS
#include <windows.h>
#endif // ISPC_IS_WINDOWS
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>


@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }
uniform unsigned int32 s[programCount];
export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    float delta = 1;
    if (programIndex < 2)
        atomic_add_global(&s[programIndex], delta);
    RET[programIndex] = s[programIndex];
}
export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = RET[1] = 1;
}
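
The expected results in result() above follow from masked execution: only the program instances active under the programIndex < 2 test perform the atomic add. A quick Python model (a gang size of 8 is just an example; the harness uses whatever programCount the target provides):

program_count = 8                     # example gang size
s = [0] * program_count
delta = 1
for program_index in range(program_count):
    if program_index < 2:             # the varying "if" mask
        s[program_index] += delta     # the masked atomic add
assert s[0] == 1 and s[1] == 1 and all(v == 0 for v in s[2:])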


@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
uniform unsigned int32 s[programCount];
export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float b = 0;
    float delta = 1;
    atomic_add_global(&s[programCount-1-programIndex], programIndex);
    RET[programIndex] = s[programIndex];
}
export void result(uniform float RET[]) {
    RET[programIndex] = programCount-1-programIndex;
}


@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }
uniform unsigned int32 s[programCount];
export void f_f(uniform float RET[], uniform float aFOO[]) {
    for (uniform int i = 0; i < programCount; ++i)
        s[i] = 1234;
    float a = aFOO[programIndex];
    float b = 0;
    float delta = 1;
    a = atomic_max_global(&s[programIndex], programIndex);
    RET[programIndex] = a;
}
export void result(uniform float RET[]) {
    RET[programIndex] = 1234;
}


@@ -0,0 +1,15 @@
export uniform int width() { return programCount; }
uniform int32 s[programCount];
export void f_f(uniform float RET[], uniform float aFOO[]) {
    for (uniform int i = 0; i < programCount; ++i)
        s[i] = -1234;
    atomic_max_global(&s[programIndex], programIndex);
    RET[programIndex] = s[programIndex];
}
export void result(uniform float RET[]) {
    RET[programIndex] = programIndex;
}


@@ -0,0 +1,31 @@
struct Ray {
    float<3> v;
};
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
    Ray r[programCount];
    for (uniform int i = 0; i < programCount; ++i) {
        r[i].v.x = 100*i + programIndex;
        r[i].v.y = 200*i + 2*programIndex;
        r[i].v.z = 300*i + 3*programIndex;
    }
    Ray *rp = &r[programIndex/2];
    RET[programIndex] = rp->v.z;
}
export void result(uniform float RET[]) {
    uniform int d0 = 0;
    uniform int d1 = 0;
    for (uniform int i = 0; i < programCount; i += 2) {
        RET[i] = d0+d1;
        d1 += 3;
        RET[i+1] = d0+d1;
        d0 += 300;
        d1 += 3;
    }
}
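
The result() loop above encodes what each lane should gather: lane j dereferences rp = &r[j/2] and so reads its own component of r[j/2].v.z, which was initialized to 300*(j/2) + 3*j. A quick Python check (with an example gang size of 8) that the loop and the direct formula agree:

program_count = 8                     # example gang size
expected = [0] * program_count
d0 = d1 = 0
for i in range(0, program_count, 2):  # mirrors the result() loop above
    expected[i] = d0 + d1
    d1 += 3
    expected[i + 1] = d0 + d1
    d0 += 300
    d1 += 3
direct = [300 * (j // 2) + 3 * j for j in range(program_count)]
assert expected == direct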

tests/ptr-int-null-1.ispc

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
bool foo(int *ptr) {
    return (ptr == NULL);
}
export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int a = 1;
    uniform int * uniform b = 0;
    RET[programIndex] = foo(0);
}
export void result(uniform float RET[]) {
    RET[programIndex] = 1;
}

tests/ptr-int-null.ispc

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int a = 1;
    uniform int * uniform b = 0;
    RET[programIndex] = (b == NULL && b == 0 && 0 == b) ? 1 : 0;
}
export void result(uniform float RET[]) {
    RET[programIndex] = 1;
}

tests/scatter-mask-1.ispc

@@ -0,0 +1,15 @@
export uniform int width() { return programCount; }
uniform float a[programCount];
export void f_f(uniform float RET[], uniform float aFOO[]) {
    int index = aFOO[programIndex]-1;
    if (index & 1)
        a[index] = 1;
    RET[programIndex] = a[programIndex];
}
export void result(uniform float RET[]) {
    RET[programIndex] = (programIndex & 1) ? 1 : 0;
}

tests/scatter-mask-2.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
uniform float a[programCount];
static void foo(int index) {
    a[index] = 1;
}
export void f_f(uniform float RET[], uniform float aFOO[]) {
    int index = aFOO[programIndex]-1;
    if (index & 1)
        foo(index);
    RET[programIndex] = a[programIndex];
}
export void result(uniform float RET[]) {
    RET[programIndex] = (programIndex & 1) ? 1 : 0;
}
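
Both scatter-mask tests check the same thing: the scatter (written directly in the first test, through the call to foo() in this one) must happen only for the lanes that pass the index & 1 test. A Python model, assuming the test harness fills aFOO[] with 1, 2, 3, ... so that index equals the lane number:

program_count = 8                          # example gang size
aFOO = [float(i + 1) for i in range(program_count)]
a = [0.0] * program_count
for lane in range(program_count):
    index = int(aFOO[lane]) - 1            # equals the lane number
    if index & 1:                          # only odd lanes are active here
        a[index] = 1.0                     # the masked scatter
expected = [1.0 if (lane & 1) else 0.0 for lane in range(program_count)]
assert a == expected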


@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
    float aa = aFOO[programIndex];
    float<4> a = { -1, -2, -3, -4 };
    if (programIndex < 4)
        a[3-programIndex] = aa;
    //CO print("%\n%\n%\n%\n", a[0], a[1], a[2], a[3]);
    int i = clamp(3-programIndex, 0, 3);
    //CO print("%\n%\n", i, a[i]);
    RET[programIndex] = a[i];
}
export void result(uniform float RET[]) {
    RET[programIndex] = (programIndex < 4) ? 1+programIndex : -1;
}


@@ -0,0 +1,15 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
    float aa = aFOO[programIndex];
    float<4> a = { -1, -2, -3, -4 };
    if (programIndex < 4)
        a[3-programIndex] = aa;
    RET[programIndex] = a[2];
}
export void result(uniform float RET[]) {
    RET[programIndex] = -3;
    RET[1] = 2;
}


@@ -0,0 +1,7 @@
// Array dimension must be representable with a 32-bit integer.
struct foo {
    int x[0xffffffffffff];
};


@@ -0,0 +1,7 @@
// Array dimension must be non-negative
struct foo {
    int x[-1];
};


@@ -0,0 +1,9 @@
// Too many parameter values provided in function call
float bar(float a, float b);
export uniform int foo(uniform int x[], uniform int i[]) {
    float (*fptr)(float, float) = bar;
    //CO bar(0,1,2);
    fptr(0., 1, 2);
}


@@ -0,0 +1,9 @@
// Can't convert argument of type "void * const uniform" to type "float" for function call argument.
float bar(float a, float b);
export uniform int foo(uniform int x[], uniform int i[]) {
    float (*fptr)(float, float) = bar;
    //CO bar(0,1,2);
    fptr(NULL, 1);
}


@@ -0,0 +1,8 @@
// Too few parameter values provided in function call (1 provided, 2 expected).
float bar(float a, float b);
export uniform int foo(uniform int x[], uniform int i[]) {
    float (*fptr)(float, float) = bar;
    fptr(1.);
}


@@ -0,0 +1,8 @@
// Left hand side of assignment statement can't be assigned to
int foo() {return 2;}
int bar()
{
    foo() = 2;
}


@@ -0,0 +1,6 @@
// Left hand side of assignment statement can't be assigned to
int bar(){
    4 = 0;
}


@@ -0,0 +1,7 @@
// Left hand side of assignment statement can't be assigned to
int bar(){
    int x;
    4 = x;
}


@@ -293,7 +293,7 @@ AtomicType::GetAsUniformType() const {
const Type *
AtomicType::GetSOAType(int width) const {
assert(width > 0);
Assert(width > 0);
return new ArrayType(this, width);
}
@@ -353,7 +353,10 @@ AtomicType::Mangle() const {
std::string
AtomicType::GetCDeclaration(const std::string &name) const {
std::string ret;
assert(isUniform);
if (isUniform == false) {
Assert(m->errorCount > 0);
return ret;
}
if (isConst) ret += "const ";
switch (basicType) {
@@ -567,7 +570,7 @@ EnumType::GetAsUniformType() const {
const Type *
EnumType::GetSOAType(int width) const {
assert(width > 0);
Assert(width > 0);
return new ArrayType(this, width);
}
@@ -641,9 +644,9 @@ EnumType::GetDIType(llvm::DIDescriptor scope) const {
std::vector<llvm::Value *> enumeratorDescriptors;
for (unsigned int i = 0; i < enumerators.size(); ++i) {
unsigned int enumeratorValue;
assert(enumerators[i]->constValue != NULL);
Assert(enumerators[i]->constValue != NULL);
int count = enumerators[i]->constValue->AsUInt32(&enumeratorValue);
assert(count == 1);
Assert(count == 1);
llvm::Value *descriptor =
m->diBuilder->createEnumerator(enumerators[i]->name, enumeratorValue);
@@ -935,7 +938,7 @@ const Type *SequentialType::GetElementType(int index) const {
ArrayType::ArrayType(const Type *c, int a)
: child(c), numElements(a) {
// 0 -> unsized array.
assert(numElements >= 0);
Assert(numElements >= 0);
}
@@ -1134,7 +1137,7 @@ ArrayType::GetDIType(llvm::DIDescriptor scope) const {
ArrayType *
ArrayType::GetSizedArray(int sz) const {
assert(numElements == 0);
Assert(numElements == 0);
return new ArrayType(child, sz);
}
@@ -1175,7 +1178,7 @@ ArrayType::SizeUnsizedArrays(const Type *type, Expr *initExpr) {
for (unsigned int i = 1; i < exprList->exprs.size(); ++i) {
if (exprList->exprs[i] == NULL) {
// We should have seen an error earlier in this case.
assert(m->errorCount > 0);
Assert(m->errorCount > 0);
continue;
}
@@ -1201,9 +1204,9 @@ ArrayType::SizeUnsizedArrays(const Type *type, Expr *initExpr) {
SOAArrayType::SOAArrayType(const StructType *eltType, int nElem, int sw)
: ArrayType(eltType, nElem), soaWidth(sw) {
assert(soaWidth > 0);
Assert(soaWidth > 0);
if (numElements > 0)
assert((numElements % soaWidth) == 0);
Assert((numElements % soaWidth) == 0);
}
@@ -1334,8 +1337,8 @@ SOAArrayType::soaType() const {
VectorType::VectorType(const AtomicType *b, int a)
: base(b), numElements(a) {
assert(numElements > 0);
assert(base != NULL);
Assert(numElements > 0);
Assert(base != NULL);
}
@@ -1716,7 +1719,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const {
// element starts at an offset that's the correct alignment.
if (currentSize > 0 && (currentSize % eltAlign))
currentSize += eltAlign - (currentSize % eltAlign);
assert((currentSize == 0) || (currentSize % eltAlign) == 0);
Assert((currentSize == 0) || (currentSize % eltAlign) == 0);
llvm::DIFile diFile = elementPositions[i].GetDIFile();
int line = elementPositions[i].first_line;
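
The padding computation above rounds the running struct size up to the element's alignment before placing the element. A small worked example of that arithmetic (align_up is just an illustrative name):

def align_up(current_size, elt_align):
    # Pad so the next element starts at a multiple of its alignment.
    if current_size > 0 and (current_size % elt_align):
        current_size += elt_align - (current_size % elt_align)
    return current_size

assert align_up(0, 8) == 0    # the first element needs no padding
assert align_up(5, 4) == 8    # three bytes of padding before a 4-byte-aligned element
assert align_up(8, 8) == 8    # already aligned
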
@@ -1755,7 +1758,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const {
const Type *
StructType::GetElementType(int i) const {
assert(i < (int)elementTypes.size());
Assert(i < (int)elementTypes.size());
// If the struct is uniform qualified, then each member comes out with
// the same type as in the original source file. If it's varying, then
// all members are promoted to varying.
@@ -1955,7 +1958,7 @@ FunctionType::FunctionType(const Type *r, const std::vector<const Type *> &a,
paramTypes(a), paramNames(std::vector<std::string>(a.size(), "")),
paramDefaults(std::vector<ConstExpr *>(a.size(), NULL)),
paramPositions(std::vector<SourcePos>(a.size(), p)) {
assert(returnType != NULL);
Assert(returnType != NULL);
}
@@ -1966,10 +1969,10 @@ FunctionType::FunctionType(const Type *r, const std::vector<const Type *> &a,
bool it, bool is, bool ec)
: isTask(it), isExported(is), isExternC(ec), returnType(r), paramTypes(a),
paramNames(an), paramDefaults(ad), paramPositions(ap) {
assert(paramTypes.size() == paramNames.size() &&
Assert(paramTypes.size() == paramNames.size() &&
paramNames.size() == paramDefaults.size() &&
paramDefaults.size() == paramPositions.size());
assert(returnType != NULL);
Assert(returnType != NULL);
}
@@ -2124,14 +2127,14 @@ FunctionType::GetDIType(llvm::DIDescriptor scope) const {
LLVM_TYPE_CONST llvm::FunctionType *
FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool includeMask) const {
if (isTask == true) assert(includeMask == true);
if (isTask == true) Assert(includeMask == true);
// Get the LLVM Type *s for the function arguments
std::vector<LLVM_TYPE_CONST llvm::Type *> llvmArgTypes;
for (unsigned int i = 0; i < paramTypes.size(); ++i) {
if (!paramTypes[i])
return NULL;
assert(paramTypes[i] != AtomicType::Void);
Assert(paramTypes[i] != AtomicType::Void);
LLVM_TYPE_CONST llvm::Type *t = paramTypes[i]->LLVMType(ctx);
if (t == NULL)
@@ -2167,28 +2170,28 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool includeMask) const {
const Type *
FunctionType::GetParameterType(int i) const {
assert(i < (int)paramTypes.size());
Assert(i < (int)paramTypes.size());
return paramTypes[i];
}
ConstExpr *
FunctionType::GetParameterDefault(int i) const {
assert(i < (int)paramDefaults.size());
Assert(i < (int)paramDefaults.size());
return paramDefaults[i];
}
const SourcePos &
FunctionType::GetParameterSourcePos(int i) const {
assert(i < (int)paramPositions.size());
Assert(i < (int)paramPositions.size());
return paramPositions[i];
}
const std::string &
FunctionType::GetParameterName(int i) const {
assert(i < (int)paramNames.size());
Assert(i < (int)paramNames.size());
return paramNames[i];
}
@@ -2241,7 +2244,7 @@ lVectorConvert(const Type *type, SourcePos pos, const char *reason, int vecSize)
const Type *
Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char *reason,
bool forceVarying, int vecSize) {
assert(reason != NULL);
Assert(reason != NULL);
// First, if we need to go varying, promote both of the types to be
// varying.
@@ -2312,7 +2315,7 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
// The 'more general' version of the two vector element types must
// be an AtomicType (that's all that vectors can hold...)
const AtomicType *at = dynamic_cast<const AtomicType *>(t);
assert(at != NULL);
Assert(at != NULL);
return new VectorType(at, vt0->GetElementCount());
}
@@ -2327,7 +2330,7 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
return NULL;
const AtomicType *at = dynamic_cast<const AtomicType *>(t);
assert(at != NULL);
Assert(at != NULL);
return new VectorType(at, vt0->GetElementCount());
}
else if (vt1) {
@@ -2339,7 +2342,7 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
return NULL;
const AtomicType *at = dynamic_cast<const AtomicType *>(t);
assert(at != NULL);
Assert(at != NULL);
return new VectorType(at, vt1->GetElementCount());
}
@@ -2352,7 +2355,7 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
const EnumType *et1 = dynamic_cast<const EnumType *>(t1->GetReferenceTarget());
if (et0 != NULL && et1 != NULL) {
// Two different enum types -> make them uint32s...
assert(et0->IsVaryingType() == et1->IsVaryingType());
Assert(et0->IsVaryingType() == et1->IsVaryingType());
return et0->IsVaryingType() ? AtomicType::VaryingUInt32 :
AtomicType::UniformUInt32;
}
@@ -2383,7 +2386,7 @@ Type::MoreGeneralType(const Type *t0, const Type *t1, SourcePos pos, const char
// Now all we can do is promote atomic types...
if (at0 == NULL || at1 == NULL) {
assert(reason != NULL);
Assert(reason != NULL);
Error(pos, "Implicit conversion from type \"%s\" to \"%s\" for %s not possible.",
t0->GetString().c_str(), t1->GetString().c_str(), reason);
return NULL;


@@ -45,7 +45,6 @@
#include <stdio.h>
#include <stdio.h>
#include <assert.h>
#include <ctype.h>
#include <stdarg.h>
#include <stdlib.h>
@@ -147,7 +146,7 @@ lPrintWithWordBreaks(const char *buf, int columnWidth, FILE *out) {
fputs(buf, out);
#else
int column = 0;
assert(strchr(buf, ':') != NULL);
Assert(strchr(buf, ':') != NULL);
int indent = strchr(buf, ':') - buf + 2;
int width = std::max(40, columnWidth - 2);
@@ -267,7 +266,7 @@ Error(SourcePos p, const char *fmt, ...) {
va_list args;
va_start(args, fmt);
lPrint("Error", p, fmt, args);
++m->errorCount;
if (m != NULL) ++m->errorCount;
va_end(args);
}
@@ -292,7 +291,7 @@ Warning(SourcePos p, const char *fmt, ...) {
va_list args;
va_start(args, fmt);
lPrint(g->warningsAsErrors ? "Error" : "Warning", p, fmt, args);
if (g->warningsAsErrors)
if (g->warningsAsErrors && m != NULL)
++m->errorCount;
va_end(args);
}
@@ -313,6 +312,12 @@ PerformanceWarning(SourcePos p, const char *fmt, ...) {
void
FatalError(const char *file, int line, const char *message) {
fprintf(stderr, "%s(%d): FATAL ERROR: %s\n", file, line, message);
fprintf(stderr, "***\n"
"*** Please file a bug report at https://github.com/ispc/ispc/issues\n"
"*** (Including as much information as you can about how to "
"reproduce this error).\n"
"*** You have apparently encountered a bug in the compiler that we'd "
"like to fix!\n***\n");
abort();
}
@@ -392,7 +397,7 @@ GetDirectoryAndFileName(const std::string &currentDirectory,
char path[MAX_PATH];
const char *combPath = PathCombine(path, currentDirectory.c_str(),
relativeName.c_str());
assert(combPath != NULL);
Assert(combPath != NULL);
const char *filenamePtr = PathFindFileName(combPath);
*filename = filenamePtr;
*directory = std::string(combPath, filenamePtr - combPath);
@@ -414,9 +419,9 @@ GetDirectoryAndFileName(const std::string &currentDirectory,
// now, we need to separate it into the base name and the directory
const char *fp = fullPath.c_str();
const char *basenameStart = strrchr(fp, '/');
assert(basenameStart != NULL);
Assert(basenameStart != NULL);
++basenameStart;
assert(basenameStart != '\0');
Assert(basenameStart != '\0');
*filename = basenameStart;
*directory = std::string(fp, basenameStart - fp);
#endif // ISPC_IS_WINDOWS