563 Commits

Author SHA1 Message Date
jbrodman
8f902fde9c Merge pull request #420 from jbrodman/master
Fix for c++ backend
2013-01-08 11:55:03 -08:00
james.brodman
b6023c517e Fix/Hack to avoid the cbackend generating spurious array type declarations. 2013-01-08 14:53:17 -05:00
james.brodman
42d77e9191 Modified to mirror asin.ispc and not fail. 2013-01-08 14:33:32 -05:00
jbrodman
312a7582df Merge pull request #419 from jbrodman/master
Fix to acos.ispc test
2013-01-08 11:32:34 -08:00
jbrodman
dc939eba78 Merge pull request #418 from mmp/master
Fix build with LLVM top-of-tree, fix warnings, remove LLVM 3.0 support
2013-01-08 10:28:02 -08:00
jbrodman
f8bec51de2 Merge pull request #411 from pengtu/master
Simple fixes to allow SOA pointers and arrays to be passed as function arguments.
2013-01-08 08:40:01 -08:00
Matt Pharr
0bf1320a32 Remove support for building with LLVM 3.0 2013-01-06 12:27:53 -08:00
Matt Pharr
81dbd504aa Small fixes to eliminate compiler warnings when using clang 2013-01-06 12:10:54 -08:00
Matt Pharr
63dd7d9859 Fix build to work with LLVM top-of-tree again 2013-01-06 12:02:08 -08:00
Jean-Luc Duprat
2063d34f3e Merge pull request #414 from jbrodman/master
Fix to build with 3.2
2013-01-03 11:00:45 -08:00
james.brodman
83fdc2e5ad Fix to build with 3.2. LLVM API Change? 2013-01-03 13:43:47 -05:00
Peng Tu
6ba7368ab0 Fix two compile-time errors to allow SOA pointers and arrays to be passed as function arguments. 2012-12-11 17:20:15 -08:00
Jean-Luc Duprat
c2805942a9 Merge pull request #409 from mmp/master
Bugfix for issue #408.
2012-12-06 09:46:34 -08:00
Matt Pharr
9892c8bf9a Fix logic for ordering of struct declarations in generated header files.
When a struct had an array of another struct type as a member, we weren't
detecting that the struct type in the array needed to be declared before the
enclosing struct type.

Fixes issue #408.
2012-12-06 11:39:22 -05:00
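For illustration, a minimal ispc-source sketch of the case described above (struct names hypothetical); with the fix, the generated header declares B before the enclosing A:

    struct B { float x; };
    struct A { B elems[4]; };   // array-of-struct member: B must be emitted first in the header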
Matt Pharr
23e5877509 merge 2012-12-02 14:32:52 -08:00
Matt Pharr
8cbfde6092 Small fixes to build with LLVM top-of-tree (now numbered as version 3.3) 2012-12-02 14:29:24 -08:00
Jean-Luc Duprat
24087ff3cc Expose none() in the ISPC standard library.
On KNC: all(), any() and none() do not generate a redundant movmsk instruction.
2012-11-27 13:38:28 -08:00
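A small usage sketch of the newly exposed function (function and variable names hypothetical):

    uniform bool all_non_positive(float v) {
        return none(v > 0.);   // true iff the comparison is false in every program instance
    }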
Jean-Luc Duprat
6827001c1d Merge pull request #406 from pengtu/master
Fix ISPC with LLVM TOT build problem
2012-11-22 09:27:10 -08:00
Peng Tu
16b0806d40 Fix LLVM TOT build issue. 2012-11-21 19:09:10 -08:00
Jean-Luc Duprat
2129b1e27d knc.h: Fixed __rsqrt_varying_float() to use _mm512_invsqrt_ps() instead of _mm512_invsqrt_pd()
This was a typo.
2012-11-21 15:40:35 -08:00
Jean-Luc Duprat
a267762f59 Merge pull request #404 from mmp/master
Fix build with LLVM top-of-tree
2012-11-21 10:37:40 -08:00
Jean-Luc Duprat
65ca795030 Merge pull request #405 from jbrodman/master
Tweaked Scalar Repl of Aggregates Optimization
2012-11-19 13:12:26 -08:00
Matt Pharr
e82b649ec0 Fix build with LLVM top-of-tree (various changes to clang entrypoints). 2012-11-16 11:04:11 -08:00
james.brodman
275cdb1713 Merge branch 'master' of https://github.com/ispc/ispc 2012-11-14 13:30:45 -05:00
Jean-Luc Duprat
d3b86dcc90 KNC: fix implementation of __all() to use KNCni mask test instructions... 2012-11-14 09:24:01 -08:00
james.brodman
c736b75075 Merge branch 'master' of https://github.com/ispc/ispc 2012-11-13 17:08:09 -05:00
Jean-Luc Duprat
b601331362 Approximations for inverse sqrt and reciprocal are now provided in fast math mode.
RCP was actually slow in fast math mode.
Inverse sqrt did not expose a fast approximation.
2012-11-13 14:01:35 -08:00
ptu1
32d44a5b9e Merge branch 'master' of ssh://fmygit6001.fm.intel.com:29418/ssg_dpd_tpi_ispc-ispc_git 2012-11-13 12:47:13 -08:00
ptu1
810784da1f Set the ScalarReplAggregate maximum structure size based on target vector width. 2012-11-13 12:35:45 -08:00
james.brodman
d517b37f3f Merge branch 'master' of https://github.com/ispc/ispc 2012-11-09 10:14:18 -05:00
Jean-Luc Duprat
adeef0af01 Merge pull request #403 from jbrodman/master
Fixed =/== error for KNC intrinsic implementation of __all()
2012-11-08 13:57:42 -08:00
james.brodman
97ddc1ed10 Fixed =/== error in __all() 2012-11-08 16:30:12 -05:00
james.brodman
bf580648a1 Merge branch 'master' of https://github.com/ispc/ispc 2012-11-06 12:03:27 -05:00
Jean-Luc Duprat
ecc54fa0eb Merge pull request #402 from pengtu/master
Fix a bug where an unsigned index variable in subscript is sxt to 64 bit
2012-11-05 21:51:38 -08:00
Peng Tu
04d32ae3e6 Inside LLVM, both signed and unsigned integers are represented with the same type - i32 - effectively a signed int32. On a 64-bit target, we must generate an explicit sext/zext during LLVM IR creation to promote the array index to 64 bits. Otherwise, an unsigned int index becomes a signed int index in the LLVM IR.
I limit the fix to uniform indices to avoid widening a varying index vector to 64 bits.  This means that the 32-bit values in varying indices must be positive and smaller than 2^31 at runtime for a program to behave correctly.
2012-11-05 15:02:15 -08:00
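A minimal sketch of the case being fixed (function and variable names hypothetical): on a 64-bit target, the uniform unsigned index below must be zero-extended rather than sign-extended when the subscript is promoted to 64 bits.

    uniform float load_at(uniform float * uniform a, uniform unsigned int i) {
        return a[i];   // an index >= 2^31 was previously treated as a negative offset
    }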
james.brodman
a18b3ae88e Merge branch 'master' of https://github.com/ispc/ispc 2012-10-31 15:25:41 -04:00
james.brodman
e57801a5d1 Typo Fix 2012-10-31 15:25:26 -04:00
ingowald
da4390aede Merge pull request #401 from pengtu/master
Fix a "continue" handling bug in foreach_unique/foreach_active
2012-10-30 01:46:01 -07:00
Peng Tu
9e85667219 Merge remote branch 'upstream/master' 2012-10-29 22:51:22 -07:00
Peng Tu
b80867d473 Move the call to RestoreContinuedLanes from bbBody to the correct place of bbCheckForMore for foreach_unique and foreach_active. 2012-10-29 17:27:11 -07:00
Jean-Luc Duprat
d742dcce59 Merge pull request #400 from jbrodman/master
Fixes a compile error in examples/intrinsics/sse4.h. Assignment was used instead of equality comparison.
2012-10-26 14:08:56 -07:00
james.brodman
7a7af3d5f9 Merge branch 'master' of https://github.com/jbrodman/ispc 2012-10-26 16:55:53 -04:00
jbrodman
e323b1d0ad Fixed compile error: == instead of = 2012-10-26 16:55:28 -04:00
james.brodman
3c18c7a713 Fixed compile error: == instead of = 2012-10-26 16:52:54 -04:00
james.brodman
7c16292cb7 Merge branch 'master' of https://github.com/ispc/ispc 2012-10-24 13:49:04 -04:00
Gerrit Code Review
d665e2e85b Initial empty repository 2012-10-24 09:53:29 -07:00
Matt Pharr
172a189c6f Fix build with LLVM top-of-tree 2012-10-17 11:11:50 -07:00
Matt Pharr
406fbab40e Fix bugs in declarations of __any, __all, and __none in examples/intrinsics.
They return bool, not vector of bool.
2012-10-17 10:55:50 -07:00
Matt Pharr
09dc217f8c Fix hex constant in lParseInteger() (missing an f) 2012-10-16 06:03:33 -07:00
Matt Pharr
9002837750 Remove incorrect assert in tasksys.cpp 2012-10-15 10:43:46 -07:00
Matt Pharr
411d5b44ef Add ISPC_HAS_RAND definition on targets that have a HW RNG.
This lets us check for a functioning rdrand() call in the stdlib
more reliably.  Fixes issue #333.
2012-10-03 09:18:12 -07:00
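A sketch of how code can key off the new define; the fallback named in the comment is illustrative, and the exact rdrand() overloads are not spelled out in the commit message:

    #ifdef ISPC_HAS_RAND
        // hardware RNG present: the stdlib rdrand() routines can be used here
    #else
        // no hardware RNG: fall back to the software RNG (e.g. seed_rng()/random())
    #endif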
Matt Pharr
360cc8044e Improve RNG documentation.
Issue #390.
2012-10-03 08:33:43 -07:00
Matt Pharr
ec2e9b5e79 Fix typo in assert() documentation.
Issue #388.
2012-10-03 08:26:38 -07:00
Matt Pharr
881dba61e4 Fix build with LLVM top-of-tree 2012-09-28 06:07:01 -07:00
Matt Pharr
6412876f64 Remove unused __reduce_add_uint{32,64} target functions.
The stdlib code just calls the signed int{32,64} functions,
which gives the right result for the unsigned case anyway.
The various targets didn't consistently define the unsigned
variants in any case.
2012-09-28 05:55:41 -07:00
Matt Pharr
538d51cbfe Add GMRES example 2012-09-20 14:06:55 -07:00
Jean-Luc Duprat
3dd9ff3d84 knc.h:
	Properly pick up on ISPC_FORCE_ALIGNED_MEMORY when --opt=force-aligned-memory is used
	Fixed usage of loadunpack and packstore to use the proper memory offset
	Fixed implementations of __masked_load_*()/__masked_store_*() that were incorrectly (un)packing the loaded lanes
	Cleaned up usage of _mm512_undefined_*(); it is now mostly confined to constructors
	Minor cleanups

knc2x.h:
	Fixed usage of loadunpack and packstore to use the proper memory offset
	Fixed implementations of __masked_load_*()/__masked_store_*() that were incorrectly (un)packing the loaded lanes
	Properly pick up on ISPC_FORCE_ALIGNED_MEMORY when --opt=force-aligned-memory is used
	__any() and __none() speedups
	Cleaned up usage of _mm512_undefined_*(); it is now mostly confined to constructors
2012-09-19 17:11:04 -07:00
Ingo Wald
7f386923b0 Merge branch 'master' of https://github.com/ispc/ispc 2012-09-17 15:54:25 +02:00
Ingo Wald
d2312b1fbd now using the ASSUME_ALIGNED flag in knc.h 2012-09-17 15:54:00 +02:00
Ingo Wald
6655373ac3 commit test 2012-09-17 15:51:37 +02:00
Ingo Wald
d492af7bc0 64-bit gather/scatter, aligned load/store, i8 support 2012-09-17 03:39:02 +02:00
Matt Pharr
230a7b7374 Fix bug with floating-point constant zero vectors.
Issue #377.
2012-09-14 14:24:51 -07:00
Jean-Luc Duprat
4204a752f7 Merge branch 'master' of https://github.com/ispc/ispc 2012-09-14 14:12:49 -07:00
Jean-Luc Duprat
0e88d5f97f Fixed unaligned masked stores on KNC 2012-09-14 14:11:41 -07:00
Matt Pharr
a13e7f2435 #define ISPC_FORCE_ALIGNED_MEMORY, if appropriate, in C++ output. 2012-09-14 13:53:12 -07:00
Matt Pharr
be2108260e Add --opt=force-aligned-memory option.
This forces all vector loads/stores to be done assuming that the given
pointer is aligned to the vector size, thus allowing the use of sometimes
more-efficient instructions.  (If it isn't the case that the memory is
aligned, the program will fail!).
2012-09-14 13:49:45 -07:00
Matt Pharr
59b0a2b208 Mark __any(), __all(), and __none() as internal after they're linked in.
This fixes multiple symbol definition errors when compiling a single binary
for multiple ISA targets.
2012-09-14 13:32:42 -07:00
Matt Pharr
05a5a42a08 Don't force loads/stores from varying types to be unaligned.
These should always actually be aligned in memory.
2012-09-14 12:17:33 -07:00
Jean-Luc Duprat
f0b0618484 Added the following mask tests: __any(), __all(), __none() for all supported targets.
This allows for more efficient code generation for KNC.
2012-09-14 11:06:18 -07:00
Ingo Wald
4ecdbe4bd9 two changes:
- exported structs are now protected with #ifdef/#define blocks (allows including multiple ispc-generated header files in the same C source file)
- when creating offload stubs, encountering an 'export' function for which we cannot produce a stub now only triggers a warning, not an error.
2012-09-08 16:09:04 +02:00
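A sketch of the generated-header pattern the first item describes; the guard macro name here is illustrative, not necessarily the exact one ispc emits:

    #ifndef ISPC_STRUCT_Foo_GUARD
    #define ISPC_STRUCT_Foo_GUARD
    struct Foo { float x; };
    #endif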
Matt Pharr
9e9f266e52 Add files inadvertently missed in c58d92d46b.
Truly fixes issue #363.
2012-09-07 13:27:07 -07:00
Matt Pharr
0ce67f37ac Use LLVM_VERSION env variable to get LLVM version with MSVC build.
Previously, it was set directly in the ispc.vcxproj file.

Issue #371.
2012-09-06 06:04:32 -07:00
Matt Pharr
ddcd0a49ec Fix bugs with handling of 'continue' statements in foreach_* loops. 2012-09-05 10:16:58 -07:00
Matt Pharr
63b8fac852 Improve naming of temporary variable in IR 2012-09-05 10:13:45 -07:00
Matt Pharr
def8d7850b Fix crasher with malformed programs 2012-09-05 08:43:46 -07:00
Jean-Luc Duprat
0442efc856 Merge branch 'master' of https://github.com/ispc/ispc 2012-09-04 11:00:03 -07:00
Jean-Luc Duprat
f928bbb53c Updated usage of Intel® Initial Many Core Instructions (Intel® IMCI). 2012-09-04 10:57:25 -07:00
Jean-Luc Duprat
1ab7500dbb Updated user's guide to comply with Intel® Xeon Phi™ brand usage guidelines 2012-09-04 10:53:01 -07:00
Matt Pharr
c58d92d46b Issue error if a vector-typed parameter is used in an exported function.
Issue #363.
2012-08-31 06:59:58 -07:00
Matt Pharr
8276e912fd Switch to LLVM 3.1 for default for MSVC builds. Also fixes issue #374 2012-08-31 05:58:39 -07:00
Jean-Luc Duprat
e0490d0df5 Minor fixes needed for building on windows. 2012-08-30 10:56:13 -07:00
Jean-Luc Duprat
11db466a88 Implement the KNC prefetch API so that ISPC prefetch_*() stdlib functions may be used. 2012-08-30 10:24:31 -07:00
Matt Pharr
caaee0b666 Fix crash when using launch with non-task-qualified function 2012-08-29 09:06:47 -07:00
Matt Pharr
f2f470f369 Merge pull request #369 from jduprat/master
Task system updates
2012-08-28 14:01:37 -07:00
Jean-Luc Duprat
09bb36f58c Updated the task system in the example directory to support:
Cilk (cilk_for), OpenMP (#pragma omp parallel for), TBB (tbb::task_group and tbb::parallel_for),
as well as a new pthreads-based model that fully subscribes the machine (good for KNC).
With major contributions from Ingo Wald and James Brodman.
2012-08-28 11:13:12 -07:00
Matt Pharr
21719df6fd remove assert that hit with fast-math if user defined their own functions named rcp() 2012-08-21 16:39:36 -07:00
Matt Pharr
39329809dd fix crash with malformed program 2012-08-21 16:35:31 -07:00
Matt Pharr
44797e2925 remove incorrect assert 2012-08-21 16:27:49 -07:00
Jean-Luc Duprat
c8f373d119 Merge branch 'master' of https://github.com/ispc/ispc 2012-08-15 17:42:00 -07:00
Jean-Luc Duprat
8a22c63889 knc2x.h
Introduced knc2x.h, which supports 2x interleaved code generation for KNC (use the target generic-32).
This implementation is even more experimental and incomplete than knc.h, but is already useful (mandelbrot works, for example).

knc.h:
Switch to new intrinsic names _mm512_set_1to16_epi32() -> _mm512_set1_epi32(), etc...
Fix the declaration of the unspecialized template for __smear_*(), __setzero_*(), __undef_*()
Specifically mark a few vectors as _mm512_undefined_*() in __load<>()
Fixed some implementations of __smear_*(), __setzero_*(), __undef_*() to remove unnecessary dependent instructions.
Implemented ISPC reductions by simply calling existing intrinsic reductions, which are slightly more efficient than our previous implementation.  Also added reductions for double types.
2012-08-15 17:41:10 -07:00
Matt Pharr
1a4434d314 Fix build with LLVM top-of-tree 2012-08-11 09:28:48 -07:00
Jean-Luc Duprat
165a13b13e knc.h:
vec16_i64 improved with the addition of the following: __extract_element(), __insert_element(), __sub(), __mul(),
		   __sdiv(), __udiv(), __and(), __or(), __xor(), __shl(), __lshr(), __ashr(), __select()
	Fixed a bug in the __mul(__vec16_i64, __vec16_i32) implementation
	Constructors are all explicitly inlined, copy constructor and operator=() explicitly provided
	Load and stores for __vec16_i64 and __vec16_d use aligned instructions when possible
	__rotate_i32() now has a vector implementation
	Added several reductions: __reduce_add_i32(), __reduce_min_i32(), __reduce_max_i32(),
	       __reduce_add_f(), __reduce_min_f(), __reduce_max_f()
2012-08-10 12:20:10 -07:00
Matt Pharr
43364b2d69 Loosen tolerances to test passes with FMA on AVX2 2012-08-10 06:52:14 -07:00
Matt Pharr
6eaecd20d5 Mark __{get,set}_system_isa builtins as "internal" functions.
This ensures that they have static linkage, which in turn lets one
have multiple object files compiled to multiple targets without having
those cause link errors.

Issue #355.
2012-08-09 16:12:07 -07:00
Matt Pharr
c80bfeacf6 Fix crashes when input program tried to access undefined struct types.
(This in particular would happen when there was an error in the body of a struct
definition and we were left with an UndefinedStructType and then later tried to
do loads/stores from/to it.)

Issue #356.
2012-08-09 14:59:29 -07:00
Matt Pharr
2a19cc1758 Fix cases where we were trying to type cast instead of type convert.
Also, removed erroneous checks about the type of the test expression
in DoStmt and ForStmt.

These together were preventing conversion of pointer types to boolean
values, so things like "while (ptr)" would improperly not compile.

Issue #346.
2012-08-03 12:47:53 -07:00
Matt Pharr
8f5189f606 Type convert arrays in select expressions to pointers to the first element.
Fixes issue #345.
2012-08-03 11:53:59 -07:00
Matt Pharr
49dde7c6f2 Fix bug in declaration of double-precision sqrt intrinsic for AVX targets.
This was preventing sqrts of uniform double values from being compiled
properly.

Issue #344.
2012-08-03 11:43:31 -07:00
Matt Pharr
765a0d8896 Use puts() rather than printf() for printing assertion failure strings.
This way, we don't lose '%'s in the assertion strings.

Issue #342.
2012-08-03 11:31:38 -07:00
Matt Pharr
19d8f2e258 Generate FMA instructions with AVX2 (when possible).
Issue #320.
2012-08-03 10:43:41 -07:00
Matt Pharr
e6aec96e05 Fix build with LLVM top-of-tree 2012-08-03 09:59:41 -07:00
Jean-Luc Duprat
a2d42c3242 KNC: all masked_load_*() and masked_store_*() functions need to do unaligned accesses 2012-08-01 14:37:25 -07:00
Jean-Luc Duprat
52836aae87 Minor documentation clarification on the impact of the ICC -fp-model except option. 2012-08-01 10:24:35 -07:00
Matt Pharr
bda566d6a7 Fix incorrect assertion 2012-08-01 08:11:32 -07:00
Jean-Luc Duprat
63ed90b0fd docs/build.sh runs rst2html rather than rst2html.py
Explicitly documented the fact that ICC needs the -mmic flag to compile for KNC.
Updated ISPC User Guide with details on ICC compiler options that impact FP performance in generated code.
2012-07-30 11:47:25 -07:00
Matt Pharr
0bb4d282e2 Add sys/types.h include for linux/osx. 2012-07-23 08:32:41 -07:00
Matt Pharr
ae89a65dad Fix bug that caused unterminated basic blocks.
Issue #339.
2012-07-23 08:24:18 -07:00
Matt Pharr
e9fe9f5043 Add cpu strings for Ivy Bridge and HSW.
Default to avx2 ISA for HSW CPUs.
2012-07-23 08:24:18 -07:00
Matt Pharr
ce8dc5927c Fix bug in FunctionEmitContext::MatchIntegerTypes
Cause of issue #329.
2012-07-20 10:05:17 -07:00
Matt Pharr
f6989cce38 Disallow native output with generic targets, C++ output with non-generic targets.
Also wrote FAQs about why this is the way it is.
Issue #334.
2012-07-20 09:55:50 -07:00
Jean-Luc Duprat
6dbbf9aa80 Merge branch 'master' of https://github.com/ispc/ispc 2012-07-19 17:33:00 -07:00
Jean-Luc Duprat
fe6282e837 Fixed small issue with name mangling introduced in aecd6e08 2012-07-19 17:32:49 -07:00
Matt Pharr
51210a869b Support core-avx-i and core-avx2 CPU types.
(And map them to avx1.1 and avx2 targets, respectively.)
2012-07-19 10:15:59 -07:00
Matt Pharr
658652a9ff Merge pull request #331 from jduprat/master
New templated API for __setzero() __undef() and __smear()
2012-07-18 16:39:38 -07:00
Jean-Luc Duprat
aecd6e0878 All the smear(), setzero() and undef() APIs are now templated on the return type.
Modified ISPC's internal mangling to pass these through unchanged.
Tried hard to make sure this is not going to introduce an ABI change.
2012-07-17 17:06:36 -07:00
Jean-Luc Duprat
1334a84861 Merge branch 'master' of https://github.com/ispc/ispc 2012-07-17 11:46:30 -07:00
Matt Pharr
6a410fc30e Emit gather instructions for the AVX2 targets.
Issue #308.
2012-07-13 12:29:05 -07:00
Matt Pharr
984a68c3a9 Rename gen_gather() macro to gen_gather_factored() 2012-07-13 12:24:12 -07:00
Matt Pharr
daf5aa8e8b Run inst combine before memory optimizations.
We were previously emitting 64-bit indexing for some gathers where
32-bit was actually fine, due to some adds of constant vectors
that hadn't been simplified to the result.
2012-07-13 12:14:53 -07:00
Matt Pharr
98b2e0e426 Fixes for intrinsics unsupported in earlier LLVM versions.
Specifically, don't use the half/float conversion routines with
LLVM 3.0, and don't try to use RDRAND with anything before LLVM 3.2.
2012-07-13 12:14:10 -07:00
Matt Pharr
9a1932eaf7 Only set gcc's "-msse4.2", etc, option when compiling for generic targets.
We don't need it when ispc is just generating an object file directly, and gcc
on OS X doesn't recognize -mavx.
2012-07-13 12:02:05 -07:00
Matt Pharr
371d4be8ef Fix bugs in detection of Ivy Bridge systems.
We were incorrectly characterizing them as basic AVX1 without further
extensions, due to a bug in the logic to check CPU features.
2012-07-12 14:11:15 -07:00
Matt Pharr
d180031ef0 Add more tests of basic gather functionality. 2012-07-12 14:05:38 -07:00
Jean-Luc Duprat
e09e953bbb Added a few functions: __setzero_i64(), __cast_sext(__vec16_i64, __vec16_i32), __cast_zext(__vec16_i32),
__min_varying_int32(), __min_varying_uint32(), __max_varying_int32(), __max_varying_uint32()
Fixed the signature of __smear_i64() to match current codegen
2012-07-12 10:32:38 -07:00
Matt Pharr
2c640f7e52 Add support for RDRAND in IvyBridge.
The standard library now provides a variety of rdrand() functions
that call out to RDRAND, when available.

Issue #263.
2012-07-12 06:07:07 -07:00
Matt Pharr
2bacebb1fb Doc fixes (Crystal Lemire). 2012-07-11 19:51:28 -07:00
Jean-Luc Duprat
df18b2a150 Fixed missing tmp var needed for use with gather intrinsic 2012-07-11 15:43:11 -07:00
Matt Pharr
216ac4b1a4 Stop factoring out constant offsets for gather/scatter if instr is available.
For KNC (gather/scatter), it's not helpful to factor base+offsets gathers
and scatters into base_ptr + {1/2/4/8} * varying_offsets + const_offsets.
Now, if a HW instruction is available for gather/scatter, we just factor
into base + {1/2/4/8} * offsets (if possible).  Not only is this simpler,
but it's also what we need in order to make use of the scale-by-{2/4/8}
addressing available directly in those instructions.

Finishes issue #325.
2012-07-11 14:52:29 -07:00
Jean-Luc Duprat
898cded646 Merge branch 'master' of https://github.com/ispc/ispc
Conflicts:
	examples/intrinsics/knc.h
2012-07-11 14:45:00 -07:00
Matt Pharr
c09c87873e Whitespace / indentation fixes. 2012-07-11 14:29:46 -07:00
Matt Pharr
10b79fb41b Add support for non-factored variants of gather/scatter functions.
We now have two ways of approaching gather/scatters with a common base
pointer and with offset vectors.  For targets with native gather/scatter,
we just turn those into base + {1/2/4/8}*offsets.  For targets without,
we turn those into base + {1/2/4/8}*varying_offsets + const_offsets,
where const_offsets is a compile-time constant.

Infrastructure for issue #325.
2012-07-11 14:29:42 -07:00
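In comment form, the two addressing decompositions described above (a restatement in pseudocode, not the actual builtin names):

    //   with native gather/scatter:    addr = base + {1,2,4,8} * offsets
    //   without native gather/scatter: addr = base + {1,2,4,8} * varying_offsets + const_offsets
    //                                  (const_offsets is a compile-time constant vector)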
Matt Pharr
ec0280be11 Rename gather/scatter_base_offsets functions to *_factored_base_offsets*.
No functional change; just preparation for having a path that doesn't
factor the offsets into constant and varying parts, which will be better
for AVX2 and KNC.
2012-07-11 14:16:39 -07:00
Matt Pharr
8e19d54e75 Merge pull request #328 from jduprat/explicit_isa_in_tests
Explicit isa in tests
2012-07-10 20:49:37 -07:00
Jean-Luc Duprat
3c070e5e20 run_tests.py will only attempt to use the -mmic flag when the knc.h header is used 2012-07-10 17:07:56 -07:00
Jean-Luc Duprat
dde599f48f run_tests.py now picks the ISA via a -m flag based on the target selected, rather than always picking -msse4.2;
this is needed because -msse4.2 is not supported on KNC.
2012-07-10 16:39:18 -07:00
Jean-Luc Duprat
cc15ecfb3a Merge branch 'master' of https://github.com/ispc/ispc
Conflicts:
	cbackend.cpp
	examples/intrinsics/generic-16.h
	examples/intrinsics/generic-32.h
	examples/intrinsics/generic-64.h
	examples/intrinsics/knc.h
	examples/intrinsics/sse4.h
2012-07-10 16:36:08 -07:00
Jean-Luc Duprat
7a7c54bd59 Minor fixes to knc.h that resulted from integrating bea88ab122 2012-07-10 16:10:48 -07:00
Jean-Luc Duprat
bea88ab122 Integrated changes from mmp/and-fold-opt:
Add peephole optimization to eliminate some mask AND operations.

On KNC, the various vector comparison instructions can optionally
be masked; if a mask is provided, the result is effectively that
the value returned is the AND of the mask with the result of the
comparison.

This change adds an optimization pass to the C++ backend that looks
for vector ANDs where one operand is a comparison and rewrites
them--e.g. "and(equalfloat(a, b), c)" is changed to
"_equal_float_and_mask(a, b, c)", saving an instruction in the end.

Issue #319.

Merge commit '8ef6bc16364d4c08aa5972141748110160613087'

Conflicts:
	examples/intrinsics/knc.h
	examples/intrinsics/sse4.h
2012-07-10 10:33:24 -07:00
Matt Pharr
926b3b9ee3 Fix bugs with mask-handling for switch/do/for/while statements.
All of these pass the current mask to FunctionEmitContext::SetBlockEntryMask()
so that when a break/continue/return is encountered, it can test to see if all
lanes have followed that path and then return; this in turn ensures that we never
run statements with an all-off execution mask.

These functions were passing the function internal mask, not the full mask, and
thus could end up executing code with the mask all off if some lanes were
disabled by an outer function.  (The new tests test this case.)
2012-07-09 15:13:30 -07:00
Matt Pharr
bc7775aef2 Fix __ordered and __unordered floating point functions for C++ target.
Fixes include adding "_float" and "_double" suffixes as appropriate as well
as providing a number of missing implementations.

This fixes a number of failures in the half* tests.
2012-07-09 14:35:51 -07:00
Matt Pharr
107669686c Fix naming of some comparison ops in knc.h 2012-07-09 12:43:15 -07:00
Matt Pharr
bb11b3ab66 Fix build with LLVM 3.0 2012-07-09 10:45:36 -07:00
Jean-Luc Duprat
516ba85abd Merge pull request #322 from mmp/vector-constants
Vector constants
2012-07-09 09:28:26 -07:00
Jean-Luc Duprat
098277b4f0 Merge pull request #321 from mmp/setzero
More varied support for constant vectors from C++ backend.
2012-07-09 08:57:05 -07:00
Matt Pharr
950a989744 Add test that was supposed to go with 080241b7d1 2012-07-09 08:21:15 -07:00
Matt Pharr
fb8b893b10 Fix incorrect LLVM_3_1svn tests.
1. For some time now, we have provided the version without the 'svn' suffix.
2. We should be testing "not LLVM 3.0" in these cases, since they
   apply to LLVM 3.2 and beyond as well...
2012-07-09 07:09:25 -07:00
Matt Pharr
9ca80debb8 Remove stale LLVM 2.9 support from builtins/util.m4 2012-07-09 06:54:29 -07:00
Matt Pharr
080241b7d1 Fix bugs with handling types of integer constants.
We now follow the rule that the type of an integer constant is
the first of int32, uint32, int64, or uint64 that can hold the
value.  (Unless 'u' or 'l' suffixes have been provided.)

Fixes issue #299.
2012-07-08 08:43:03 -07:00
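A comment-only illustration of that rule, with values chosen for illustration:

    //   1000                -> int32   (fits in int32)
    //   3000000000          -> uint32  (doesn't fit in int32)
    //   5000000000          -> int64   (doesn't fit in uint32)
    //   0xffffffffffffffff  -> uint64  (doesn't fit in int64)
    //   16u                 -> unsigned, because of the explicit suffix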
Matt Pharr
0d534720bb Fix bug with constant folding of select expressions.
We would sometimes pass an int32_t * to the ConstExpr constructor
but claim the underlying type was uint32, which made it grumpy.
2012-07-08 08:36:51 -07:00
Matt Pharr
1dc4424a30 Only override module datalayout for generic targets.
Doing it for all targets was causing a number of tests to fail.
(Actual root cause not determined.)
2012-07-07 15:12:50 -07:00
Matt Pharr
57f0cf30c0 Fix small typos in documentation. 2012-07-07 11:19:57 -07:00
Matt Pharr
8ef6bc1636 Add peephole optimization to eliminate some mask AND operations.
On KNC, the various vector comparison instructions can optionally
be masked; if a mask is provided, the result is effectively that
the value returned is the AND of the mask with the result of the
comparison.

This change adds an optimization pass to the C++ backend that looks
for vector ANDs where one operand is a comparison and rewrites
them--e.g. "__and(__equal_float(a, b), c)" is changed to
"__equal_float_and_mask(a, b, c)", saving an instruction in the end.

Issue #319.
2012-07-07 08:35:38 -07:00
Matt Pharr
974b40c8af Add type suffix to comparison ops in C++ output.
e.g. "__equal()" -> "__equal_float()", etc.

No functional change; this is necessary groundwork for a forthcoming
peephole optimization that eliminates ANDs of masks in some cases.
2012-07-07 07:50:59 -07:00
Matt Pharr
45e9e0be0b Map comparison predicates to strings for C++ output in a stand-alone function. 2012-07-06 16:00:09 -07:00
Matt Pharr
ec0918045d Issue error if compiling for multiple targets and program is coming from stdin.
We currently don't support this, so at least now we issue an intelligible error
message in this case.

Issue #269.
2012-07-06 13:21:53 -07:00
Matt Pharr
38bcecd2f3 Print a useful error if llvm-config isn't found when building.
Previously, there was a ton of unintelligible error spew.

Issue #273.
2012-07-06 13:18:11 -07:00
Matt Pharr
aabbdba068 Switch a few remaining fprintf() calls to use Warning()/Error(). 2012-07-06 12:56:45 -07:00
Matt Pharr
84c183da1f Issue error if a non "generic" target is used with C++ emission.
Issue #314.
2012-07-06 12:56:24 -07:00
Matt Pharr
b363b98211 Improve handling of datalayout for generic targets.
Flag 32-bit vector types as only requiring 32-bit alignment (preemptive
bug fix for 32xi1 vectors).

Force module datalayouts to be the same before linking them to silence
an LLVM warning.

Finishes issue #309.
2012-07-06 12:51:17 -07:00
Matt Pharr
8defbeb248 Handle llvm.objectsize intrinsic in C++ backend.
Partially addresses issue #309.
2012-07-06 12:29:23 -07:00
Matt Pharr
f52d227d80 Remove extra newline in error message 2012-07-06 11:31:29 -07:00
Matt Pharr
78cb45fb25 Improve error message with ambiguous function overloads.
Issue #316.
2012-07-06 11:25:57 -07:00
Matt Pharr
2d8026625b Always check the execution mask after break/continue/return.
When "break", "continue", or "return" is used under varying control flow,
we now always check the execution mask to see if all of the program
instances are executing it.  (Previously, this was only done with "cbreak",
"ccontinue", and "creturn", which are now deprecated.)

An important effect of this change is that it fixes a family of cases
where we could end up running with an "all off" execution mask, which isn't
supposed to happen, as it leads to all sorts of invalid behavior.

This change does cause the volume rendering example to run 9% slower, but
doesn't affect the other examples.

Issue #257.
2012-07-06 11:09:11 -07:00
Matt Pharr
73afab464f Provide mask at block entry for switch statements.
This fixes a crash if 'cbreak' was used in a 'switch'.  Renamed
FunctionEmitContext::SetLoopMask() to SetBlockEntryMask(), and
similarly the loopMask member variable.
2012-07-06 11:08:05 -07:00
Matt Pharr
8aa139b6be For C++ output, store constant vector values in local arrays.
When we have a constant vector of primitive types, we now generate
a definition of a static const array of the individual values.  This
in turn allows us to emit a simple aligned vector load to get the
constant vector value, rather than inefficiently inserting the values
into a vector.

Issue #318.
2012-07-06 08:57:09 -07:00
Matt Pharr
e5fe0eabdc Update __load() builtins to take const pointers. 2012-07-06 08:47:47 -07:00
Matt Pharr
0d3993fa25 More varied support for constant vectors from C++ backend.
If we have a vector of all zeros, a __setzero_* function call is emitted,
permitting calling specialized intrinsics for this.  Undefined values
are reflected with an __undef_* call, which similarly allows passing that
information along.

This change also includes a cleanup to the signature of the __smear_*
functions; since they already have different names depending on the
scalar value type, we don't need to use the trick of passing an
undefined value of the return vector type as the first parameter as
an indirect way to overload by return value.

Issue #317.
2012-07-05 20:19:11 -07:00
Jean-Luc Duprat
ac421f68e2 Ongoing support for int64 for KNC:
Fixes to __load and __store.
Added __add, __mul, __equal, __not_equal, __extract_elements, __smear_i64, __cast_sext, __cast_zext,
and __scatter_base_offsets32_float.

__rcp_varying_float now has a fast-math and full-precision implementation.
2012-07-05 17:05:42 -07:00
Jean-Luc Duprat
b9d1f0db18 Ongoing support for int64 for KNC:
Fixes to __load and __store.
Added __add, __mul, __equal, __not_equal, __extract_elements, __smear_i64, __cast_sext, __cast_zext,
and __scatter_base_offsets32_float.

__rcp_varying_float now has a fast-math and full-precision implementation.
2012-07-05 16:56:13 -07:00
Matt Pharr
6aad4c7a39 Bump version number to 1.3.1dev 2012-07-05 13:35:34 -07:00
Matt Pharr
4186ef204d Fix build with LLVM top of tree. 2012-07-05 13:35:01 -07:00
Matt Pharr
ae7a094ee0 Merge pull request #315 from NicolasT/master
Fix build on Fedora 17
2012-07-04 08:21:03 -07:00
Nicolas Trangez
3a007f939a Build: Include unistd.h where required
Some modules require an include of unistd.h (e.g. for getcwd and isatty
definitions).

These changes were required to build successfully on a Fedora 17 system,
using GCC 4.7.0 & glibc-headers 2.15.
2012-07-04 14:49:00 +02:00
Matt Pharr
b8503b9255 News and doxygen version number bump for 1.3.0 2012-06-29 08:38:38 -07:00
Matt Pharr
b7bc76d3cc Documentation updates for 1.3.0. 2012-06-29 08:35:29 -07:00
Matt Pharr
27d6c12972 Bump ISPC_MINOR_VERSION to 3 2012-06-28 16:15:46 -07:00
Matt Pharr
b69d783e09 Bump version to 1.3.0 2012-06-28 15:35:52 -07:00
Matt Pharr
3b2ff6301c Use fputs() rather than puts() for printing final result from print().
puts() sillily adds an undesired newline.
2012-06-28 12:29:40 -07:00
Matt Pharr
6c7043916e Silence bogus compiler warning 2012-06-28 12:11:56 -07:00
Matt Pharr
96a6e75b71 Fix issues with LLVM 3.0 and 3.1 build in cbackend.cpp
Should fix issue #312.
2012-06-28 12:11:27 -07:00
Matt Pharr
a91e4e7981 Fix missing ;s from 66d4c2ddd9 2012-06-28 12:04:58 -07:00
Jean-Luc Duprat
95d8f76ec3 Added preliminary support for Intel's Xeon Phi KNC processor.
float, int32, and double support is included; int8, int16, and int64 are
not supported yet.

This is work in progress and not considered stable yet.
2012-06-28 12:00:55 -07:00
Jean-Luc Duprat
66d4c2ddd9 When the --emit-c++ option is used, the state of the --opt=fast-math option is passed into the generated C++ code.
If --opt=fast-math is used then the generated code contains:
   #define ISPC_FAST_MATH 1
Otherwise it contains:
   #undef ISPC_FAST_MATH

This allows the generic headers to support the user's request.
2012-06-28 11:17:11 -07:00
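A sketch of how a generic intrinsics header can honor the define; the branches are comments only, since the specific approximations are implementation details:

    #ifdef ISPC_FAST_MATH
        // fast, lower-precision paths (e.g. rcp/rsqrt approximations without a refinement step)
    #else
        // full-precision implementations
    #endif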
Jean-Luc Duprat
8115ca739a Added preliminary support for Intel's Xeon Phi KNC processor.
float, int32, and double support is included; int8, int16, and int64 are not supported yet.
This is work in progress and not considered stable yet.
2012-06-28 10:54:09 -07:00
Jean-Luc Duprat
ec4021bbf4 When the --emit-c++ option is used, the state of the --opt=fast-math option is passed into the generated C++ code.
If --opt=fast-math is used then the generated code contains:
   #define ISPC_FAST_MATH 1
Otherwise it contains:
   #undef ISPC_FAST_MATH

This allows the generic headers to support the user's request.
2012-06-28 10:42:29 -07:00
Jean-Luc Duprat
e431b07e04 Changed the C API to use templates to indicate memory alignment to the C compiler
This should help with performance of the generated code.
Updated the relevant header files (sse4.h, generic-16.h, generic-32.h, generic-64.h)

Updated generic-32.h and generic-64.h to the new memory API
2012-06-28 09:29:15 -07:00
Matt Pharr
d34a87404d Provide (undocumented for now) __pause() call to emit PAUSE inst. 2012-06-28 09:28:25 -07:00
Matt Pharr
f38770bf2a Fix build with LLVM ToT 2012-06-28 07:36:10 -07:00
Jean-Luc Duprat
dc9998ccaf Missed a few minor fixes to generic-64.h in previous commit 2012-06-27 17:14:03 -07:00
Jean-Luc Duprat
f1b3703389 Changed the C API to use templates to indicate memory alignment to the C compiler
This should help with performance of the generated code.
Updated the relevant header files (sse4.h, generic-16.h, generic-32.h, generic-64.h)

Updated generic-32.h and generic-64.h to the new memory API
2012-06-27 16:59:26 -07:00
Jean-Luc Duprat
b6a8d0ee7f Merge branch 'master' of git://github.com/ispc/ispc 2012-06-27 10:15:24 -07:00
Jean-Luc Duprat
2a4dff38d0 cbackend.cpp now makes explicit use of the llvm namespace
(Rather than implicitly with a using declaration.)  This will
allow for some further changes to ISPC's C backend, without collision
with ISPC's namespace. This change aims to have no effect on the code
generated by the compiler; it should be a big no-op, except for its
side effects on maintainability.
2012-06-27 08:30:30 -07:00
Jean-Luc Duprat
665c564dcf cbackend.cpp now makes explicit use of the llvm namespace, rather than implicitly with a using declaration.
This will allow for some further changes to ISPC's C backend, without collision with ISPC's namespace.
This change aims to have no effect on the code generated by the compiler; it should be a big no-op, except
for its side effects on maintainability.
2012-06-26 22:15:31 -07:00
Jean-Luc Duprat
ed71413e04 Merge branch 'master' of git://github.com/ispc/ispc 2012-06-26 14:32:27 -07:00
Jean-Luc Duprat
4b5e49b00b Merge branch 'master' of github.com:jduprat/ispc 2012-06-26 14:32:01 -07:00
Matt Pharr
f558ee788e Fix bug with generating implicit zero initializer values.
Issue #300.
2012-06-26 11:58:16 -07:00
Matt Pharr
ceb8ca680c Fix crash in codegen for assert() with malformed program.
Issue #302.
2012-06-26 11:54:55 -07:00
Matt Pharr
79ebcbec4b Fix crash in SwitchStmt::TypeCheck() with malformed programs. 2012-06-26 11:21:33 -07:00
Matt Pharr
2c7b650240 Add FAQ to explain how to launch per-instance tasks with foreach_active and unmasked.
Issue #227.
2012-06-22 14:32:05 -07:00
Matt Pharr
54459255d4 Add unmasked { } statement.
This reestablishes an "all on" execution mask for the gang, which can
be useful for nested parallelism.
2012-06-22 14:30:58 -07:00
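A minimal sketch (function and variable names hypothetical) of the new statement and its effect on the execution mask:

    float zero_if_any_negative(float x) {
        float result = x;
        if (x < 0.) {
            unmasked {
                // the "all on" execution mask is re-established here, so every
                // program instance in the gang assigns result, not just those
                // for which x < 0
                result = 0.;
            }
        }
        return result;
    }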
Matt Pharr
b4a078e2f6 Add foreach_active iteration statement.
Issue #298.
2012-06-22 10:35:43 -07:00
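A minimal sketch of the construct; the names and the use of extract() are illustrative:

    void process_one_at_a_time(uniform float * uniform data, int index) {
        foreach_active (instanceNum) {
            // the body executes once for each currently-active program instance,
            // serially, so work that must not race (such as updating shared data
            // through a per-instance index) can be done safely here
            uniform int i = extract(index, instanceNum);
            data[i] += 1.;
        }
    }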
Matt Pharr
ed13dd066b Distinguish between 'regular' foreach and foreach_unique in FunctionEmitContext
We need to do this since it's illegal to have nested foreach statements, but
nested foreach_unique, or foreach_unique inside foreach, etc., are all fine.
2012-06-22 06:04:00 -07:00
Matt Pharr
2b4a3b22bf Issue an error if the user has nested foreach statements.
Partially addresses issue #280.  (We should support them properly,
but at least now we don't silently generate incorrect code.)
2012-06-21 16:53:27 -07:00
Matt Pharr
8b891da628 Allow referring to the struct type being defined in its members.
It's now legal to write:

struct Foo { Foo *next; };

previously, a predeclaration "struct Foo;" was required.  This fixes
issue #287.

This change also fixes a bug where multiple forward declarations 
"struct Foo; struct Foo;" would incorrectly issue an error on the
second one.
2012-06-21 16:44:04 -07:00
Matt Pharr
5a2c8342eb Allow structs with no members.
Issue #289.
2012-06-21 16:07:31 -07:00
Matt Pharr
50eb4bf53a Change print() implementation to accumulate string locally before printing.
The string to be printed is accumulated into a local buffer before being sent to
puts().  This ensure that if multiple threads are running and printing at the
same time, their output won't be interleaved (across individual print statements--
it still may be interleaved across different print statements, just like in C).

Issue #293.
2012-06-21 14:41:53 -07:00
Matt Pharr
3c10ddd46a Fix declaration of size_t.
It should be an unsigned integer type.
2012-06-21 14:40:24 -07:00
Matt Pharr
0b7f9acc70 Align <16 x i1> vectors to just 16 bits for generic targets.
Partially addresses issue #259.
2012-06-21 10:25:33 -07:00
Matt Pharr
10fbaec247 Fix C++ output for unordered fp compares.
Fixes a bug introduced in 46716aada3.
2012-06-21 09:57:19 -07:00
Matt Pharr
007a734595 Add support for 'unmasked' function qualifier. 2012-06-20 15:36:00 -07:00
Matt Pharr
46716aada3 Switch to unordered floating point compares.
In particular, this gives us desired behavior for NaNs (all compares
involving a NaN evaluate to true).  This in turn allows writing the
canonical isnan() function as "v != v".

Added isnan() to the standard library as well.
2012-06-20 13:25:53 -07:00
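A short sketch of what the change enables (function name hypothetical):

    bool is_nan(float v) {
        // with unordered compares, the canonical test works as intended:
        // v != v is true exactly when v is NaN (equivalently, use the new
        // stdlib isnan(v))
        return v != v;
    }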
Matt Pharr
3bc66136b2 Add foreach_unique iteration construct.
Idea via Ingo Wald / IVL compiler.
2012-06-20 10:04:24 -07:00
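A minimal sketch of the new construct (names hypothetical):

    void mark_seen(uniform bool * uniform seen, int bucket) {
        foreach_unique (b in bucket) {
            // the body runs once per distinct value in 'bucket'; within an iteration,
            // 'b' is uniform and only the instances whose bucket == b are active
            seen[b] = true;
        }
    }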
Matt Pharr
fae47e0dfc Update stdlib to not use "in" as a variable name.
Preparation for foreach_unique, which uses that as a keyword.
2012-06-20 10:04:24 -07:00
Matt Pharr
bd52e86486 Issue error on attempt to dereference void pointer types.
Issue #288.
2012-06-18 19:51:19 -07:00
Matt Pharr
b2f6ed7209 Fix usage of CastType 2012-06-18 16:26:31 -07:00
Matt Pharr
4b334fd2e2 Fix linkage for programIndex et al. when not debugging.
We now use InternalLinkage for the 'programIndex' symbol (and similar)
if we're not compiling with debugging symbols.  This prevents those
symbol names/definitions from polluting the global namespace for
the common case.

Basically addresses Issue #274.
2012-06-15 11:50:16 -07:00
Matt Pharr
a23a7006e3 Don't issue error incorrectly with forward decl. of exported function.
Issue #281.
2012-06-15 10:54:50 -07:00
Matt Pharr
f47171a17c Don't check for "all off" mask at function entry.
We should never be running with an all off mask and thus should never
enter a function with an all off mask.  No performance change from
removing this, however.

Issue #282.
2012-06-15 10:14:53 -07:00
Matt Pharr
4945dc3682 Add contributors link to docs HTML templates 2012-06-13 06:11:08 -07:00
Matt Pharr
ada66b5313 Make more attempts to pull out constant offsets for gather/scatter.
The "base+offsets" variants of gather decompose the integer offsets into
compile-time constant and compile-time unknown elements.  (The coalescing
optimization, then, depends on this decomposition being done well--having
as much as possible in the constant component.)  We now make multiple
efforts to improve this decomposition as we run optimization passes; in
some cases we're able to move more over to the constant side than was
first possible.

This in particular fixes issue #276, a case where coalescing was expected
but didn't actually happen.
2012-06-12 16:21:14 -07:00
Matt Pharr
96450e17a3 Do all memory op improvements in a single optimization pass.
Rather than having separate passes to do conversion, when possible, of:

- General gather/scatter of a vector of pointers to g/s of
  a base pointer and integer offsets
- Gather/scatter to masked load/store, load+broadcast
- Masked load/store to regular load/store

Now all are done in a single ImproveMemoryOps pass.  This change was in
particular to address some phase ordering issues that showed up with
multidimensional array access wherein after determining that an outer
dimension had the same index value, we previously weren't able to take
advantage of the uniformity of the resulting pointer.
2012-06-12 13:56:17 -07:00
Matt Pharr
40a295e951 Fix bug where "avx-x2" target would cause AVX1.1 to be used. 2012-06-12 13:37:38 -07:00
Matt Pharr
d6c6f95373 Do all replacements of __pseudo* memory ops in a single optimization pass.
Collected the old PseudoGSToGSPass and PseudoMaskedStorePass into a single
pass, ReplacePseudoMemoryOpsPass, which handles both of their tasks.
2012-06-12 13:10:03 -07:00
Matt Pharr
19b46be20d Remove load_and_broadcast from built-ins.
Now that we never ever run with the mask all off, we no longer need
that logic in a built-in function so that we can check the mask.  In
the one place where it was used (turning gathers to the same location
into a load and broadcast), we now just emit the code for that
directly.
2012-06-12 12:30:57 -07:00
Ingo Wald
789e04ce90 Add support for host/device stub functions for offload. 2012-06-12 10:23:49 -07:00
Matt Pharr
dd4f0a600b Update AVX1.1 targets to not include declarations of half/float routines in bit code. 2012-06-08 15:57:36 -07:00
Matt Pharr
6c7df4cb6b Add initial support for "avx1.1" targets for Ivy Bridge.
So far, only the use of the float/half conversion instructions distinguishes
this from the "avx1" target.

Partial work on issue #263.
2012-06-08 15:55:00 -07:00
Matt Pharr
79e0a9f32a Fix codegen bug with foreach_tiled.
When the outermost dimension(s) were partially active, but the innermost
dimension was all on, we'd inadvertently use an incorrect "all on"
execution mask.

Fixes issues #177 and #200.
2012-06-08 14:56:18 -07:00
Matt Pharr
6c9bc63a1c Improve SourcePos reporting of the origin of the gather for gather warnings. 2012-06-08 13:33:11 -07:00
Matt Pharr
28a821df7d Improve wording of gather/scatter performance warnings. 2012-06-08 13:32:57 -07:00
Matt Pharr
27e39954d6 Fix a number of issues in examples/intrinsics/sse4.h.
This had gotten fairly out of date, after recent changes to C++ output.
Roughly 15 tests still fail with this target.

Issue #278.
2012-06-08 12:52:36 -07:00
Matt Pharr
e730a5364b Issue error if any complex assignment operator is used with a struct type.
Issue #275.
2012-06-08 11:29:02 -07:00
Matt Pharr
92b3ae41dd Don't print request to file bug on fatal error twice. 2012-06-08 11:23:45 -07:00
Matt Pharr
89a2566e01 Add separate variants of memory built-ins for floats and doubles.
Previously, we'd bitcast e.g. a vector of floats to a vector of i32s and then
use the i32 variant of masked_load/masked_store/gather/scatter.  Now, we have
separate float/double variants of each of those.
2012-06-07 14:47:16 -07:00
Matt Pharr
1ac3e03171 Gather/scatter function improvements in builtins.
More naming consistency: _i32 rather than i32, now.

Also improved the m4 macros to generate these sequences to not require as
many parameters.
2012-06-07 14:19:23 -07:00
Matt Pharr
b86d40091a Improve naming of masked load/store instructions in builtins.
Now, use _i32 suffixes, rather than _32, etc.  Also cleaned up the m4
macro to generate these functions, using WIDTH to get the target width,
etc.
2012-06-07 13:58:31 -07:00
Matt Pharr
91d22d150f Update load_and_broadcast built-in
Change function suffix to "_i32", etc, from "_32"

Improve load_and_broadcast macro in util.m4 to grab vector width from 
WIDTH variable rather than taking it as a parameter.
2012-06-07 13:33:17 -07:00
Matt Pharr
1d29991268 Indentation fixes in builtins/ 2012-06-07 13:23:07 -07:00
Matt Pharr
6f0a2686dc Use %a format for printf() for float constants on non-Windows platforms. 2012-06-07 13:20:03 -07:00
Matt Pharr
f06caabb07 Generate better code for break statements in varying loops (sometimes).
If we have a simple varying 'if' statement where the only code in the body is
a single 'break', then emit special case code that just updates the execution
mask directly.

Surprisingly, this leads to better generated code (e.g. Mandelbrot 7.1x on AVX
vs 5.8x before).  It's not clear why the general code generation path for
break doesn't generate the equivalent code; this topic should be investigated
further.  (Issue #277).
2012-06-06 11:08:42 -07:00
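An illustrative, Mandelbrot-style sketch (names hypothetical) of the pattern that now takes the special-case path, a varying 'if' whose whole body is a single 'break':

    int escape_count(float z, uniform int maxIters) {
        int count = 0;
        for (uniform int i = 0; i < maxIters; ++i) {
            if (z > 4.)
                break;       // the entire 'if' body is one 'break': now emitted as
                             // a direct update of the execution mask
            z = z * z;
            ++count;
        }
        return count;
    }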
Matt Pharr
3c869802fb Always store multiply-used vector compares in temporary variables (C++ output). 2012-06-06 11:08:42 -07:00
Matt Pharr
7b6bd90903 Remove various equality checks between GetInternalMask() and LLVMMaskAllOn
These were never kicking in, since GetInternalMask() always loads from the
mask storage memory.
2012-06-06 11:08:42 -07:00
Matt Pharr
967bfa9c92 Silence compiler warning. 2012-06-06 08:08:55 -07:00
Matt Pharr
592affb984 Add experimental (and undocumented for now) export syntax.
This allows adding types to the list that are included in the automatically-generated
header files.

struct Foo { . . . };
struct Bar { . . . };

export { Foo, Bar };
2012-06-05 12:51:21 -07:00
Matt Pharr
96aaf6d53b Fix build with LLVM top of tree. 2012-06-05 12:28:05 -07:00
Matt Pharr
1397dbdabc Don't generate colorized output escapes when stderr isn't a TTY.
When piping to a file, more/less, etc., this is generally undesirable.

This behavior can be overridden with the --colorized-output command-line
flag.
2012-06-04 09:20:57 -07:00
Matt Pharr
6118643232 Handle more error cases if the user tries to declare a method. 2012-06-04 09:07:13 -07:00
Matt Pharr
71198a0b54 Don't indent too much in errors/warnings if the filename is long. 2012-06-04 08:53:43 -07:00
Matt Pharr
22cb80399f Issue error if user tries to declare a method. 2012-06-04 08:50:13 -07:00
Jean-Luc Duprat
fa1fd8a576 Merged Upstream 2012-06-01 11:13:16 -07:00
Matt Pharr
6df7d31a5b Fix incorrect assertion.
Issue #272.
2012-05-30 16:34:59 -07:00
Matt Pharr
ef049e92ef Handle undefined struct types when generating headers. 2012-05-30 16:28:21 -07:00
Matt Pharr
fe8b109ca5 Fix more tests for 32 and 64-wide execution. 2012-05-30 13:06:07 -07:00
Matt Pharr
8fd9b84a80 Update seed_rng() in stdlib to take a varying seed.
Previously, we were trying to take a uniform seed and then shuffle that
around to initialize the state for each of the program instances.  This
was becoming increasingly untenable and brittle.

Now a varying seed is expected and used.
2012-05-30 10:35:41 -07:00
Matt Pharr
5cb53f52c3 Fix various tests/[frs]* files to be correct with 32 and 64-wide targets.
Still todo: tests/c*, tests/test-*
2012-05-30 10:31:12 -07:00
Matt Pharr
d86653668e Fix a number of tests to work correctly with 32/64-wide targets.
Still to be reviewed/fixed: tests/test-*, tests/[cfrs]*
2012-05-29 10:16:43 -07:00
Matt Pharr
5084712a15 Fix bugs in examples/intrinsics/generic-64.h
There were a number of situations where we were left-shifting 1 by a
lane index that were failing due to shifting beyond 32 bits.  Fixed
by shifting the 64-bit constant value 1ull.
2012-05-29 08:31:10 -07:00
Jean-Luc Duprat
ece65cab18 Fix some tests for up to 64-wide gangs 2012-05-29 07:52:50 -07:00
Matt Pharr
1f6075506c Fix linux build (Jean-Luc Duprat) 2012-05-28 19:45:16 -07:00
Matt Pharr
51ade48e3d Fix some of the reduce-* tests for 32 and 64-wide targets 2012-05-25 14:47:06 -07:00
Matt Pharr
21c43737fe Fix bug in examples/intrinsics/generic-32.h 2012-05-25 14:27:30 -07:00
Matt Pharr
6c7bcf00e7 Add examples/intrinsics/generic-64.h. 2012-05-25 14:27:19 -07:00
Matt Pharr
7a2142075c Add examples/intrinsics/generic-32.h implementation.
Roughly 100 tests fail with this; all the tests need to be audited
for assumptions that 16 is the widest width possible…
2012-05-25 12:37:59 -07:00
Matt Pharr
e8e9baa417 Update test_static.cpp to handle up to 64-wide 2012-05-25 12:14:58 -07:00
Matt Pharr
449d956966 Add support for generic-64 target. 2012-05-25 11:57:28 -07:00
Matt Pharr
90db01d038 Represent MOVMSK'ed masks with int64s rather than int32s.
This allows us to scale up to 64-wide execution.
2012-05-25 11:57:23 -07:00
Matt Pharr
38cea6dc71 Issue error if "typedef" is inadvertently included in function definition.
Issue #267.
2012-05-25 11:09:26 -07:00
Matt Pharr
64807dfb3b Add AssertPos() macro that provides rough source location in error
It can sometimes be useful to know the general place we were in the program
when an assertion hit; when the position is available / applicable, this
macro is now used.

Issue #268.
2012-05-25 10:59:45 -07:00
Matt Pharr
d943455e10 Issue error on overloaded "export"ed functions.
Issue #270.
2012-05-25 10:35:34 -07:00
Matt Pharr
fd03ba7586 Export reference parameters as C++ references, not pointers. 2012-05-24 07:12:48 -07:00
Matt Pharr
2c5a57e386 Fix bugs related to varying pointers to functions that return void. 2012-05-23 14:29:17 -07:00
Matt Pharr
e8858150cb Allow redundant semicolons at global scope. (Ingo Wald) 2012-05-23 14:20:20 -07:00
Matt Pharr
333f901187 Fix build with LLVM 3.2 dev top-of-tree 2012-05-23 14:19:50 -07:00
Matt Pharr
7dd4d6c75e Update for LLVM 3.2dev API change 2012-05-22 15:53:14 -07:00
Matt Pharr
99f57cfda6 Issue more sensible error message for varying pointers in exported functions. 2012-05-18 12:00:11 -07:00
Matt Pharr
4d1eb94dfd Fix bug in AddElementOffset() error checking. 2012-05-18 11:57:05 -07:00
Matt Pharr
22d584f302 Don't issue perf. warnings for various conversions with generic target. 2012-05-18 11:56:11 -07:00
Matt Pharr
72c41f104e Fix various malformed program crashes. 2012-05-18 10:44:45 -07:00
Matt Pharr
8d3ac3ac1e Fix build with LLVM ToT 2012-05-18 10:09:09 -07:00
Matt Pharr
299ae186f1 Expect support for half and transcendentals from all generic targets 2012-05-18 06:13:45 -07:00
Matt Pharr
f4df2fb176 Improvements to mask update code for generic targets.
Rather than XOR'ing with a temporary 'all-on' vector, we call
__not.  Also, we call out to __and_not1 and __and_not2, for an
AND where the first or second operand, respectively, has had
NOT applied to it.
2012-05-16 13:52:51 -07:00
Matt Pharr
625fbef613 Fix Windows build 2012-05-15 12:19:10 -07:00
Matt Pharr
fbed0ac56b Remove allOffMaskIsSafe from Target
The intent of this was to indicate whether it was safe to run code
with an 'all off' mask on the given target (and then sometimes be
more flexible about e.g. running both true and false blocks of if
statements, etc.)

The problem is that even if the architecture has full native mask support,
it's still not safe to run 'uniform' memory operations with the mask all
off.  Even more tricky, we sometimes transform masked varying memory operations
to uniform ones during optimization (e.g. gather->load and broadcast).

This fixes a number of the tests/switch-* tests that were failing on the
generic targets due to this issue.
2012-05-09 14:18:47 -07:00
Matt Pharr
dc120f3962 Fix regression in masked_store_blend for generic target.
In ee1fe3aa9f, the LLVM_VERSION define was updated to never
have the 'svn' suffix and the build was updated to handle LLVM
3.2.  This file had a check for LLVM_3_1svn that was no longer
hitting.

This fixes some issues with unnecessary loads and stores
in generated C++ code for the generic targets.
2012-05-09 14:18:47 -07:00
Matt Pharr
4f053e5b83 Pass OPT flags when linking 2012-05-08 13:25:09 -07:00
Matt Pharr
c6241581a0 Add an extra parameter to __smear functions to encode return type.
Now, the __smear* functions in generated C++ code have an unused first
parameter of the desired return type; this allows us to have headers
that include variants of __smear for multiple target widths.  (This
approach is necessary since we can't overload by return type in C++.)

Issue #256.
2012-05-08 09:54:23 -07:00
Nipunn Koorapati
041ade66d5 Placated compiler by initializing variable 2012-05-06 06:59:17 -07:00
Nipunn Koorapati
067a2949ba Added syntax highlighting for 'uniform' and 'varying' types. 2012-05-06 06:58:53 -07:00
Matt Pharr
55c754750e Remove a number of redundant/unneeded optimization passes.
Performance and code quality of performance suite is unchanged,
compilation times are improved by another 20% or so for simple
programs (e.g. rt.ispc).  One very complex program compiles
about 2.4x faster now.
2012-05-05 15:47:24 -07:00
Matt Pharr
72b6c12856 Notify LLVM pass mgr that the MakeInternalFuncsStaticPass doesn't change the CFG. 2012-05-05 15:47:24 -07:00
Matt Pharr
15ea0af687 Add -f option to run_tests.py
This allows providing additional command-line arguments to ispc,
e.g. to force compilation with -O1, -g, etc.
2012-05-05 15:47:24 -07:00
Matt Pharr
ee7e367981 Do global dead code elimination early in optimization.
This gives a 15-20% speedup in compilation time for simple
programs (but only ~2% for the big 21k monster program).
2012-05-05 15:47:19 -07:00
Matt Pharr
8006589828 Use llvm::SmallVectors for struct member types and function types.
Further reduction of dynamic memory allocation...
2012-05-04 13:55:38 -07:00
Matt Pharr
413264eaae Make return values const &s to save copying. 2012-05-04 13:55:38 -07:00
Matt Pharr
7db8824da2 Reduce dynamic memory allocation in getting unif/varying variants of AtomicTypes 2012-05-04 13:55:38 -07:00
Matt Pharr
e1bc010bd1 More reduction of dynamic allocations in lDoTypeConv() 2012-05-04 13:55:38 -07:00
Matt Pharr
bff02017da Cache const/non-const variants of Atomic and ReferenceTypes.
More reduction of dynamic memory allocation.
2012-05-04 13:55:38 -07:00
Matt Pharr
c0019bd8e5 Cache type and lvalue type in IndexExpr and MemberExpr
This saves a bunch of redundant work and unnecessary duplicated
memory allocations.
2012-05-04 13:55:38 -07:00
Matt Pharr
e495ef2c48 Reduce dynamic memory allocation by reusing scope maps in symbol table. 2012-05-04 13:55:38 -07:00
Matt Pharr
78d62705cc Cache element types in StructType.
Previously, GetElementType() would end up causing dynamic allocation to
happen to compute the final element type (turning types with unbound
variability into the same type with the struct's variability) each it was
called, which was wasteful and slow.  Now we cache the result.

Another 20% perf on compiling that problematic program.
2012-05-04 13:55:38 -07:00
Matt Pharr
2791bd0015 Improve performance of lCheckTypeEquality()
We don't need to explicitly create the non-const Types to do type
comparison when ignoring const-ness in the check.

We can also save some unnecessary dynamic memory allocation by
keeping strings returned from GetStructName() as references to strings.

This gives another 10% on front-end perf on that big program.
2012-05-04 13:55:38 -07:00
Matt Pharr
7cf66eb61f Small optimizations to various AtomicType methods. 2012-05-04 13:55:38 -07:00
Matt Pharr
944c53bff1 Stop using dynamic_cast for Types.
We now have a set of template functions CastType<AtomicType>, etc., that in
turn use a new typeId field in each Type instance, allowing them to be inlined
and to be quite efficient.

This improves front-end performance for a particular large program by 28%.
2012-05-04 13:55:38 -07:00
Matt Pharr
c756c855ea Compile with -O2 by default on Linux/OSX. 2012-05-04 13:55:37 -07:00
Matt Pharr
58bb2826b2 Perf: cache connection between const/non-const struct variants.
In one very large program, we were spending quite a bit of time repeatedly
getting const variants of StructTypes.  This speeds up the front-end by
about 40% for that test case.

(This is something of a band-aid, pending uniquing types.)
2012-05-04 13:55:37 -07:00
Nipunn Koorapati
b7bef87a4d Added README for vim syntax highlighting. 2012-05-03 14:23:33 -07:00
Matt Pharr
0c1b206185 Pass log/exp/pow transcendentals through to targets that support them.
Currently, this is the generic targets.
2012-05-03 13:49:56 -07:00
Matt Pharr
7d7e99a92c Update ISPC_MINOR_VERSION to 2
(This should have been done with the 1.2.0 release!)
2012-05-03 12:04:24 -07:00
Matt Pharr
1ba8d7ef74 Fix test that had undefined behavior. 2012-05-03 11:11:21 -07:00
Matt Pharr
d99bd279e8 Add generic-32 target. 2012-05-03 11:11:06 -07:00
Matt Pharr
ee1fe3aa9f Update build to handle existence of LLVM 3.2 dev branch.
We now compile with LLVM 3.0, 3.1, and 3.2svn.
2012-05-03 08:25:25 -07:00
Matt Pharr
c4b1d79c5c When a function is defined, set its symbol's position to the code position.
Before, if the function was declared before being defined, then the symbol's
SourcePos would be left set to the position of the declaration.  This ended
up getting the debugging symbols mixed up in this case, which was undesirable.
2012-04-28 20:28:39 -07:00
Matt Pharr
a1a43cdfe0 Fix bug so that programIndex (et al.) are available in the debugger.
It's now possible to successfully print out the value of programIndex,
programCount, etc., in the debugger.  The issue was that they were
defined as having InternalLinkage, which meant that DCE removed them
at the end of compilation.  Now they're declared to have WeakODRLinkage,
which ensures that one copy survives (but there aren't multiply-defined
symbols when compiling multiple files.)
2012-04-28 17:12:57 -07:00
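Schematically, the linkage change in LLVM C++ API terms (the variable name programIndexGlobal is invented for this sketch):

    // Before: internal linkage let dead-code elimination strip the symbol,
    // so the debugger had nothing left to print.
    // programIndexGlobal->setLinkage(llvm::GlobalValue::InternalLinkage);

    // After: weak ODR linkage keeps one copy of the symbol alive while still
    // allowing every compiled module to define it without multiple-definition
    // link errors.
    programIndexGlobal->setLinkage(llvm::GlobalValue::WeakODRLinkage);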
Matt Pharr
27b62781cc Fix bug in lStripUnusedDebugInfo().
This was causing an assert to hit in llvm's DwarfDebug.cpp.
2012-04-28 13:06:29 -10:00
Matt Pharr
0c5d7ff8f2 Add rygorous's float->srgb8 conversion routine to the stdlib.
Issue #230
2012-04-27 10:03:19 -10:00
Matt Pharr
0e2b315ded Add FAQ about foreach code generation.
(i.e. "why's there that extra stuff at the end and what can I do
about it if it's not necessary?")

Issue #231.
2012-04-27 09:35:37 -10:00
Matt Pharr
3e74d1c544 Fix documentation bug with typedef. 2012-04-25 17:15:20 -10:00
Matt Pharr
da690acce5 Fix build with LLVM 3.0 2012-04-25 14:27:33 -10:00
Matt Pharr
0baa2b484d Fix multiple bugs related to DIBuilder::createFunction() call.
The DIType passed to this method should correspond to the
FunctionType of the function, not its return type.

The first parameter should be the DIScope for the compile unit,
not the DIFile.

We previously had the unmangled function name and the mangled
function name interchanged.

The argument corresponding to "first line number of the function" was
missing, which in turn led to subsequent arguments being off, and thus
providing bogus values vs. what was supposed to be passed.

Rename FunctionEmitContext::diFunction to diSubprogram, to better
reflect its type.
2012-04-25 08:43:11 -10:00
Matt Pharr
260d7298c3 Strip unused debugging metadata after done with compilation.
Debugging information for functions that are inlined or static and
not used still hangs around after compilation; now we go through the
debugging info and remove the entries for any DISubprograms that
don't have their original functions left in the Module after
optimization.
2012-04-25 08:43:11 -10:00
Matt Pharr
d5cc2ad643 Call Verify() methods of various debugging llvm::DI* types after creation. 2012-04-25 08:43:11 -10:00
Matt Pharr
12706cd37f Debugging optimization pass updates
Don't run mem2reg with -O0 anymore, but do run the intrinsics opt pass, which
allows some CFG simplification due to the mask being all on, etc.
2012-04-25 08:43:11 -10:00
Matt Pharr
7167442d6e Debugging info: include parameter number for function params. 2012-04-25 08:43:11 -10:00
Matt Pharr
8547101c4b Debugging info: produce more descriptive producer string 2012-04-25 08:43:11 -10:00
Matt Pharr
5d58a9e4c2 Merge pull request #250 from jfpoole/master
Fix 32-bit samples on Mac OS X.
2012-04-23 17:12:46 -07:00
John Poole
cd98a29a4b Fix 32-bit samples on Mac OS X.
On Mac OS X and Linux rdtsc() didn't save and restore 32-bit registers.

This patch fixes issue #87.
2012-04-23 16:00:07 -07:00
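For reference, the usual way to write rdtsc() so the compiler knows which registers the instruction writes; this is a generic sketch, not necessarily the exact patch:

    #include <stdint.h>

    static inline uint64_t rdtsc() {
        uint32_t lo, hi;
        // "=a"/"=d" declare EAX/EDX as outputs, so the compiler saves and
        // restores them as needed instead of having the instruction silently
        // clobber them.
        __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
        return ((uint64_t)hi << 32) | lo;
    }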
Matt Pharr
903714fd40 Merge pull request #248 from nipunn1313/master
Goto with incorrect label now suggests labels based on string distance
2012-04-21 14:43:57 -07:00
Nipunn Koorapati
138c7acf22 Error() and Warning() functions for reporting compiler errors/warnings now respect newlines as part of valid error messages. 2012-04-21 01:44:10 -04:00
Matt Pharr
03b2b8ae8f Bump version number to 1.2.3dev 2012-04-20 14:31:46 -07:00
Matt Pharr
016b502d46 Update release notes for 1.2.2, bump version number in doxygen 2012-04-20 14:26:00 -07:00
Matt Pharr
c5f6653564 Bump version number to 1.2.2 2012-04-20 11:54:12 -07:00
Matt Pharr
cf9a4e209e Fix malformed program crash. 2012-04-20 11:53:43 -07:00
Nipunn Koorapati
040421942f Goto statements with a bad label produce an error message.
Now it also produces a short list of suggestions based on string distance.
2012-04-20 14:42:14 -04:00
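A hypothetical sketch of how label suggestions by string distance can be computed (names invented for illustration):

    #include <algorithm>
    #include <string>
    #include <vector>

    // Classic Levenshtein edit distance.
    static int EditDistance(const std::string &a, const std::string &b) {
        std::vector<int> prev(b.size() + 1), cur(b.size() + 1);
        for (size_t j = 0; j <= b.size(); ++j) prev[j] = (int)j;
        for (size_t i = 1; i <= a.size(); ++i) {
            cur[0] = (int)i;
            for (size_t j = 1; j <= b.size(); ++j)
                cur[j] = std::min(std::min(prev[j] + 1, cur[j - 1] + 1),
                                  prev[j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1));
            prev.swap(cur);
        }
        return prev[b.size()];
    }

    // Return labels within a small edit distance of the misspelled one.
    static std::vector<std::string>
    SuggestLabels(const std::string &bad, const std::vector<std::string> &labels) {
        std::vector<std::string> matches;
        for (size_t i = 0; i < labels.size(); ++i)
            if (EditDistance(bad, labels[i]) <= 2)
                matches.push_back(labels[i]);
        return matches;
    }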
Matt Pharr
4dfc596d38 Fix MSVC warnings. 2012-04-20 10:50:39 -07:00
Matt Pharr
fe83ef7635 Merge pull request #247 from nipunn1313/master
Fixed compiler warning
2012-04-20 09:26:57 -07:00
Nipunn Koorapati
db8b08131f Fixed compile error which shows up on LLVM 3.0 2012-04-20 12:17:09 -04:00
Matt Pharr
32815e628d Improve naming of llvm Instructions created.
We now try harder to keep the names of instructions related to the
initial names of variables they're derived from and so forth.  This
is useful for making both LLVM IR as well as generated C++ code
easier to correlate back to the original ispc source code.

Issue #244.
2012-04-19 16:36:46 -07:00
Matt Pharr
71bdc67a45 Add LLVMGetName() utility routines.
Infrastructure for issue #244.
2012-04-19 16:24:40 -07:00
Matt Pharr
cb9f50ef63 C++ backend: mangle variable names less.
This makes the generated code a little easier to connect with the
original program.
2012-04-19 13:11:47 -07:00
Matt Pharr
12c754c92b Improved handling of splatted constant vectors in C++ backend.
Now, when we're printing out a constant vector value, we check to see
if it's a splat and call out to one of the __splat_* functions in
the generated code if so.
2012-04-19 13:11:15 -07:00
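Roughly what the splat check looks like against the LLVM C++ API of that era (EmitSplatCall and EmitFullVectorLiteral are invented names, and the exact API differs a bit between LLVM 3.0 and 3.1):

    // When printing a constant vector in the generated C++, check for a splat
    // first and call a helper instead of spelling out every element.
    llvm::Constant *splat = NULL;
    if (llvm::ConstantDataVector *cdv =
            llvm::dyn_cast<llvm::ConstantDataVector>(constantVec))
        splat = cdv->getSplatValue();
    if (splat != NULL)
        EmitSplatCall(splat);            // e.g. emits "__splat_i32(42)"
    else
        EmitFullVectorLiteral(constantVec);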
Matt Pharr
e4b3d03da5 When available, use ANSI escapes to colorize diagnostic output.
Issue #245.
2012-04-19 11:36:28 -07:00
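A minimal sketch of the usual approach, colorizing only when stderr is a terminal (not the literal ispc implementation):

    #include <stdio.h>
    #include <unistd.h>

    static void PrintError(const char *msg) {
        // Only emit escape codes when stderr is actually a terminal.
        if (isatty(fileno(stderr)))
            fprintf(stderr, "\033[1;31mError\033[0m: %s\n", msg);  // bold red "Error"
        else
            fprintf(stderr, "Error: %s\n", msg);
    }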
Matt Pharr
cc26b66e99 Improve source position reporting for scatters.
Now, we only highlight the memory write--not both sides of the
assignment expression.
2012-04-19 11:23:20 -07:00
Matt Pharr
34d81fa522 Fix bugs in tests.
These two tests were walking past the end of the aFOO[] array, which
in turn was leading to failures with the generic-16/c++ output path.
2012-04-19 10:33:33 -07:00
Matt Pharr
49f1a5c2b3 Add print() statements to tests to indicate failure details.
These tests all fail with generic-16/c++ output currently; however, the
output indicates that it's just small floating-point differences.
(Though the question remains, why are those differences popping up?)
2012-04-19 10:32:55 -07:00
Matt Pharr
326c45fa17 Fix bugs in LLVMExtractFirstVectorElement().
When we're manually scalarizing the extraction of the first element
of a vector value, we need to be careful about handling constant values
and about where new instructions are inserted.  The old code was
sloppy about this, which in turn led to invalid IR in some cases.
For example, the two bugs below were essentially due to generating
an extractelement inst from a zeroinitializer value and then inserting
it in the wrong bblock such that a phi node that used that value was
malformed.

Fixes issues #240 and #229.
2012-04-19 09:45:04 -07:00
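The two cases described above, sketched against the LLVM C++ API (function and variable names are illustrative):

    // Extract element 0 of 'vec' as a scalar value.
    static llvm::Value *ExtractFirstElement(llvm::Value *vec,
                                            llvm::Instruction *insertBefore) {
        // Constants (including zeroinitializer) can be folded directly;
        // creating an extractelement instruction from one and dropping it
        // into an arbitrary block is what produced the malformed phi nodes.
        if (llvm::Constant *c = llvm::dyn_cast<llvm::Constant>(vec))
            return c->getAggregateElement(0u);

        // Otherwise create the instruction at a well-defined insertion point
        // rather than wherever the builder happens to be positioned.
        llvm::Value *zero = llvm::ConstantInt::get(
            llvm::Type::getInt32Ty(vec->getContext()), 0);
        return llvm::ExtractElementInst::Create(vec, zero, "first_elt",
                                                insertBefore);
    }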
Matt Pharr
a2bb899a6b Opt debug printing improvement
Now, just match the prefix of the provided function name of interest,
which means we don't have to worry about name-mangling details.
2012-04-19 09:34:54 -07:00
Matt Pharr
9fedb1674e Improve basic block dumping from optimization passes.
Now done via a macro, which is cleaner.  It's also now possible to
specify a single function to watch, which is useful for debugging.
2012-04-18 15:46:18 -07:00
Matt Pharr
7c91b01125 Handle more forms of constant vectors in lGetMask().
Various optimization passes depend on turning a compile-time constant
mask into a bit vector; it turns out that in LLVM3.1, constant vectors
of ints/floats are represented with llvm::ConstantDataVector, but
constant vectors of bools use llvm::ConstantVector (which is what LLVM
3.0 uses for all constant vectors).  Now lGetMask() always does the
llvm::ConstantVector path, to cover this case.

This improves generated C++ code by eliminating things like select
with an all on/off mask, turning movmask calls with constants into
constant values, etc.
2012-04-18 11:39:11 -07:00
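A rough sketch of walking the lanes of either constant representation to build the bit mask (hypothetical code, not lifted from opt.cpp):

    // Works for both llvm::ConstantVector (bool vectors, and everything on
    // LLVM 3.0) and llvm::ConstantDataVector (int/float vectors on LLVM 3.1).
    static bool GetCompileTimeMask(llvm::Constant *mask, int nLanes,
                                   uint64_t *bits) {
        *bits = 0;
        for (int i = 0; i < nLanes; ++i) {
            llvm::Constant *lane = mask->getAggregateElement((unsigned)i);
            llvm::ConstantInt *ci = llvm::dyn_cast_or_null<llvm::ConstantInt>(lane);
            if (ci == NULL)
                return false;                // lane isn't a compile-time constant
            if (!ci->isZero())
                *bits |= (1ull << i);
        }
        return true;
    }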
Matt Pharr
c202e9e106 Add debugging printing code to optimization passes.
Now all of the passes dump out the basic block before and after
they do their thing when --debug is enabled.
2012-04-18 11:39:10 -07:00
Matt Pharr
645a8c9349 Fix serious bug in VSelMovmskOpt
When the mask was all off, we'd choose the incorrect operand!

(This bug was masked since this optimization wasn't triggering as
intended, due to other issues to be fixed in a forthcoming commit.)
2012-04-18 11:39:10 -07:00
Jean-Luc Duprat
093fdcf3df Fixed bad integration 2012-04-18 09:39:54 -07:00
Jean-Luc Duprat
7abda5e8c2 Merge branch 'master' of git://github.com/ispc/ispc 2012-04-18 09:24:35 -07:00
Matt Pharr
abf7c423bb Fix build with LLVM 3.0 2012-04-18 06:14:55 -07:00
Matt Pharr
55d5c07d00 Issue errors when doing illegal things with incomplete struct types.
Issue an error, rather than crashing, if the user has declared a
struct type but not defined it and subsequently tries to:

- dynamically allocate an instance of the struct type
- do pointer math with a pointer to the struct type
- compute the size of the struct type
2012-04-18 06:08:05 -07:00
Jean-Luc Duprat
0a9b272fe4 Merge branch 'master' of git://github.com/ispc/ispc 2012-04-17 15:34:36 -07:00
Matt Pharr
b9d6ba2aa0 Always set target info, even when compiling to generic targets.
This allows the SROA pass to eliminate a lot of allocas and loads and
stores, which helps a lot for performance.
2012-04-17 15:10:30 -07:00
Matt Pharr
a0c9f7823b C++ backend fixes.
Handle calls to llvm.trap()
Declare functions before globals
Handle memset()
2012-04-17 15:09:42 -07:00
Jean-Luc Duprat
4477a9c59a Merge branch 'master' of git://github.com/ispc/ispc
Conflicts:
	decl.cpp
2012-04-17 10:38:07 -07:00
Matt Pharr
99a27fe241 Add support for forward declarations of structures.
Now a declaration like 'struct Foo;' can be used to establish the
name of a struct type, without providing a definition.  One can
pass pointers to such types around the system, but can't do much
else with them (as in C/C++).

Issue #125.
2012-04-16 06:27:21 -07:00
Matt Pharr
fefa86e0cf Remove LLVM_TYPE_CONST #define / usage.
Now with LLVM 3.0 and beyond, types aren't const.
2012-04-15 20:11:27 -07:00
Matt Pharr
098c4910de Remove support for building with LLVM 2.9.
A forthcoming change uses some features of LLVM 3.0's new type
system, and it's not worth back-porting this to also all work
with LLVM 2.9.
2012-04-15 20:08:51 -07:00
Matt Pharr
17b7148300 Initial implementation of FunctionType::GetDIType 2012-04-13 19:50:45 -07:00
Matt Pharr
f4a2ef28e3 Fix crashes from malformed programs. 2012-04-13 19:42:07 -07:00
Matt Pharr
f0d013ee76 Fix incorrect assert. Issue #241 2012-04-12 20:19:41 -07:00
Matt Pharr
5ece6fec04 Substantial rewrite (again) of decl handling.
The decl.* code now no longer interacts with Symbols, but just returns
names, types, initializer expressions, etc., as needed.  This makes the
code a bit more understandable.

Fixes issues #171 and #130.
2012-04-12 17:28:30 -07:00
Matt Pharr
d88dbf3612 Fix two bugs with resolving unbound variability.
We still need to call ResolveUnboundVariability even if the
type returns false from HasUnboundVariability; we may have,
for example, a pointer type where the pointer is resolved,
but the pointed-to type is unresolved.

Fixes issue #228.
2012-04-12 11:40:28 -07:00
Matt Pharr
2a18efef82 Use type conversion machinery when processing expr lists for initializers.
Once we're down to something that's not another nested expr list, use 
TypeConvertExpr() to convert the expression to the type we need.  This should
allow simplifying a number of the GetConstant() implementations, to remove
partial reimplementation of type conversion there.

For now, this change finishes off issue #220.
2012-04-12 11:23:02 -07:00
Matt Pharr
fd846fbe77 Fix bug in __gather_base_offsets_32.
In short, we weren't correctly zeroing the compile-time constant portion
of the offsets for lanes that aren't executing. (!)

Fixes issue #235.
2012-04-12 10:28:15 -07:00
Matt Pharr
ca7cc4744e Fix bug with taking references of temporaries.
Previously, the compiler would crash if e.g. the program passed a
temporary value to a function taking a const reference.  This change
fixes ReferenceExpr::GetValue() to handle this case and allocate
temporary storage for the temporary so that the pointer to that
storage can be used for the reference value.
2012-04-12 06:08:19 -07:00
Matt Pharr
491fa239bd Add atomic swap and cmpxchg for void * as well.
Issue #232.
2012-04-11 06:12:31 -07:00
Matt Pharr
66765dc123 Fix printing of function overload candidates in error message. 2012-04-11 06:11:52 -07:00
Matt Pharr
70a5348f43 Add size_t, ptrdiff_t, and [u]intptr_t types. 2012-04-11 05:32:53 -07:00
Matt Pharr
2aa61007c6 Remove memory_barrier() calls from atomics.
This was unnecessary overhead to impose on all callers; the user
should handle these as needed on their own.

Also added some explanatory text to the documentation that highlights
that memory_barrier() is only needed across HW threads/cores, not
across program instances in a gang.
2012-04-10 19:37:03 -07:00
Matt Pharr
acfbe77ffc Fix typo. 2012-04-10 19:27:37 -07:00
Matt Pharr
08696653ca Don't include struct member types in mangled string.
Not only was this quite verbose, it was unnecessary since we do type
equality by name.  This also needed to be fixed before we could
handle structs declared like "struct Foo;", when we then e.g. have
other structs with Foo * members.
2012-04-10 19:27:31 -07:00
Matt Pharr
8a1a214ca9 Provide required alignment when generating debug info for pointer types. 2012-04-09 14:36:39 -07:00
Matt Pharr
7aaeb27e0f Remove duplicate test. 2012-04-09 14:23:17 -07:00
Matt Pharr
972043c146 Fix serious bug in handling constant-valued initializers.
In InitSymbol(), we try to be smart and emit a memcpy when there
are a number of values to store (e.g. for arrays, structs, etc.)

Unfortunately, this wasn't working as desired for bools (i.e. i1 types),
since the SizeOf() call that tried to figure out how many bytes to
copy would return 0 bytes, due to dividing the number of bits to copy
by 8.

Fixes issue #234.
2012-04-09 14:23:08 -07:00
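The underlying arithmetic: an i1 has one bit, so dividing the bit count by 8 truncates to zero bytes. A sketch of the usual round-up fix (illustrative variable names):

    // Wrong: an i1 is 1 bit, so 1 / 8 == 0 and the memcpy copies nothing.
    //     unsigned bytes = bitSize / 8;
    // Right: round up to whole bytes.
    unsigned bytes = (bitSize + 7) / 8;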
Matt Pharr
8475dc082a Bump version number to 1.2.2dev 2012-04-06 16:16:50 -07:00
Matt Pharr
d0e583b29c Release notes and doxygen version number bump for 1.2.1 2012-04-06 16:02:19 -07:00
Matt Pharr
c8feee238b Bump release number to 1.2.1 2012-04-06 15:30:54 -07:00
Matt Pharr
6712ecd928 Merge pull request #233 from nipunn1313/master
Ability to point build to custom version of llvm and clang
2012-04-06 15:24:12 -07:00
Nipunn Koorapati
d0c7b5d35c Merge remote-tracking branch 'upstream/master' 2012-04-06 17:58:21 -04:00
Nipunn Koorapati
802add1f97 Added to the Makefile the ability to point to a
custom installation of llvm and clang.
2012-04-06 17:54:55 -04:00
Matt Pharr
95556811fa Fix linux build 2012-04-05 20:39:39 -07:00
Matt Pharr
581472564d Print "friendly" ispc message when abort/seg fault signal is thrown.
Make crashes that happen in LLVM less inscrutable.

Issue #222.
2012-04-05 15:51:44 -07:00
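A minimal sketch of the approach, assuming handlers for SIGSEGV/SIGABRT that print a note before exiting (illustrative, not the actual ispc code):

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static void lSignalHandler(int sig) {
        // fprintf() isn't strictly async-signal-safe, but for a crash path
        // this is the usual pragmatic choice.
        fprintf(stderr, "ispc: internal compiler error (signal %d). "
                        "Please file a bug with the input that triggered it.\n", sig);
        _exit(1);
    }

    static void lInstallSignalHandlers() {
        signal(SIGSEGV, lSignalHandler);
        signal(SIGABRT, lSignalHandler);
    }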
Matt Pharr
c7dc8862a5 Add FAQs about various language details.
One of these finishes off issue #225.
2012-04-05 15:24:26 -07:00
Matt Pharr
4f8cf019ca Add pass to verify module before starting optimizations. 2012-04-05 08:49:39 -07:00
Matt Pharr
4c9ac7fcf1 Fix build with LLVM 2.9. 2012-04-05 08:22:40 -07:00
Matt Pharr
1dac05960a Fix build with LLVM 3.1 ToT 2012-04-05 08:17:56 -07:00
Matt Pharr
c27418da77 Add checks about references to non-lvalues.
Both ReturnStmt and DeclStmt now check the values being associated
with references to make sure that they are legal (e.g. it's illegal
to assign a varying lvalue, or a compile-time constant to a reference
type).  Previously we didn't catch this and would end up hitting
assertions in LLVM when code did this stuff.

Mostly fixes issue #225 (except for adding a FAQ about what this
error message means.)
2012-04-04 05:56:22 -07:00
Matt Pharr
637d076e99 Remove half/float conversion functions from AVX2 output.
(We were leaving around unused/unnecessary __half_to_float_uniform 
and the like, which in turn called out to the corresponding instruction.)
2012-04-03 12:18:38 -07:00
Matt Pharr
391678a5b3 Update function overload resolution logic.
Closer compatibility with C++: given a non-reference type, treat matching
to a non-const reference of that type as a better match than a const
reference of that type (rather than both being equal cost).

Issue #224.
2012-04-03 10:40:41 -07:00
Matt Pharr
4cd0cf1650 Revamp handling of function types, conversion to function ptr types.
Implicit conversion to function types is now a more standard part of
the type conversion infrastructure, rather than special cases of things
like FunctionSymbolExpr immediately returning a pointer type, etc.

Improved AddressOfExpr::TypeCheck() to actually issue errors in cases
where it's illegal to take the address of an expression.

Added AddressOfExpr::GetConstant() implementation that handles taking
the address of functions.

Issue #223.
2012-04-03 10:09:07 -07:00
Matt Pharr
b813452d33 Don't issue a slew of warnings if a bogus cpu type is specified.
Issue #221.
2012-04-03 06:13:28 -07:00
Matt Pharr
eb85da81e1 Further improvements to error reporting with function types.
Issue #219.
2012-04-03 05:55:50 -07:00
Matt Pharr
920cf63201 Improve error message about incompatible function types.
When reporting that a function has illegally been overloaded only
by return type, include "task", "export", and "extern "C"", as appropriate
in the error message to make clear what the issue is.

Finishes issue #216.
2012-04-03 05:43:23 -07:00
Matt Pharr
dc09d46bf4 Don't emit type declarations for extern'ed globals in generated header files.
This actually wasn't a good idea, since we'd like ispc programs to be able to
have varying globals that they use internally among ispc code, without having
errors about varying globals when generating headers.

Issue #214.
2012-04-03 05:36:21 -07:00
Matt Pharr
05d1b06eeb Fixes to get the C++ backend more working again. 2012-03-30 16:56:30 -07:00
Matt Pharr
c1661eb06b Allow calling GetAs{Non}ConstType() for FunctionTypes.
It's just a no-op, though, rather than an assertion failure as before.
2012-03-30 16:56:30 -07:00
Jean-Luc Duprat
e9626a1d10 Added macro PRId64 to opt.cpp for compilation on Windows 2012-03-30 16:56:30 -07:00
Matt Pharr
560bf5ca09 Updated logic for selecting target ISA when not specified.
Now, if the user specified a CPU then we base the ISA choice on that--only
if no CPU and no target is specified do we use the CPUID-based check to
pick a vector ISA.

Improvement to the fix for #205.
2012-03-30 16:36:12 -07:00
Jean-Luc Duprat
512f8d8b60 Fixed binary AND to logical AND 2012-03-29 17:03:22 -07:00
Matt Pharr
87c8a89349 Make 'export' a type qualifier, not a storage class.
In particular, this makes it legal to do "extern export foo()", among
other things.

Partially addresses issue #216.
2012-03-29 13:16:55 -07:00
Matt Pharr
255791f18e Fix to get correct variable names for extern globals that are later defined. 2012-03-29 11:50:15 -07:00
Matt Pharr
d5e3416e8e Fix bug in default argument handling introduced in 540fc6c2f3 2012-03-28 14:29:58 -07:00
Matt Pharr
5b2d43f665 Fix global variable code to correctly handle extern declarations.
When we have an "extern" global, now we no longer inadvertently define
storage for it.  Further, we now successfully do define storage when we
encounter a definition following one or more extern declarations.

Issues #215 and #217.
2012-03-28 14:15:49 -07:00
Matt Pharr
540fc6c2f3 Fix bugs with default parameter values for pointer-typed function parameters.
In particular "void foo(int * ptr = NULL)" and the like work now.

Issue #197.
2012-03-28 11:51:56 -07:00
Matt Pharr
b3c5043dcc Don't enable llvm's UnsafeFPMath option when --opt=fast-math is supplied.
This was causing functions like round() to fail on SSE2, since it has code
that does:

    x += 0x1.0p23f;
    x -= 0x1.0p23f;

which was in turn being undesirably optimized away.

Fixes issue #211.
2012-03-28 10:26:39 -07:00
Matt Pharr
d0d9aae968 Fix parser so that spaces aren't needed around "..." in foreach statements.
Issue #207.
2012-03-28 10:10:51 -07:00
Matt Pharr
3270e2bf5a Call CPUID to more reliably detect level of SSE/AVX that the host supports.
Fixes, I hope, issue #205.
2012-03-28 09:20:06 -07:00
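Roughly what a CPUID-based check looks like using GCC/Clang's <cpuid.h>; the feature-bit positions are from Intel's documentation, everything else here is an illustrative sketch:

    #include <cpuid.h>

    enum ISA { ISA_SSE2, ISA_SSE4, ISA_AVX };

    static ISA DetectBestISA() {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return ISA_SSE2;
        // (A complete AVX check also verifies OS support via OSXSAVE/XGETBV.)
        if (ecx & (1u << 28))        // ECX bit 28: AVX
            return ISA_AVX;
        if (ecx & (1u << 19))        // ECX bit 19: SSE4.1
            return ISA_SSE4;
        return ISA_SSE2;             // SSE2 (EDX bit 26) is a given on x86-64
    }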
Matt Pharr
013a3e7567 Support concatenation of adjacent string literals in the parser.
Fixes issue #208.
2012-03-28 08:52:09 -07:00
Matt Pharr
8368ba8539 Add missing checks for NULL current basic block in stmt code.
Fixes crashes if, for example, these statement types appeared after early
returns in the middle of functions.
2012-03-28 08:48:33 -07:00
Matt Pharr
ca0310e335 Merge pull request #213 from nipunn1313/master
Fixed compiler warning in expression type caster
2012-03-28 06:41:00 -07:00
Nipunn Koorapati
4690a678c1 Added parentheses around a || b && c statement in TypeCastExpr
to placate the compiler warning and make the code easier to understand.
2012-03-28 02:44:28 -04:00
Matt Pharr
f8a39402a2 Implement new, simpler function overload resolution algorithm.
We now give each conversion a cost and then find the minimum sum
of costs for all of the possible overloads.

Fixes issue #194.
2012-03-27 13:25:11 -07:00
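A schematic of cost-based overload selection; the Type forward declaration, ConversionCost(), and the cost values are all hypothetical:

    #include <cstddef>
    #include <vector>

    struct Type;  // stand-in for ispc's Type class

    static const int kNoMatch = 1 << 30;

    // Cost of converting an argument type to a parameter type: 0 for an exact
    // match, small positive values for progressively worse conversions,
    // kNoMatch if no legal conversion exists.  (Declared only; hypothetical.)
    int ConversionCost(const Type *from, const Type *to);

    // Pick the candidate whose summed per-argument conversion cost is smallest.
    static int ResolveOverload(const std::vector<std::vector<const Type *> > &candidates,
                               const std::vector<const Type *> &argTypes) {
        int best = -1, bestCost = kNoMatch;
        for (size_t c = 0; c < candidates.size(); ++c) {
            if (candidates[c].size() != argTypes.size())
                continue;
            int cost = 0;
            for (size_t a = 0; a < argTypes.size() && cost < kNoMatch; ++a) {
                int one = ConversionCost(argTypes[a], candidates[c][a]);
                cost = (one >= kNoMatch) ? kNoMatch : cost + one;
            }
            if (cost < bestCost) {
                bestCost = cost;
                best = (int)c;
            }
        }
        return best;  // -1 if nothing matched; a tie would be an ambiguity error
    }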
Jean-Luc Duprat
b923e4daea Added macro PRId64 to opt.cpp for compilation on Windows 2012-03-27 12:46:59 -07:00
Matt Pharr
247775d1ec Fix type conversion to allow array -> void * conversions.
Fixes issue #193.
2012-03-27 10:07:54 -07:00
Matt Pharr
6e9fea377d Type convert NULL to other pointer types for function call arguments.
Fixes issue #198.
2012-03-27 09:50:21 -07:00
Matt Pharr
ca5c65d032 Fix bugs where typecasting an expression to void would cause it to disappear.
This was obviously problematic in cases where the expression was a function
call or the like, with side effects.

Fixes issue #199.
2012-03-27 09:33:43 -07:00
Matt Pharr
f9dc621ebe Fix bug when doing pointer math with varying integer offsets.
We were incorrectly trying to type convert the varying offset to a
uniform value, which in turn led to an incorrect compile-time error.

Fixes issue #201.
2012-03-27 09:17:40 -07:00
Matt Pharr
ffe484c31e Implement simpler approach for header file struct emission.
Rather than explicitly building a DAG and doing a topological sort,
just traverse structs recursively and emit declarations for all of
their dependent structs before emitting the original struct declaration.

Not only is this simpler than the previous implementation, but it
fixes a bug where we'd hit an assert if we had a struct with multiple
contained members of another struct type.
2012-03-27 09:06:10 -07:00
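A sketch of the recursive emission scheme; the StructType accessors declared below are invented for illustration:

    #include <cstdio>
    #include <set>
    #include <string>

    struct StructType;  // stand-in for the real class in type.h

    // Hypothetical accessors, declared only for this sketch.
    std::string GetStructName(const StructType *st);
    int GetMemberCount(const StructType *st);
    // Returns NULL if member i isn't a struct (or an array of structs).
    const StructType *GetMemberStructType(const StructType *st, int i);
    void PrintStructDeclaration(const StructType *st, FILE *out);

    // Emit declarations for everything 'st' depends on, then 'st' itself.
    static void EmitStructDecl(const StructType *st, std::set<std::string> *emitted,
                               FILE *out) {
        if (emitted->count(GetStructName(st)))
            return;                          // already emitted (also breaks cycles)
        emitted->insert(GetStructName(st));

        // Recurse into members first so that any struct used by a member
        // (including structs inside member arrays) is declared before us.
        for (int i = 0; i < GetMemberCount(st); ++i)
            if (const StructType *member = GetMemberStructType(st, i))
                EmitStructDecl(member, emitted, out);

        PrintStructDeclaration(st, out);
    }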
Matt Pharr
62cd3418ca Add test for the bug in issue #204. 2012-03-27 09:04:45 -07:00
Matt Pharr
d8a8f3a996 For symbols that are references, return uniform ptr type as lvalue type.
Fixes issue #204.
2012-03-27 08:52:14 -07:00
Matt Pharr
0ad8dbbfc9 Fix documentation bug: atan2 arguments were reversed.
Issue #203.
2012-03-27 08:03:02 -07:00
Matt Pharr
e15a1946c6 Documentation: add ISPC_TARGET_AVX2 as a possible target #define 2012-03-27 08:02:39 -07:00
Matt Pharr
8878826661 Add non-short-circuiting and(), or(), select() to stdlib. 2012-03-26 09:37:59 -07:00
Matt Pharr
95a8b6e5e8 Fix & vs. && in logical test.
Issue #196.
2012-03-25 17:38:34 -07:00
Matt Pharr
388d0d2cfd Add #include <string.h>
Fixes build on Linux and Windows.  (Strangely, this didn't break the
OSX build.)

Issue #195.
2012-03-25 17:38:15 -07:00
Matt Pharr
d3a374e71c Fix malformed program crasher. 2012-03-25 13:10:23 -07:00
Matt Pharr
1da2834b1e Allow the last member of a struct to be an unsized/zero-length array.
This enables the C trick of allocating a dynamic amount of storage for
the struct in order to extend out the array to the desired length.
2012-03-25 13:10:12 -07:00
Matt Pharr
ca3100874f Add FAQ about why varying values can't be passed to exported functions. 2012-03-25 11:35:28 -07:00
Matt Pharr
117f48a331 Don't include foreach stmts in cost estimates from EstimateCost().
Because they reestablish an 'all on' mask inside their body, it doesn't
make sense to include their cost when evaluating whether it's worth
re-establishing an 'all on' mask dynamically.  (This does mean that
EstimateCost()'s return value isn't the most obvious thing, but currently
in all the cases where we need it, this is the more appropriate value to
return.)
2012-03-25 10:32:44 -07:00
Matt Pharr
89bbceefee Make sure that foreach() statements never execute with an "all off" mask. 2012-03-25 10:07:12 -07:00
Matt Pharr
7e18f0e247 Small improvement to float->half function in stdlib.
Rewrite things to be able to do a float MINPS, for slightly
better code on SSE2 (which has that but not a signed int
min).  The SSE2 code is now 23 instructions (vs. 21 intrinsics).
2012-03-23 16:09:32 -07:00
Jean-Luc Duprat
29c2f24faf Merge branch 'master' of git://github.com/ispc/ispc 2012-03-22 16:33:05 -07:00
Matt Pharr
3bb2dee275 Update float_to_half() with more efficient version from @rygorous 2012-03-22 13:36:26 -07:00
Matt Pharr
88cd5584e8 Add Debug() statement to report on if stmt cost/safety test results. 2012-03-22 13:36:26 -07:00
Jean-Luc Duprat
41f9ce2560 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-22 10:02:05 -07:00
Matt Pharr
20044f5749 Distinguish between dereferencing pointers and references.
We now have separate Expr implementations for dereferencing pointers
and automatically dereferencing references.  This is in particular
necessary so that we can detect attempts to dereference references
with the '*' operator in programs and issue an error in that case.

Fixes issue #192.
2012-03-22 06:48:02 -07:00
Jean-Luc Duprat
833f0a6aa7 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-21 17:07:18 -07:00
Matt Pharr
10c5ba140c Much more efficient half_to_float() code, via @rygorous.
Also, switch deferred shading example to use it. (Rather than
the "fast" half to float that doesn't handle deforms, etc.)
2012-03-21 16:13:04 -07:00
Matt Pharr
316de0b880 Make various Expr::EstimateCost() implementations return 0 if operand(s) are constants.
(Assume that constant folding will make these be free.)
2012-03-21 16:12:35 -07:00
Matt Pharr
989966f81b Annotate std lib functions with __declspec safe, cost, as appropriate. 2012-03-21 16:12:32 -07:00
Matt Pharr
ccd550dc52 __declspec support for function declarations.
safe: indicates that the function can safely be called with an "all off"
execution mask.

costN: (N an integer) overrides the cost estimate for the function with
the given value.
2012-03-21 16:11:50 -07:00
Matt Pharr
ddf350839a Add ability to parse __declspec lists to parser. 2012-03-21 16:11:50 -07:00
Matt Pharr
6a7dd2787a Fix bug in check for varying parameters in exported functions.
In particular, we weren't checking to see if the pointed-to type of
pointer parameters was varying.

Fixes issue #191.
2012-03-21 10:06:53 -07:00
Jean-Luc Duprat
385771e73e Merge branch 'master' of git://github.com/ispc/ispc 2012-03-20 13:31:31 -07:00
Matt Pharr
349ab0b9c5 Bump version number to 1.2.1dev 2012-03-20 12:46:23 -07:00
Matt Pharr
b5e6c6a2f3 update news to include paper 2012-03-20 12:05:23 -07:00
Matt Pharr
2832ea641f Release notes, bump doxygen version for 1.2.0 release 2012-03-20 11:58:39 -07:00
Matt Pharr
cb7edf2725 Set version to 1.2.0 for release builds 2012-03-20 11:13:50 -07:00
Matt Pharr
f1f1be2822 Remove twine op that caused crash on Windows, fix warning 2012-03-20 11:13:02 -07:00
Matt Pharr
7dffd65609 Add __foreach_active statement to loop over active prog. instances.
For now this has the __ prefix, as an experimental feature currently only
used in the standard library implementation.  It's probably worth making
something along these lines an official feature, but I'm not sure if this
in its current form is quite the right thing.
2012-03-20 08:46:00 -07:00
Matt Pharr
2c8a44e28b Merge pull request #189 from guanqun/fix-extern-c-error
calls to C/C++ functions should not be mangled.
2012-03-20 05:55:09 -07:00
Matt Pharr
39bb95a6ee Merge pull request #190 from guanqun/fix-output-option
fix --outfile option error
2012-03-20 05:54:29 -07:00
Lu Guanqun
da9dba80a0 fix --outfile option error 2012-03-20 09:44:49 +08:00
Lu Guanqun
12f3285f9b calls to C/C++ functions should not be mangled.
Otherwise, linker will never find the correct function.
2012-03-20 09:27:57 +08:00
Matt Pharr
7e954e4248 Don't issue gather/scatter warnings in the 'extra' bits of foreach loops.
With AOS data, we can often coalesce the accesses into gathers for the main
part of foreach loops but only fail on the last bits where the mask is not
all on (since the coalescing code doesn't handle mixed masks, yet.) Before,
we'd report success with coalescing and then also report that gathers were needed
for the same accesses that were coalesced, which was a) confusing, and b)
didn't accurately represent what was going on for the majority of the loop
iterations.
2012-03-19 15:08:35 -07:00
Matt Pharr
d74cc6397b Fix significant bug in mask management in code generated for 'foreach'.
In particular, we 1. weren't setting the function mask to 'all on', such that
any mixed function mask would in turn apply inside the foreach loop, and 2.
weren't always setting the internal mask to 'all on' before doing any additional
masking based on the iteration variables.
2012-03-19 15:06:35 -07:00
Matt Pharr
777343331e Print numeric version number with --version. 2012-03-19 14:41:25 -07:00
Matt Pharr
a062653743 Add patterns to better-match code generated when accessing SOA data.
In particular, LLVMVectorIsLinear() and LLVMVectorValuesAllEqual() are able
to reason a bit about the effects of the shifts and the ANDs that are
generated from SOA indexing calculations, so that they can detect more cases
where a linear sequence of locations are in fact being accessed in
the presence of SOA data.
2012-03-19 12:04:39 -07:00
Matt Pharr
57af0eb64f Still do the gather/scatter -> load store pass even if leaving 'pseudo' mem opts unchanged. 2012-03-19 12:04:38 -07:00
Matt Pharr
60aae16752 Move check for linear vector to LLVMVectorIsLinear() function. 2012-03-19 11:57:04 -07:00
Matt Pharr
e264d95019 LLVMVectorValuesAllEqual() improvements.
Clean up the API, so the caller doesn't have to pass in a vector so
the function can track PHI nodes (do that internally instead.)

Handle casts in lValuesAreEqual().
2012-03-19 11:54:18 -07:00
Matt Pharr
0664f5a724 Add LLVMExtractVectorInts() function, use it in the opt code. 2012-03-19 11:48:38 -07:00
Matt Pharr
17c6a19527 Add LLVMExtractFirstVectorElement() function (and use it).
For cases where it turns out that we just need the first element of
a vector (e.g. because we've determined that all of the values are
equal), it's often more efficient to only compute that one value
with scalar operations than to compute the whole vector's worth and
then just use one value.  This function tries to rewrite a vector
computation to the scalar equivalent, if possible.

(Partial work-around to http://llvm.org/bugs/show_bug.cgi?id=11775.)

Note that sometimes this is the wrong thing to do--if we need the entire
vector value for other purposes, for example.
2012-03-19 11:48:33 -07:00
Matt Pharr
cbc8b8259b Use LLVMIntAsType() in opt code instead of locally-defined equivalent. 2012-03-19 11:36:00 -07:00
Matt Pharr
1067a2e4be Add LLVMShuffleVectors() and LLVMConcatVectors() functions.
These were local functions in opt.cpp that are now public via the
llvmutil.* files.
2012-03-19 11:34:52 -07:00
Matt Pharr
74a031a759 Small improvements to debug info printing in opt.cpp 2012-03-19 11:32:08 -07:00
Matt Pharr
ee437193fb Add LLVMDumpValue() utility routine 2012-03-19 11:31:27 -07:00
Matt Pharr
436c53037e Fix assertion in FunctionEmitContext::storeUniformToSOA() 2012-03-19 11:29:14 -07:00
Matt Pharr
f55ba9d3cb Remove (highly verbose) Debug() call for type conversions. 2012-03-19 11:28:55 -07:00
Matt Pharr
8adb99b768 Improve source locations reported with warnings. 2012-03-19 11:28:34 -07:00
Matt Pharr
13c42412d2 Issue perf. warning if SOA width narrower than gang size is used. 2012-03-19 11:28:16 -07:00
Matt Pharr
75507d8b35 Remove error message if old 'reference' keyword is used. 2012-03-19 11:27:53 -07:00
Matt Pharr
ddfe4932ac Fix parsing of 'launch' so that angle brackets can be removed.
Issue #6.
2012-03-19 11:27:32 -07:00
Jean-Luc Duprat
cf208cc2e3 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-17 12:59:43 -07:00
Matt Pharr
28ac016928 Fix bugs in checks for varying parameters in exported functions.
In short, we inadvertently weren't checking whether pointers themselves
were varying, which in turn led to an assertion later if an exported
function did have a varying parameter.

Issue #187.
2012-03-15 07:20:36 -05:00
Jean-Luc Duprat
f4ae41d006 Merge branch 'master' of git://github.com/ispc/ispc 2012-03-14 09:58:52 -07:00
Matt Pharr
9ec8e5a275 Fix compile warnings on Linux 2012-03-12 13:12:23 -07:00
Matt Pharr
a473046058 Once again fix for LLVM 3.1 TOT API changes 2012-03-11 15:04:26 -07:00
Matt Pharr
a69b7a5a01 Fix build with LLVM 3.1 TOT 2012-03-10 13:06:53 -08:00
Matt Pharr
640918bcc0 Call fclose() in deferred example. (Andy Zhang). 2012-03-07 08:50:10 -08:00
Matt Pharr
f39fbdb3fc Add various new functions to "internal" functions list.
Building with multiple compilation targets in a single binary was
broken due to multiple symbol definitions.
2012-03-05 16:41:20 -08:00
Matt Pharr
50d4d81062 Add file in docs/ for news page on website 2012-03-05 16:10:20 -08:00
Matt Pharr
3b95452481 Add memcpy(), memmove() and memset() to the standard library.
Issue #183.
2012-03-05 16:09:00 -08:00
Matt Pharr
c152ae3c32 Add single-precision asin() and acos() to stdlib.
Issue #184.
2012-03-05 13:32:13 -08:00
Matt Pharr
f6cbaa78e8 Update stdlib documentation to match recent pointed-to default variability changes 2012-03-05 13:32:12 -08:00
Matt Pharr
7adb250b59 Added tests and documentation for soa<> rate qualifier. 2012-03-05 09:58:10 -08:00
Matt Pharr
db5db5aefd Add native support for (AO)SOA data layout.
There's now a SOA variability class (in addition to uniform,
varying, and unbound variability); the SOA factor must be a
positive power of 2.

When applied to a type, the leaf elements of the type (i.e.
atomic types, pointer types, and enum types) are widened out
into arrays of the given SOA factor.  For example, given

struct Point { float x, y, z; };

Then "soa<8> Point" has a memory layout of "float x[8], y[8],
z[8]".

Furthermore, array indexing syntax has been augmented so that
when indexing into arrays of SOA-variability data, the two-stage
indexing (first into the array of soa<> elements and then into
the leaf arrays of SOA data) is performed automatically.
2012-03-05 09:58:10 -08:00
Matt Pharr
8fdf84de04 Disable debugging printing code. 2012-03-05 09:58:09 -08:00
Matt Pharr
ff5cbe80d1 Add more files to .gitignore 2012-03-05 09:58:09 -08:00
Matt Pharr
e013e0a374 Handle extract instructions in the lGetBasePtrAndOffsets() pattern matching code. 2012-03-05 09:58:09 -08:00
Matt Pharr
b7df312ca7 Small improvements to error location reporting, assertions in expr.cpp 2012-03-05 09:58:09 -08:00
Matt Pharr
ce82c3c0ae Return from function after storing initializer value. 2012-03-05 09:58:09 -08:00
Matt Pharr
2f958cfbda Fix cases where malformed program could cause crash. 2012-03-05 09:58:09 -08:00
Matt Pharr
8ef41dfd97 Represent variability with small helper class rather than an enum.
This provides part of the basis for representing SOA width in terms
of variability, but there should be no functional changes in this
checkin.
2012-03-05 09:58:09 -08:00
Matt Pharr
3082ea4765 Require Type::Equal() for all type equality comparisons.
Previously, we uniqued AtomicTypes, so that they could be compared
by pointer equality, but with forthcoming SOA variability changes,
this would become too unwieldy (lacking a more general / ubiquitous
type uniquing implementation.)
2012-03-05 09:58:09 -08:00
Matt Pharr
e482d29951 Add LLVM{U}IntAsType() utility routine 2012-03-05 09:58:09 -08:00
Matt Pharr
ff48dd7bfb Remove unused SOAArrayType class and Type::GetSOAType() methods. 2012-03-05 09:58:09 -08:00
Matt Pharr
7bf9c11822 Add uniform variants of RNG functions to stdlib 2012-03-05 09:56:30 -08:00
Matt Pharr
f7937f1e4b Fix build with LLVM2.9/3.0 2012-03-03 10:30:56 -08:00
Matt Pharr
0115eeabfe Update deferred example to take advantage of new pointer variability rules. 2012-02-29 14:27:53 -08:00
Matt Pharr
4b9c3ec0da Fix bug in StructType::GetElementType().
We were only resolving unbound variability for the top-level type,
which isn't enough if we have e.g. an unbound-variability pointer
pointing to some type with unbound variability.
2012-02-29 14:27:53 -08:00
Matt Pharr
55b81e35a7 Modify rules for default variability of pointed-to types.
Now, the pointed-to type is always uniform by default (if an explicit
rate qualifier isn't provided).  This rule is easier to remember and
seems to work well in more cases than the previous rule from 6d7ff7eba2.
2012-02-29 14:27:53 -08:00
Matt Pharr
2a1c7f2d47 Fix bug with indexing into varying pointer w/uniform index.
Issue #182.
2012-02-25 10:19:21 -08:00
Matt Pharr
8603f9838f Issue an error if "uniform" or "varying" qualifiers are applied to void types.
Issue #179.
2012-02-21 12:26:42 -08:00
Matt Pharr
95224f3f11 Improve detection of cases where 32-bit gather/scatter can be used.
Previously, we weren't noticing that an <n x i64> zero vector could
be represented as an <n x i32> without error.
2012-02-21 12:13:25 -08:00
Matt Pharr
f81acbfe80 Implement unbound variability for struct types.
Now, if a struct member has an explicit 'uniform' or 'varying'
qualifier, then that member has that variability, regardless of
the struct's own variability.  Members without
'uniform' or 'varying' have unbound variability, and in turn
inherit the variability of the struct.

As a result of this, now structs can properly be 'varying' by default,
just like all the other types, while still having sensible semantics.
2012-02-21 10:28:31 -08:00
Matt Pharr
6d7ff7eba2 Update defaults for variability of pointed-to types.
Now, if rate qualifiers aren't used to specify otherwise, varying
pointers point to uniform types by default.  As before, uniform
pointers point to varying types by default.

   float *foo;  // varying pointer to uniform float
   float * uniform foo;  // uniform pointer to varying float

These defaults seem to require the least amount of explicit
uniform/varying qualifiers for most common cases, though TBD if it
would be easier to have a single rule that e.g. the pointed-to type
is always uniform by default.
2012-02-21 06:27:34 -08:00
Matt Pharr
ad429db7e8 Generate more efficient code for variable initializers.
If the initializer is a compile-time constant (or at least a part of it
is), then store the constant value in a module-local constant global
value and then memcpy the value into the variable.  This, in turn,
turns into much better assembly in the end.

Issue #176.
2012-02-14 13:51:23 -08:00
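Schematically, in terms of the LLVM C++ API (names are illustrative, and the exact CreateMemCpy signature changed across LLVM versions):

    // Put the compile-time-constant part of the initializer into a private,
    // module-level constant global...
    llvm::GlobalVariable *initGlobal =
        new llvm::GlobalVariable(*module, constValue->getType(),
                                 /* isConstant = */ true,
                                 llvm::GlobalValue::PrivateLinkage,
                                 constValue, "init_const");

    // ...then copy it into the variable's storage with one memcpy, which the
    // backend lowers to a few wide stores instead of many scalar ones.
    builder.CreateMemCpy(varPtr, initGlobal, allocSize, /* align = */ 0);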
Matt Pharr
4c07abbaf4 Support returning NULL pointer values from ConstExpr::GetConstant() 2012-02-14 13:49:18 -08:00
Matt Pharr
e3c0551129 Handle uniform short-vector types in ExprList::GetConstant() 2012-02-14 13:48:43 -08:00
Matt Pharr
8971baa42b Fix silly bug in ConstExpr::GetConstant() with enum types.
(They would be incorrectly matched as int8 types.)
2012-02-14 13:48:10 -08:00
Matt Pharr
317a1f51f7 Allow fewer initializer values in initializer expr lists than expected.
We now match C's behavior, where if we have an initializer list with
too-few values for the underlying type, any additional elements are
initialized to zero.

Fixes issue #123.
2012-02-14 13:47:11 -08:00
Matt Pharr
c63d139482 Add FunctionEmitContext::MemcpyInst() 2012-02-14 13:43:59 -08:00
Matt Pharr
9e682362e9 Fix bug in ArrayType::SizeUnsizedArrays().
If given an initializer list with too many elements for the actual array
size, in some cases we would incorrectly resize the explicitly sized array
to be the size implied by the initializer list.
2012-02-14 13:43:38 -08:00
Matt Pharr
56ec939692 Add perfbench to examples.sln for Windows 2012-02-14 10:07:08 -08:00
Matt Pharr
a86b942730 Fix cases in coalesce opt where offsets would be truncated to 32 bits 2012-02-14 10:05:07 -08:00
Matt Pharr
52eb4c6014 Fix warnings with Windows build 2012-02-14 10:01:45 -08:00
Matt Pharr
f4adbbf90c Merge a number of cbackend changes from the LLVM dev tree.
This fixes a number of failing tests with LLVM 3.1svn when
using the generic targets.

Issue #175.
2012-02-13 16:52:38 -08:00
Matt Pharr
cc86e4a7d2 Disable coalescing optimizations when using generic target.
The main issue is that they end up generating a number of smaller
vector ops (e.g. 4-wide and 8-wide on the 16-wide generic target,
which the examples/intrinsics implementations don't currently
support.)

This fixes a number of failing tests for now; it may be worth
generalizing the stuff in examples/intrinsics at some point,
since as a general principle, e.g. if generating LLVM IR output,
the coalescing optimizations are still desirable.

Issue #175.
2012-02-13 16:52:01 -08:00
Matt Pharr
e864447e4a Fix silly bug in vector scale extraction optimization.
(Introduced in f20a2d2ee.  How did this ever pass tests?)
2012-02-13 12:06:45 -08:00
Matt Pharr
73bf552cd6 Add support for coalescing memory accesses from gathers.
There are two related optimizations that happen now.  (These
currently only apply for gathers where the mask is known to be
all on, and to gathers that are accessing 32-bit sized elements,
but both of these may be generalized in the future.)

First, for any single gather, we are now more flexible in mapping it
to individual memory operations.  Previously, we would only either map
it to a general gather (one scalar load per SIMD lane), or an 
unaligned vector load (if the program instances could be determined
to be accessing a sequential set of locations in memory.)

Now, we are able to break gathers into scalar, 2-wide (i.e. 64-bit),
4-wide, or 8-wide loads.  Further, we now generate code that shuffles
these loads around.  Doing fewer, larger loads in this manner, when
possible, can be more efficient.

Second, we can coalesce memory accesses across multiple gathers. If 
we have a series of gathers without any memory writes in the middle,
then we try to analyze their reads collectively and choose an efficient
set of loads for them.  Not only does this help if different gathers
reuse values from the same location in memory, but it's specifically
helpful when data with AOS layout is being accessed; in this case,
we're often able to generate wide vector loads and appropriate shuffles
automatically.
2012-02-10 13:10:39 -08:00
Matt Pharr
f20a2d2ee9 Generalize code to extract scales by 2/4/8 from addressing calculations.
Now, if we have a scale by 16, say, we extract out the scalar scale
of 8 and leave an explicit scale by 2.
2012-02-10 12:35:44 -08:00
Matt Pharr
0c25bc063c Add lGEPInst() utility routine to opt.cpp.
Deal with the messiness of LLVM API changes when creating
these in a single place.
2012-02-10 12:32:15 -08:00
Matt Pharr
db72781d2a Fix C++ backend to not assert with LLVM 3.1 svn builds. 2012-02-10 12:30:31 -08:00
Matt Pharr
0c8ad09040 Fix placement of ParserInit() call
This makes it possible to use fuzz testing even without --nostdlib!
2012-02-10 12:29:57 -08:00
Matt Pharr
49880ab761 Constant fold more cases in SelectExpr::Optimize()
Specifically, if both of the expressions are compile-time constants
and the condition is a varying compile-time constant (even if not 
all true or all false), then we can assemble a compile-time constant
result.
2012-02-10 12:28:54 -08:00
Matt Pharr
fe2d9aa600 Add perfbench to examples: a few small microbenchmarks. 2012-02-10 12:27:13 -08:00
Matt Pharr
1dead425e4 Don't indent *too* much on continued lines with warnings/errors. 2012-02-10 12:26:35 -08:00
Matt Pharr
adb1e47a59 Add FAQ about how to cross-inline ispc and C/C++ code. 2012-02-10 12:26:19 -08:00
Matt Pharr
ffba8580c1 Make sure that non-zero exit code is returned when input file not found.
Fixes issue #174.
2012-02-08 19:53:05 -08:00
Alex Reece
ea18427d29 Remove UnwindInst
Code no longer builds against head of LLVM branch after revision 149906
removed the unwind instruction.
2012-02-07 15:46:22 -08:00
Jean-Luc Duprat
97d42f5c53 Merge remote-tracking branch 'matt/master' 2012-02-07 12:50:31 -08:00
Matt Pharr
f3089df086 Improve error handling and reporting in the parser.
Add a number of additional error cases in the grammar.

Enable bison's extended error reporting, to get better messages about the
context of errors and the expected (but not found) tokens at errors.

Improve the printing of these by providing an implementation of yytnamerr
that rewrites things like "TOKEN_MUL_ASSIGN" to "*=" in error messages.

Print the source location (using Error()) when yyerror() is called; wiring
this up seems to require no longer building a 'pure parser' but having
yylloc as a global, which in turn led to having to update all of the uses of
it (which previously accessed it as a pointer).

Updated a number of tests_errors for resulting changes in error text.
2012-02-07 11:13:32 -08:00
Matt Pharr
157e7c97ae Fix a variety of cases in the parser that could crash with malformed programs. 2012-02-07 11:08:00 -08:00
Matt Pharr
bb8e13e3c9 Add support for -I command-line argument to specify #include search directories. 2012-02-07 08:39:01 -08:00
Matt Pharr
5b4673e8eb Fix build with LLVM 2.9. 2012-02-07 08:37:13 -08:00
Matt Pharr
5b9de8cc07 Fix test to account for updated error message. 2012-02-07 08:36:56 -08:00
Matt Pharr
33ea934c8f Fix over-aggressive check in DereferenceExpr::TypeCheck()
(Reference types are allowed as well.)
2012-02-07 08:18:33 -08:00
Matt Pharr
6b3e14b0a4 Add command-line option to enable debugging output from parser. 2012-02-06 15:35:43 -08:00
Matt Pharr
098ceb5567 Issue error on attempted type convert from/to function type. 2012-02-06 15:35:43 -08:00
Matt Pharr
8e2b0632e8 Issue an error if an array of references is declared.
(More malformed program fixes.)
2012-02-06 15:35:43 -08:00
Matt Pharr
420d373d89 Move assert so that an error is issued for "break" outside of loops. 2012-02-06 15:35:43 -08:00
Matt Pharr
a59fd7eeb3 Fix a missing return value in the parser. 2012-02-06 15:35:43 -08:00
Matt Pharr
ee91fa1228 Make sure the program doesn't have a dereference of a non-pointer type. 2012-02-06 15:35:43 -08:00
Matt Pharr
a2b5ce0172 Add --help-dev option, only print developer options when it is used. 2012-02-06 15:35:43 -08:00
Matt Pharr
3efbc71a01 Add fuzz testing of input programs.
When the --fuzz-test command-line option is given, the input program
will be randomly perturbed by the lexer in an effort to trigger
assertions or crashes in the compiler (neither of which should ever
happen, even for malformed programs.)
2012-02-06 15:34:47 -08:00
Matt Pharr
b7c5af7e64 Prohibit returning functions from functions.
(Fix malformed program crasher)
2012-02-06 14:46:03 -08:00
Matt Pharr
f939015b97 Default to int32 for declarations without specified types.
(e.g. "uniform foo" == "uniform int32 foo")
2012-02-06 14:46:03 -08:00
Matt Pharr
a9ed71f553 Bug fixes to avoid NULL pointer derefs with malformed programs. 2012-02-06 14:45:58 -08:00
Matt Pharr
96a429694f 80 column fixes 2012-02-06 14:44:55 -08:00
Matt Pharr
fddc5e022e Fix typo in IfStmt::EstimateCost() 2012-02-06 14:44:54 -08:00
Matt Pharr
2236d53def Issue error if &=, |=, ^=, <<=, or >>= used with floats. 2012-02-06 14:44:54 -08:00
Matt Pharr
4e018d0a20 Improve tracking of source position in the presence of /* */ comments.
Don't let the preprocessor remove comments anymore, so that the rules
in lex.ll can handle them.  Fix lCComment() to update the source
position as it eats characters in comments.
2012-02-06 14:44:54 -08:00
Matt Pharr
977b983771 Issue error on "void" typed variable, function parameter, or struct member. 2012-02-06 14:44:48 -08:00
Matt Pharr
fa7a7fe23e Fix error handling in type code. 2012-02-06 12:39:14 -08:00
Matt Pharr
724a843bbd Add --quiet option to supress all diagnostic output 2012-02-06 12:39:09 -08:00
Jean-Luc Duprat
0db752f3a2 Merge branch 'master' of github.com:jduprat/ispc 2012-01-26 13:43:15 -08:00
Jean-Luc Duprat
ee8b6ebbf6 Merge remote-tracking branch 'matt/master' 2012-01-26 10:41:13 -08:00
Jean-Luc Duprat
f2b99ccb08 Made run_tests.py executable 2012-01-11 10:06:41 -08:00
546 changed files with 121755 additions and 10428 deletions

.gitignore

@@ -5,4 +5,11 @@ ispc
ispc_test
objs
docs/doxygen
docs/ispc.html
docs/*.html
tests*/*cpp
tests*/*run
examples/*/*.png
examples/*/*.ppm
examples/*/objs/*

Makefile

@@ -2,6 +2,15 @@
# ispc Makefile
#
# If you have your own special version of llvm and/or clang, change
# these variables to match.
LLVM_CONFIG=$(shell which llvm-config)
CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
# Add llvm bin to the path so any scripts run will go to the right llvm-config
LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
export PATH:=$(LLVM_BIN):$(PATH)
ARCH_OS = $(shell uname)
ifeq ($(ARCH_OS), Darwin)
ARCH_OS2 = "OSX"
@@ -10,27 +19,17 @@ else
endif
ARCH_TYPE = $(shell arch)
ifeq ($(shell llvm-config --version), 3.1svn)
LLVM_LIBS=-lLLVMAsmParser -lLLVMInstrumentation -lLLVMLinker \
-lLLVMArchive -lLLVMBitReader -lLLVMDebugInfo -lLLVMJIT -lLLVMipo \
-lLLVMBitWriter -lLLVMTableGen -lLLVMCBackendInfo \
-lLLVMX86Disassembler -lLLVMX86CodeGen -lLLVMSelectionDAG \
-lLLVMAsmPrinter -lLLVMX86AsmParser -lLLVMX86Desc -lLLVMX86Info \
-lLLVMX86AsmPrinter -lLLVMX86Utils -lLLVMMCDisassembler -lLLVMMCParser \
-lLLVMCodeGen -lLLVMScalarOpts -lLLVMInstCombine -lLLVMTransformUtils \
-lLLVMipa -lLLVMAnalysis -lLLVMMCJIT -lLLVMRuntimeDyld \
-lLLVMExecutionEngine -lLLVMTarget -lLLVMMC -lLLVMObject -lLLVMCore \
-lLLVMSupport
else
LLVM_LIBS=$(shell llvm-config --libs)
endif
LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs)
CLANG=clang
CLANG_LIBS = -lclangFrontend -lclangDriver \
-lclangSerialization -lclangParse -lclangSema \
-lclangAnalysis -lclangAST -lclangLex -lclangBasic
ifneq ($(shell $(LLVM_CONFIG) --version), 3.0)
CLANG_LIBS += -lclangEdit
endif
ISPC_LIBS=$(shell llvm-config --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
-lpthread
ifeq ($(ARCH_OS),Linux)
@@ -41,8 +40,8 @@ ifeq ($(ARCH_OS2),Msys)
ISPC_LIBS += -lshlwapi -limagehlp -lpsapi
endif
LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
LLVM_VERSION=LLVM_$(shell llvm-config --version | sed s/\\./_/)
LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
BUILD_DATE=$(shell date +%Y%m%d)
@@ -50,15 +49,16 @@ BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
CXX=g++
CPP=cpp
OPT=-g3
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
OPT=-O2
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
-Wall $(LLVM_VERSION_DEF) \
-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
LDFLAGS=
ifeq ($(ARCH_OS),Linux)
# try to link everything statically under Linux (including libstdc++) so
# that the binaries we generate will be portable across distributions...
LDFLAGS=-static
# LDFLAGS=-static
endif
LEX=flex
@@ -71,8 +71,8 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
type.cpp util.cpp
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h
TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
generic-16 generic-1
TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
builtins/dispatch.ll
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
@@ -86,10 +86,10 @@ OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
default: ispc
.PHONY: dirs clean depend doxygen print_llvm_src
.PHONY: dirs clean depend doxygen print_llvm_src llvm_check
.PRECIOUS: objs/builtins-%.cpp
depend: $(CXX_SRC) $(HEADERS)
depend: llvm_check $(CXX_SRC) $(HEADERS)
@echo Updating dependencies
@gcc -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend
@@ -99,7 +99,15 @@ dirs:
@echo Creating objs/ directory
@/bin/mkdir -p objs
print_llvm_src:
llvm_check:
@llvm-config --version > /dev/null || \
(echo; \
echo "******************************************"; \
echo "ERROR: llvm-config not found in your PATH"; \
echo "******************************************"; \
echo; exit 1)
print_llvm_src: llvm_check
@echo Using LLVM `llvm-config --version` from `llvm-config --libdir`
clean:
@@ -111,7 +119,7 @@ doxygen:
ispc: print_llvm_src dirs $(OBJS)
@echo Creating ispc executable
@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
@$(CXX) $(OPT) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
objs/%.o: %.cpp
@echo Compiling $<
@@ -121,6 +129,10 @@ objs/cbackend.o: cbackend.cpp
@echo Compiling $<
@$(CXX) -fno-rtti -fno-exceptions $(CXXFLAGS) -o $@ -c $<
objs/opt.o: opt.cpp
@echo Compiling $<
@$(CXX) -fno-rtti $(CXXFLAGS) -o $@ -c $<
objs/%.o: objs/%.cpp
@echo Compiling $<
@$(CXX) $(CXXFLAGS) -o $@ -c $<

ast.cpp

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -32,8 +32,10 @@
*/
/** @file ast.cpp
@brief
*/
@brief General functionality related to abstract syntax trees and
traversal of them.
*/
#include "ast.h"
#include "expr.h"
@@ -53,10 +55,10 @@ ASTNode::~ASTNode() {
// AST
void
AST::AddFunction(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code) {
AST::AddFunction(Symbol *sym, Stmt *code) {
if (sym == NULL)
return;
functions.push_back(new Function(sym, args, code));
functions.push_back(new Function(sym, code));
}
@@ -90,6 +92,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
DoStmt *dos;
ForStmt *fs;
ForeachStmt *fes;
ForeachActiveStmt *fas;
ForeachUniqueStmt *fus;
CaseStmt *cs;
DefaultStmt *defs;
SwitchStmt *ss;
@@ -99,6 +103,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
PrintStmt *ps;
AssertStmt *as;
DeleteStmt *dels;
UnmaskedStmt *ums;
if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
@@ -135,6 +140,13 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
postFunc, data);
fes->stmts = (Stmt *)WalkAST(fes->stmts, preFunc, postFunc, data);
}
else if ((fas = dynamic_cast<ForeachActiveStmt *>(node)) != NULL) {
fas->stmts = (Stmt *)WalkAST(fas->stmts, preFunc, postFunc, data);
}
else if ((fus = dynamic_cast<ForeachUniqueStmt *>(node)) != NULL) {
fus->expr = (Expr *)WalkAST(fus->expr, preFunc, postFunc, data);
fus->stmts = (Stmt *)WalkAST(fus->stmts, preFunc, postFunc, data);
}
else if ((cs = dynamic_cast<CaseStmt *>(node)) != NULL)
cs->stmts = (Stmt *)WalkAST(cs->stmts, preFunc, postFunc, data);
else if ((defs = dynamic_cast<DefaultStmt *>(node)) != NULL)
@@ -151,7 +163,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
else if ((ls = dynamic_cast<LabeledStmt *>(node)) != NULL)
ls->stmt = (Stmt *)WalkAST(ls->stmt, preFunc, postFunc, data);
else if ((rs = dynamic_cast<ReturnStmt *>(node)) != NULL)
rs->val = (Expr *)WalkAST(rs->val, preFunc, postFunc, data);
rs->expr = (Expr *)WalkAST(rs->expr, preFunc, postFunc, data);
else if ((sl = dynamic_cast<StmtList *>(node)) != NULL) {
std::vector<Stmt *> &sls = sl->stmts;
for (unsigned int i = 0; i < sls.size(); ++i)
@@ -163,6 +175,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
else if ((ums = dynamic_cast<UnmaskedStmt *>(node)) != NULL)
ums->stmts = (Stmt *)WalkAST(ums->stmts, preFunc, postFunc, data);
else
FATAL("Unhandled statement type in WalkAST()");
}
@@ -180,7 +194,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
MemberExpr *me;
TypeCastExpr *tce;
ReferenceExpr *re;
DereferenceExpr *dre;
PtrDerefExpr *ptrderef;
RefDerefExpr *refderef;
SizeOfExpr *soe;
AddressOfExpr *aoe;
NewExpr *newe;
@@ -221,8 +236,12 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
tce->expr = (Expr *)WalkAST(tce->expr, preFunc, postFunc, data);
else if ((re = dynamic_cast<ReferenceExpr *>(node)) != NULL)
re->expr = (Expr *)WalkAST(re->expr, preFunc, postFunc, data);
else if ((dre = dynamic_cast<DereferenceExpr *>(node)) != NULL)
dre->expr = (Expr *)WalkAST(dre->expr, preFunc, postFunc, data);
else if ((ptrderef = dynamic_cast<PtrDerefExpr *>(node)) != NULL)
ptrderef->expr = (Expr *)WalkAST(ptrderef->expr, preFunc, postFunc,
data);
else if ((refderef = dynamic_cast<RefDerefExpr *>(node)) != NULL)
refderef->expr = (Expr *)WalkAST(refderef->expr, preFunc, postFunc,
data);
else if ((soe = dynamic_cast<SizeOfExpr *>(node)) != NULL)
soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
@@ -300,19 +319,39 @@ TypeCheck(Stmt *stmt) {
}
struct CostData {
CostData() { cost = foreachDepth = 0; }
int cost;
int foreachDepth;
};
static bool
lCostCallback(ASTNode *node, void *c) {
int *cost = (int *)c;
*cost += node->EstimateCost();
lCostCallbackPre(ASTNode *node, void *d) {
CostData *data = (CostData *)d;
if (dynamic_cast<ForeachStmt *>(node) != NULL)
++data->foreachDepth;
if (data->foreachDepth == 0)
data->cost += node->EstimateCost();
return true;
}
static ASTNode *
lCostCallbackPost(ASTNode *node, void *d) {
CostData *data = (CostData *)d;
if (dynamic_cast<ForeachStmt *>(node) != NULL)
--data->foreachDepth;
return node;
}
int
EstimateCost(ASTNode *root) {
int cost = 0;
WalkAST(root, lCostCallback, NULL, &cost);
return cost;
CostData data;
WalkAST(root, lCostCallbackPre, lCostCallbackPost, &data);
return data.cost;
}
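The pre/post callback pair is the standard way this file uses WalkAST(): the pre-visit callback returns a bool saying whether to keep descending, and the post-visit callback returns the (possibly replaced) node. As a hedged sketch of the same pattern, here is a hypothetical walker that just counts function-call expressions; the names suffixed with "Example" are illustrative and not part of ispc.
struct CallCountExample {
    CallCountExample() { numCalls = 0; }
    int numCalls;
};
static bool
lCountCallsPreExample(ASTNode *node, void *d) {
    // Count each FunctionCallExpr encountered; keep walking into children.
    CallCountExample *cc = (CallCountExample *)d;
    if (dynamic_cast<FunctionCallExpr *>(node) != NULL)
        ++cc->numCalls;
    return true;
}
static ASTNode *
lCountCallsPostExample(ASTNode *node, void *d) {
    // No rewriting; hand the node back unchanged.
    return node;
}
static int
CountFunctionCallsExample(ASTNode *root) {
    CallCountExample cc;
    WalkAST(root, lCountCallsPreExample, lCountCallsPostExample, &cc);
    return cc.numCalls;
}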
@@ -323,14 +362,22 @@ static bool
lCheckAllOffSafety(ASTNode *node, void *data) {
bool *okPtr = (bool *)data;
if (dynamic_cast<FunctionCallExpr *>(node) != NULL) {
// FIXME: If we could somehow determine that the function being
// called was safe (and all of the args Exprs were safe, then it'd
// be nice to be able to return true here. (Consider a call to
// e.g. floatbits() in the stdlib.) Unfortunately for now we just
// have to be conservative.
*okPtr = false;
return false;
FunctionCallExpr *fce;
if ((fce = dynamic_cast<FunctionCallExpr *>(node)) != NULL) {
if (fce->func == NULL)
return false;
const Type *type = fce->func->GetType();
const PointerType *pt = CastType<PointerType>(type);
if (pt != NULL)
type = pt->GetBaseType();
const FunctionType *ftype = CastType<FunctionType>(type);
Assert(ftype != NULL);
if (ftype->isSafe == false) {
*okPtr = false;
return false;
}
}
if (dynamic_cast<AssertStmt *>(node) != NULL) {
@@ -350,17 +397,29 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
return false;
}
if (g->target.allOffMaskIsSafe == true)
// Don't worry about memory accesses if we have a target that can
// safely run them with the mask all off
return true;
if (dynamic_cast<ForeachStmt *>(node) != NULL ||
dynamic_cast<ForeachActiveStmt *>(node) != NULL ||
dynamic_cast<ForeachUniqueStmt *>(node) != NULL ||
dynamic_cast<UnmaskedStmt *>(node) != NULL) {
// The various foreach statements also shouldn't be run with an
// all-off mask. Since they can re-establish an 'all on' mask,
// this would be pretty unintuitive. (More generally, it's
// possibly a little strange to allow foreach in the presence of
// any non-uniform control flow...)
//
// Similarly, the implementation of foreach_unique assumes as a
// precondition that the mask won't be all off going into it, so
// we'll enforce that here...
*okPtr = false;
return false;
}
IndexExpr *ie;
if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
const Type *type = ie->baseExpr->GetType();
if (type == NULL)
return true;
if (dynamic_cast<const ReferenceType *>(type) != NULL)
if (CastType<ReferenceType>(type) != NULL)
type = type->GetReferenceTarget();
ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
@@ -370,16 +429,14 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
return false;
}
const PointerType *pointerType =
dynamic_cast<const PointerType *>(type);
const PointerType *pointerType = CastType<PointerType>(type);
if (pointerType != NULL) {
// pointer[index] -> can't be sure -> not safe
*okPtr = false;
return false;
}
const SequentialType *seqType =
dynamic_cast<const SequentialType *>(type);
const SequentialType *seqType = CastType<SequentialType>(type);
Assert(seqType != NULL);
int nElements = seqType->GetElementCount();
if (nElements == 0) {
@@ -409,13 +466,9 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
return false;
}
DereferenceExpr *de;
if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
const Type *exprType = de->expr->GetType();
if (dynamic_cast<const PointerType *>(exprType) != NULL) {
*okPtr = false;
return false;
}
if (dynamic_cast<PtrDerefExpr *>(node) != NULL) {
*okPtr = false;
return false;
}
return true;

ast.h

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -84,8 +84,7 @@ class AST {
public:
/** Add the AST for a function described by the given declaration
information and source code. */
void AddFunction(Symbol *sym, const std::vector<Symbol *> &args,
Stmt *code);
void AddFunction(Symbol *sym, Stmt *code);
/** Generate LLVM IR for all of the functions into the current
module. */


@@ -2,8 +2,8 @@
REM If LLVM_INSTALL_DIR isn't set globally in your environment,
REM it can be set here.
set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
set LLVM_VERSION=3.1svn
REM set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
REM set LLVM_VERSION=3.2
REM Both the LLVM binaries and python need to be in the path
set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin


@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -47,12 +47,25 @@
#include <math.h>
#include <stdlib.h>
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
#include <llvm/DerivedTypes.h>
#include <llvm/Instructions.h>
#include <llvm/Intrinsics.h>
#if defined(LLVM_3_2)
#include <llvm/Attributes.h>
#endif
#if defined(LLVM_3_1) || defined(LLVM_3_2)
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
#include <llvm/Instructions.h>
#include <llvm/Intrinsics.h>
#include <llvm/DerivedTypes.h>
#else
#include <llvm/IR/Attributes.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Type.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/DerivedTypes.h>
#endif
#include <llvm/Linker.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/ADT/Triple.h>
@@ -157,7 +170,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
static void
lCreateSymbol(const std::string &name, const Type *returnType,
const std::vector<const Type *> &argTypes,
llvm::SmallVector<const Type *, 8> &argTypes,
const llvm::FunctionType *ftype, llvm::Function *func,
SymbolTable *symbolTable) {
SourcePos noPos;
@@ -199,7 +212,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
// bool, so just have a one-off override for that one...
if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") {
const Type *returnType = AtomicType::VaryingInt32;
std::vector<const Type *> argTypes;
llvm::SmallVector<const Type *, 8> argTypes;
argTypes.push_back(AtomicType::VaryingBool);
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
@@ -229,7 +242,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
// Iterate over the arguments and try to find their equivalent ispc
// types. Track if any of the arguments has an integer type.
bool anyIntArgs = false;
std::vector<const Type *> argTypes;
llvm::SmallVector<const Type *, 8> argTypes;
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
const llvm::Type *llvmArgType = ftype->getParamType(j);
const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
@@ -291,7 +304,7 @@ lCheckModuleIntrinsics(llvm::Module *module) {
if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
Assert(id != 0);
LLVM_TYPE_CONST llvm::Type *intrinsicType =
llvm::Type *intrinsicType =
llvm::Intrinsic::getType(*g->ctx, id);
intrinsicType = llvm::PointerType::get(intrinsicType, 0);
Assert(func->getType() == intrinsicType);
@@ -322,6 +335,8 @@ lSetInternalFunctions(llvm::Module *module) {
"__add_varying_double",
"__add_varying_int32",
"__add_varying_int64",
"__all",
"__any",
"__aos_to_soa3_float",
"__aos_to_soa3_float16",
"__aos_to_soa3_float4",
@@ -411,12 +426,17 @@ lSetInternalFunctions(llvm::Module *module) {
"__extract_int64",
"__extract_int8",
"__fastmath",
"__float_to_half_uniform",
"__float_to_half_varying",
"__floatbits_uniform_int32",
"__floatbits_varying_int32",
"__floor_uniform_double",
"__floor_uniform_float",
"__floor_varying_double",
"__floor_varying_float",
"__get_system_isa",
"__half_to_float_uniform",
"__half_to_float_varying",
"__insert_int16",
"__insert_int32",
"__insert_int64",
@@ -438,6 +458,12 @@ lSetInternalFunctions(llvm::Module *module) {
"__max_varying_uint32",
"__max_varying_uint64",
"__memory_barrier",
"__memcpy32",
"__memcpy64",
"__memmove32",
"__memmove64",
"__memset32",
"__memset64",
"__min_uniform_double",
"__min_uniform_float",
"__min_uniform_int32",
@@ -454,9 +480,11 @@ lSetInternalFunctions(llvm::Module *module) {
"__new_uniform",
"__new_varying32",
"__new_varying64",
"__none",
"__num_cores",
"__packed_load_active",
"__packed_store_active",
"__pause",
"__popcnt_int32",
"__popcnt_int64",
"__prefetch_read_uniform_1",
@@ -465,12 +493,13 @@ lSetInternalFunctions(llvm::Module *module) {
"__prefetch_read_uniform_nt",
"__rcp_uniform_float",
"__rcp_varying_float",
"__rdrand_i16",
"__rdrand_i32",
"__rdrand_i64",
"__reduce_add_double",
"__reduce_add_float",
"__reduce_add_int32",
"__reduce_add_int64",
"__reduce_add_uint32",
"__reduce_add_uint64",
"__reduce_equal_double",
"__reduce_equal_float",
"__reduce_equal_int32",
@@ -499,6 +528,7 @@ lSetInternalFunctions(llvm::Module *module) {
"__round_varying_float",
"__rsqrt_uniform_float",
"__rsqrt_varying_float",
"__set_system_isa",
"__sext_uniform_bool",
"__sext_varying_bool",
"__shuffle2_double",
@@ -527,6 +557,8 @@ lSetInternalFunctions(llvm::Module *module) {
"__sqrt_uniform_float",
"__sqrt_varying_double",
"__sqrt_varying_float",
"__stdlib_acosf",
"__stdlib_asinf",
"__stdlib_atan",
"__stdlib_atan2",
"__stdlib_atan2f",
@@ -606,11 +638,18 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
mTriple.getVendor() == bcTriple.getVendor());
bcModule->setTargetTriple(mTriple.str());
// This is also suboptimal; LLVM issues a warning about linking
// modules with different datalayouts, due to things like
// builtins-c.c having the regular IA layout, but the generic
// targets having a layout with 16-bit alignment for 16xi1 vectors.
// As long as builtins-c.c doesn't have any 16xi1 vector types
// (which it shouldn't!), then this override is safe.
if (g->target.isa == Target::GENERIC)
bcModule->setDataLayout(module->getDataLayout());
std::string linkError;
if (llvm::Linker::LinkModules(module, bcModule,
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::Linker::DestroySource,
#endif // LLVM_3_0
&linkError))
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
lSetInternalFunctions(module);
@@ -627,15 +666,37 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
static void
lDefineConstantInt(const char *name, int val, llvm::Module *module,
SymbolTable *symbolTable) {
Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32,
SC_STATIC);
pw->constValue = new ConstExpr(pw->type, val, SourcePos());
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
Symbol *sym =
new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType(),
SC_STATIC);
sym->constValue = new ConstExpr(sym->type, val, SourcePos());
llvm::Type *ltype = LLVMTypes::Int32Type;
llvm::Constant *linit = LLVMInt32(val);
pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
llvm::GlobalValue::InternalLinkage,
linit, pw->name.c_str());
symbolTable->AddVariable(pw);
// Use WeakODRLinkage rather than InternalLinkage so that a definition
// survives even if it's not used in the module, so that the symbol is
// there in the debugger.
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
linit, name);
symbolTable->AddVariable(sym);
if (m->diBuilder != NULL) {
llvm::DIFile file;
llvm::DIType diType = sym->type->GetDIType(file);
Assert(diType.Verify());
// FIXME? DWARF says that this (and programIndex below) should
// have the DW_AT_artificial attribute. It's not clear if this
// matters for anything though.
llvm::DIGlobalVariable var =
m->diBuilder->createGlobalVariable(name,
file,
0 /* line */,
diType,
true /* static */,
sym->storagePtr);
Assert(var.Verify());
}
}
@@ -643,13 +704,17 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
static void
lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
SymbolTable *symbolTable) {
std::vector<const Type *> args;
llvm::SmallVector<const Type *, 8> args;
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
llvm::Function *func = module->getFunction(name);
Assert(func != NULL); // it should be declared already...
#if defined(LLVM_3_2)
func->addFnAttr(llvm::Attributes::AlwaysInline);
#else
func->addFnAttr(llvm::Attribute::AlwaysInline);
#endif
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
@@ -661,20 +726,37 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
static void
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
Symbol *pidx = new Symbol("programIndex", SourcePos(),
AtomicType::VaryingConstInt32, SC_STATIC);
Symbol *sym =
new Symbol("programIndex", SourcePos(),
AtomicType::VaryingInt32->GetAsConstType(), SC_STATIC);
int pi[ISPC_MAX_NVEC];
for (int i = 0; i < g->target.vectorWidth; ++i)
pi[i] = i;
pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos());
sym->constValue = new ConstExpr(sym->type, pi, SourcePos());
LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32VectorType;
llvm::Type *ltype = LLVMTypes::Int32VectorType;
llvm::Constant *linit = LLVMInt32Vector(pi);
pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true,
llvm::GlobalValue::InternalLinkage, linit,
pidx->name.c_str());
symbolTable->AddVariable(pidx);
// See comment in lDefineConstantInt() for why WeakODRLinkage is used here
llvm::GlobalValue::LinkageTypes linkage = g->generateDebuggingSymbols ?
llvm::GlobalValue::WeakODRLinkage : llvm::GlobalValue::InternalLinkage;
sym->storagePtr = new llvm::GlobalVariable(*module, ltype, true, linkage,
linit, sym->name.c_str());
symbolTable->AddVariable(sym);
if (m->diBuilder != NULL) {
llvm::DIFile file;
llvm::DIType diType = sym->type->GetDIType(file);
Assert(diType.Verify());
llvm::DIGlobalVariable var =
m->diBuilder->createGlobalVariable(sym->name.c_str(),
file,
0 /* line */,
diType,
false /* static */,
sym->storagePtr);
Assert(var.Verify());
}
}
@@ -756,6 +838,26 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
FATAL("logic error in DefineStdlib");
}
break;
case Target::AVX11:
switch (g->target.vectorWidth) {
case 8:
extern unsigned char builtins_bitcode_avx11[];
extern int builtins_bitcode_avx11_length;
AddBitcodeToModule(builtins_bitcode_avx11,
builtins_bitcode_avx11_length,
module, symbolTable);
break;
case 16:
extern unsigned char builtins_bitcode_avx11_x2[];
extern int builtins_bitcode_avx11_x2_length;
AddBitcodeToModule(builtins_bitcode_avx11_x2,
builtins_bitcode_avx11_x2_length,
module, symbolTable);
break;
default:
FATAL("logic error in DefineStdlib");
}
break;
case Target::AVX2:
switch (g->target.vectorWidth) {
case 8:
@@ -799,6 +901,20 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
builtins_bitcode_generic_16_length,
module, symbolTable);
break;
case 32:
extern unsigned char builtins_bitcode_generic_32[];
extern int builtins_bitcode_generic_32_length;
AddBitcodeToModule(builtins_bitcode_generic_32,
builtins_bitcode_generic_32_length,
module, symbolTable);
break;
case 64:
extern unsigned char builtins_bitcode_generic_64[];
extern int builtins_bitcode_generic_64_length;
AddBitcodeToModule(builtins_bitcode_generic_64,
builtins_bitcode_generic_64_length,
module, symbolTable);
break;
case 1:
extern unsigned char builtins_bitcode_generic_1[];
extern int builtins_bitcode_generic_1_length;
@@ -831,10 +947,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
symbolTable);
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
symbolTable);
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
symbolTable);
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
module, symbolTable);
lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
lDefineConstantInt("__have_native_half", g->target.hasHalf, module,
symbolTable);
lDefineConstantInt("__have_native_rand", g->target.hasRand, module,
symbolTable);
lDefineConstantInt("__have_native_transcendentals", g->target.hasTranscendentals,
module, symbolTable);
if (includeStdlibISPC) {


@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -59,22 +59,39 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
typedef int Bool;
#define PRINT_SCALAR(fmt, type) \
printf(fmt, *((type *)ptr)); \
#define PRINT_BUF_SIZE 4096
#define APPEND(str) \
do { \
int offset = bufp - &printString[0]; \
*bufp = '\0'; \
strncat(bufp, str, PRINT_BUF_SIZE-offset); \
bufp += strlen(str); \
if (bufp >= &printString[PRINT_BUF_SIZE]) \
goto done; \
} while (0) /* eat semicolon */
#define PRINT_SCALAR(fmt, type) \
sprintf(tmpBuf, fmt, *((type *)ptr)); \
APPEND(tmpBuf); \
break
#define PRINT_VECTOR(fmt, type) \
putchar('['); \
*bufp++ = '['; \
if (bufp == &printString[PRINT_BUF_SIZE]) break; \
for (int i = 0; i < width; ++i) { \
/* only print the value if the current lane is executing */ \
if (mask & (1<<i)) \
printf(fmt, ((type *)ptr)[i]); \
if (mask & (1ull<<i)) \
sprintf(tmpBuf, fmt, ((type *)ptr)[i]); \
else \
printf("((" fmt "))", ((type *)ptr)[i]); \
putchar(i != width-1 ? ',' : ']'); \
sprintf(tmpBuf, "((" fmt "))", ((type *)ptr)[i]); \
APPEND(tmpBuf); \
*bufp++ = (i != width-1 ? ',' : ']'); \
} \
break
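The do { ... } while (0) wrapper on APPEND() (the /* eat semicolon */ comment) is the usual way to make a multi-statement macro behave as a single statement, so it can sit inside an if/else without braces, which is exactly how the 'B' case later in this file uses it. A small, self-contained illustration of the idiom; the macro and function names here are made up, not ispc code:
#include <stdio.h>
#define LOG_EXAMPLE(s)  do { fputs(s, stderr); fputc('\n', stderr); } while (0)
void idiomExample(int ok) {
    if (ok)
        LOG_EXAMPLE("ok");       /* expands to one statement, so the else still binds */
    else
        LOG_EXAMPLE("not ok");
}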
@@ -89,16 +106,18 @@ typedef int Bool;
@param mask Current lane mask when the print statement is called
@param args Array of pointers to the values to be printed
*/
void __do_print(const char *format, const char *types, int width, int mask,
void __do_print(const char *format, const char *types, int width, uint64_t mask,
void **args) {
if (mask == 0)
return;
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
char *bufp = &printString[0];
char tmpBuf[256];
int argCount = 0;
while (*format) {
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
// Format strings are just single percent signs.
if (*format != '%')
putchar(*format);
if (*format != '%') {
*bufp++ = *format;
}
else {
if (*types) {
void *ptr = args[argCount++];
@@ -107,17 +126,22 @@ void __do_print(const char *format, const char *types, int width, int mask,
// printf() formatting string.
switch (*types) {
case 'b': {
printf("%s", *((Bool *)ptr) ? "true" : "false");
sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
APPEND(tmpBuf);
break;
}
case 'B': {
putchar('[');
*bufp++ = '[';
if (bufp == &printString[PRINT_BUF_SIZE])
break;
for (int i = 0; i < width; ++i) {
if (mask & (1<<i))
printf("%s", ((Bool *)ptr)[i] ? "true" : "false");
if (mask & (1ull << i)) {
sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
APPEND(tmpBuf);
}
else
printf("_________");
putchar(i != width-1 ? ',' : ']');
APPEND("_________");
*bufp++ = (i != width-1) ? ',' : ']';
}
break;
}
@@ -136,14 +160,18 @@ void __do_print(const char *format, const char *types, int width, int mask,
case 'p': PRINT_SCALAR("%p", void *);
case 'P': PRINT_VECTOR("%p", void *);
default:
printf("UNKNOWN TYPE ");
putchar(*types);
APPEND("UNKNOWN TYPE ");
*bufp++ = *types;
}
++types;
}
}
++format;
}
done:
*bufp = '\0';
fputs(printString, stdout);
fflush(stdout);
}
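For orientation, a hedged sketch of what a call into the new __do_print() entry point might look like. ispc emits these calls itself; the type-code string ('I' standing for a varying int32) is assumed by analogy with the 'b'/'B' pair handled above and is illustrative only.
#include <stdint.h>
extern "C" void __do_print(const char *format, const char *types, int width,
                           uint64_t mask, void **args);
void printExample() {
    // 4-wide vector with lanes 0 and 2 active (mask = 0b0101); inactive lanes
    // are printed wrapped in double parentheses, e.g. "x = [1,((2)),3,((4))]".
    int32_t vals[4] = { 1, 2, 3, 4 };
    void *args[1] = { (void *)vals };
    __do_print("x = %\n", "I", 4, 0x5ull, args);
}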


@@ -48,8 +48,8 @@ declare void @abort() noreturn
;; corresponding to one of the Target::ISA enumerant values that gives the
;; most capable ISA that the current system can run.
;;
;; Note: clang from LLVM 2.9 should be used if this is updated, for maximum
;; backwards compatibility for anyone building ispc with LLVM 2.9.
;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum
;; backwards compatibility for anyone building ispc with LLVM 3.0.
;;
;; #include <stdint.h>
;; #include <stdlib.h>
@@ -76,13 +76,19 @@ declare void @abort() noreturn
;; /* NOTE: the values returned below must be the same as the
;; corresponding enumerant values in Target::ISA. */
;; if ((info[2] & (1 << 28)) != 0) {
;; // AVX1 for sure. Do we have AVX2?
;; // Call cpuid with eax=7, ecx=0
;; __cpuid_count(info, 7, 0);
;; if ((info[1] & (1 << 5)) != 0)
;; return 3; // AVX2
;; else
;; return 2; // AVX1
;; if ((info[2] & (1 << 29)) != 0 && // F16C
;; (info[2] & (1 << 30)) != 0) { // RDRAND
;; // So far, so good. AVX2?
;; // Call cpuid with eax=7, ecx=0
;; int info2[4];
;; __cpuid_count(info2, 7, 0);
;; if ((info2[1] & (1 << 5)) != 0)
;; return 4;
;; else
;; return 3;
;; }
;; // Regular AVX
;; return 2;
;; }
;; else if ((info[2] & (1 << 19)) != 0)
;; return 1; // SSE4
@@ -92,41 +98,44 @@ declare void @abort() noreturn
;; abort();
;; }
%0 = type { i32, i32, i32, i32 }
define i32 @__get_system_isa() nounwind ssp {
define i32 @__get_system_isa() nounwind uwtable ssp {
entry:
%0 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
%asmresult9.i = extractvalue %0 %0, 2
%asmresult10.i = extractvalue %0 %0, 3
%and = and i32 %asmresult9.i, 268435456
%0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
%asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2
%asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
%and = and i32 %asmresult5.i, 268435456
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.else7, label %if.then
br i1 %cmp, label %if.else13, label %if.then
if.then: ; preds = %entry
%1 = tail call %0 asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult9.i24 = extractvalue %0 %1, 1
%and4 = lshr i32 %asmresult9.i24, 5
%2 = and i32 %and4, 1
%3 = or i32 %2, 2
%1 = and i32 %asmresult5.i, 1610612736
%2 = icmp eq i32 %1, 1610612736
br i1 %2, label %if.then7, label %return
if.then7: ; preds = %if.then
%3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1
%and10 = lshr i32 %asmresult4.i28, 5
%4 = and i32 %and10, 1
%5 = add i32 %4, 3
br label %return
if.else7: ; preds = %entry
%and10 = and i32 %asmresult9.i, 524288
%cmp11 = icmp eq i32 %and10, 0
br i1 %cmp11, label %if.else13, label %return
if.else13: ; preds = %entry
%and15 = and i32 %asmresult5.i, 524288
%cmp16 = icmp eq i32 %and15, 0
br i1 %cmp16, label %if.else18, label %return
if.else13: ; preds = %if.else7
%and16 = and i32 %asmresult10.i, 67108864
%cmp17 = icmp eq i32 %and16, 0
br i1 %cmp17, label %if.else19, label %return
if.else18: ; preds = %if.else13
%and20 = and i32 %asmresult6.i, 67108864
%cmp21 = icmp eq i32 %and20, 0
br i1 %cmp21, label %if.else23, label %return
if.else19: ; preds = %if.else13
if.else23: ; preds = %if.else18
tail call void @abort() noreturn nounwind
unreachable
return: ; preds = %if.else13, %if.else7, %if.then
%retval.0 = phi i32 [ %3, %if.then ], [ 1, %if.else7 ], [ 0, %if.else13 ]
return: ; preds = %if.else18, %if.else13, %if.then7, %if.then
%retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ]
ret i32 %retval.0
}
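Reassembling the comment fragments above, the logic the IR implements comes down to the following sketch. This is a hedged reconstruction, not the literal source the IR was generated from: it assumes GCC/Clang's __cpuid/__cpuid_count macros from <cpuid.h>, and the return values 0-4 are the Target::ISA enumerants mentioned in the comments.
#include <cpuid.h>
#include <stdlib.h>
static int getSystemISAExample(void) {
    unsigned int eax, ebx, ecx, edx;
    __cpuid(1, eax, ebx, ecx, edx);
    if (ecx & (1u << 28)) {                              // AVX
        if ((ecx & (1u << 29)) && (ecx & (1u << 30))) {  // F16C and RDRAND
            unsigned int eax7, ebx7, ecx7, edx7;
            __cpuid_count(7, 0, eax7, ebx7, ecx7, edx7);
            return (ebx7 & (1u << 5)) ? 4 : 3;           // AVX2 : AVX 1.1
        }
        return 2;                                        // regular AVX
    }
    if (ecx & (1u << 19))
        return 1;                                        // SSE4
    if (edx & (1u << 26))
        return 0;                                        // SSE2
    abort();
}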


@@ -254,10 +254,10 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -158,13 +158,13 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <16 x float> @__max_varying_float(<16 x float>,
<16 x float>) nounwind readonly alwaysinline {
<16 x float>) nounwind readonly alwaysinline {
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
ret <16 x float> %call
}
define <16 x float> @__min_varying_float(<16 x float>,
<16 x float>) nounwind readonly alwaysinline {
<16 x float>) nounwind readonly alwaysinline {
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
ret <16 x float> %call
}
@@ -175,7 +175,7 @@ define <16 x float> @__min_varying_float(<16 x float>,
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -186,9 +186,57 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp eq i32 %v, 65535
ret i1 %cmp
}
define i1 @__none(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
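All three reductions build the same 16-bit movmsk value and differ only in the final comparison; in scalar terms they reduce to the following model (illustrative only, not ispc code):
#include <stdint.h>
// Scalar model of the mask reductions above for the 16-wide target.
static inline bool anyExample(uint32_t movmsk16)  { return movmsk16 != 0; }
static inline bool allExample(uint32_t movmsk16)  { return movmsk16 == 0xffff; }  // 65535
static inline bool noneExample(uint32_t movmsk16) { return movmsk16 == 0; }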
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops
@@ -224,7 +272,7 @@ reduce_equal(16)
;; horizontal int32 ops
define <16 x i32> @__add_varying_int32(<16 x i32>,
<16 x i32>) nounwind readnone alwaysinline {
<16 x i32>) nounwind readnone alwaysinline {
%s = add <16 x i32> %0, %1
ret <16 x i32> %s
}
@@ -252,11 +300,6 @@ define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
define i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<16 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
@@ -334,11 +377,6 @@ define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
define i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
%r = call i64 @__reduce_add_int64(<16 x i64> %v)
ret i64 %r
}
define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
@@ -352,19 +390,14 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(16, i8, 8)
load_and_broadcast(16, i16, 16)
load_and_broadcast(16, i32, 32)
load_and_broadcast(16, i64, 64)
; no masked load instruction for i8 and i16 types??
masked_load(16, i8, 8, 1)
masked_load(16, i16, 16, 2)
masked_load(i8, 1)
masked_load(i16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <16 x i32> %mask to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -382,7 +415,7 @@ define <16 x i32> @__masked_load_32(i8 *, <16 x i32> %mask) nounwind alwaysinlin
}
define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
define <16 x i64> @__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -416,6 +449,7 @@ define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
ret <16 x i64> %val
}
masked_load_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
@@ -423,15 +457,15 @@ define <16 x i64> @__masked_load_64(i8 *, <16 x i32> %mask) nounwind alwaysinlin
; FIXME: there is no AVX instruction for these, but we could be clever
; by packing the bits down and setting the last 3/4 or half, respectively,
; of the mask to zero... Not sure if this would be a win in the end
gen_masked_store(16, i8, 8)
gen_masked_store(16, i16, 16)
gen_masked_store(i8)
gen_masked_store(i16)
; note that mask is the 2nd parameter, not the 3rd one!!
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
%ptr = bitcast <16 x i32> * %0 to i8 *
%val = bitcast <16 x i32> %1 to <16 x float>
%mask = bitcast <16 x i32> %2 to <16 x float>
@@ -453,8 +487,8 @@ define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
ret void
}
define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
<16 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
<16 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast <16 x i64> * %0 to i8 *
%val = bitcast <16 x i64> %1 to <16 x double>
@@ -492,14 +526,15 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
ret void
}
masked_store_float_double()
masked_store_blend_8_16_by_16()
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone
define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
%maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
%oldValue = load <16 x i32>* %0, align 4
%oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
@@ -536,8 +571,8 @@ define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
<4 x double>) nounwind readnone
define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
<16 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
<16 x i32> %mask) nounwind alwaysinline {
%oldValue = load <16 x i64>* %ptr, align 8
%old = bitcast <16 x i64> %oldValue to <16 x double>
%old0d = shufflevector <16 x double> %old, <16 x double> undef,
@@ -597,10 +632,12 @@ define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %new
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter
gen_scatter(16, i8)
gen_scatter(16, i16)
gen_scatter(16, i32)
gen_scatter(16, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -158,13 +158,13 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <8 x float> @__max_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
ret <8 x float> %call
}
define <8 x float> @__min_varying_float(<8 x float>,
<8 x float>) nounwind readonly alwaysinline {
<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
ret <8 x float> %call
}
@@ -175,10 +175,32 @@ define <8 x float> @__min_varying_float(<8 x float>,
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -239,11 +261,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
@@ -315,11 +332,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
define i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
ret i64 %r
}
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
@@ -333,19 +345,15 @@ define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
; no masked load instruction for i8 and i16 types??
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(i8, 1)
masked_load(i16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%floatmask = bitcast <8 x i32> %mask to <8 x float>
%floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
%retval = bitcast <8 x float> %floatval to <8 x i32>
@@ -353,7 +361,7 @@ define <8 x i32> @__masked_load_32(i8 *, <8 x i32> %mask) nounwind alwaysinline
}
define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
; double up masks, bitcast to doubles
%mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -372,19 +380,20 @@ define <8 x i64> @__masked_load_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
ret <8 x i64> %val
}
masked_load_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(i8)
gen_masked_store(i16)
; note that mask is the 2nd parameter, not the 3rd one!!
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
%ptr = bitcast <8 x i32> * %0 to i8 *
%val = bitcast <8 x i32> %1 to <8 x float>
%mask = bitcast <8 x i32> %2 to <8 x float>
@@ -392,8 +401,8 @@ define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
ret void
}
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
<8 x i32> %mask) nounwind alwaysinline {
%ptr = bitcast <8 x i64> * %0 to i8 *
%val = bitcast <8 x i64> %1 to <8 x double>
@@ -417,14 +426,13 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
}
masked_store_blend_8_16_by_8()
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
%oldValue = load <8 x i32>* %0, align 4
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
@@ -438,8 +446,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %i32mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
%mask = bitcast <8 x i32> %i32mask to <8 x float>
@@ -488,14 +496,17 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatter
gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt


@@ -31,6 +31,8 @@
include(`target-avx-x2.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -61,17 +63,19 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)


@@ -31,6 +31,8 @@
include(`target-avx.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -61,15 +63,19 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
ifelse(NO_HALF_DECLARES, `1', `', `
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

builtins/target-avx11-x2.ll (new file)

@@ -0,0 +1,132 @@
;; Copyright (c) 2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx-x2.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %r
}
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
'
)
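The uniform conversions above round-trip a single value through an 8-wide vector because vcvtph2ps/vcvtps2ph only exist as vector operations. With compiler intrinsics the uniform paths look roughly like this; a hedged sketch assuming an F16C-enabled compiler that provides the scalar _cvtsh_ss/_cvtss_sh intrinsics in <immintrin.h>:
#include <immintrin.h>
#include <stdint.h>
// Hedged scalar equivalents of __half_to_float_uniform / __float_to_half_uniform.
static inline float halfToFloatExample(uint16_t h) {
    return _cvtsh_ss(h);          // scalar F16C half -> float
}
static inline uint16_t floatToHalfExample(float f) {
    return _cvtss_sh(f, 0);       // 0 selects round-to-nearest-even
}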

builtins/target-avx11.ll (new file)

@@ -0,0 +1,115 @@
;; Copyright (c) 2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(`target-avx.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
%r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
ret <8 x float> %r
}
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
%r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
ret <8 x i16> %r
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,8 +29,16 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')
include(`target-avx-x2.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -66,6 +74,9 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -116,14 +127,435 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(16, i8)
gen_gather(16, i16)
gen_gather(16, i32)
gen_gather(16, i64)
declare void @llvm.trap() noreturn nounwind
; $1: type
; $2: var base name
define(`extract_4s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
')
; $1: type
; $2: var base name
define(`extract_8s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
define(`assemble_8s', `
%$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
; $5: v3
; $6: v4
define(`assemble_4s', `
%$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
assemble_8s($1, $2, $2_1, $2_2)
')
ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `
gen_gather(i8)
gen_gather(i16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_8s(i32, offsets)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_8s(i32, ptrs)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
define <16 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(i32, offsets)
extract_8s(float, mask)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
define <16 x float> @__gather32_float(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(float, mask)
extract_8s(i32, ptrs)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather64_float(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather32_i64(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather64_i64(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
define <16 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather32_double(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather64_double(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
')
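To make the m4 plumbing above concrete, here is a hand-expanded sketch of the split/gather/reassemble pattern that extract_8s and assemble_8s produce for a 16-wide i32 gather over 32-bit offsets. The scale is fixed at 1 for brevity and the value names are illustrative, not the ones the macros actually emit.
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8 *, <8 x i32>, <8 x i32>, i8) readonly nounwind
define <16 x i32> @gather16_i32_example(i8 * %ptr, <16 x i32> %offsets,
                                        <16 x i32> %mask) nounwind readonly {
  ; split offsets and mask into low/high 8-wide halves
  %off_lo = shufflevector <16 x i32> %offsets, <16 x i32> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %off_hi = shufflevector <16 x i32> %offsets, <16 x i32> undef,
      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %m_lo = shufflevector <16 x i32> %mask, <16 x i32> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %m_hi = shufflevector <16 x i32> %mask, <16 x i32> undef,
      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; two 8-wide hardware gathers, scale 1
  %v_lo = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
      <8 x i32> %off_lo, <8 x i32> %m_lo, i8 1)
  %v_hi = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
      <8 x i32> %off_hi, <8 x i32> %m_hi, i8 1)
  ; stitch the halves back into a 16-wide result
  %v = shufflevector <8 x i32> %v_lo, <8 x i32> %v_hi,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i32> %v
}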


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,8 +29,16 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')
include(`target-avx.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -66,6 +74,9 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -100,11 +111,323 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
declare void @llvm.trap() noreturn nounwind
define(`extract_4s', `
%$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
')
ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `
gen_gather(i8)
gen_gather(i16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale8)
ret <8 x i32> %v
}
define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}
define <8 x i32> @__gather32_i32(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs, <8 x i32> %vecmask, i8 1)
ret <8 x i32> %v
}
define <8 x i32> @__gather64_i32(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
define <8 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x float> %mask, i8 %scale8)
ret <8 x float> %v
}
define <8 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}
define <8 x float> @__gather32_float(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs, <8 x float> %mask, i8 1)
ret <8 x float> %v
}
define <8 x float> @__gather64_float(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather32_i64(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather64_i64(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
define <8 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather32_double(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather64_double(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
')
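As a readability aid, this scalar sketch spells out what each lane of the masked base+offset gathers above computes: the effective address is base plus offset times scale, and the load only happens when the lane's mask element has its sign bit set (the convention the AVX2 gather instructions use); otherwise the previous value is kept. Function and label names are illustrative and not part of the diff.
define i32 @gather_one_lane_reference(i8 * %ptr, i32 %offset, i32 %scale,
                                      i32 %mask, i32 %old) nounwind {
entry:
  ; effective address = base + offset * scale
  %scaled = mul i32 %offset, %scale
  %addr = getelementptr i8 * %ptr, i32 %scaled
  %iaddr = bitcast i8 * %addr to i32 *
  ; the lane is active iff the sign bit of its mask element is set
  %active = icmp slt i32 %mask, 0
  br i1 %active, label %do_load, label %done
do_load:
  %val = load i32 * %iaddr
  br label %done
done:
  %r = phi i32 [ %val, %do_load ], [ %old, %entry ]
  ret i32 %r
}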

builtins/target-generic-1.ll Executable file → Normal file

@@ -13,42 +13,44 @@ aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(1, i8, 8)
gen_masked_store(1, i16, 16)
gen_masked_store(1, i32, 32)
gen_masked_store(1, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(1, i8, 8)
load_and_broadcast(1, i16, 16)
load_and_broadcast(1, i32, 32)
load_and_broadcast(1, i64, 64)
masked_load(1, i8, 8, 1)
masked_load(1, i16, 16, 2)
masked_load(1, i32, 32, 4)
masked_load(1, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(1, i8)
gen_gather(1, i16)
gen_gather(1, i32)
gen_gather(1, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(1, i8)
gen_scatter(1, i16)
gen_scatter(1, i32)
gen_scatter(1, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
<1 x i32> %mask) nounwind readnone alwaysinline {
; %mv = trunc <1 x i32> %mask to <1 x i8>
; %notmask = xor <1 x i8> %mv, <i8 -1>
; %cleared_old = and <1 x i8> %0, %notmask
@@ -69,7 +71,7 @@ define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
}
define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
<1 x i32> %mask) nounwind readnone alwaysinline {
; %mv = trunc <1 x i32> %mask to <1 x i16>
; %notmask = xor <1 x i16> %mv, <i16 -1>
; %cleared_old = and <1 x i16> %0, %notmask
@@ -91,7 +93,7 @@ define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
<1 x i32> %mask) nounwind readnone alwaysinline {
; %notmask = xor <1 x i32> %mask, <i32 -1>
; %cleared_old = and <1 x i32> %0, %notmask
; %masked_new = and <1 x i32> %1, %mask
@@ -109,8 +111,9 @@ define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
ret <1 x i32> %r
}
define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
<1 x i32> %mask) nounwind readnone alwaysinline {
<1 x i32> %mask) nounwind readnone alwaysinline {
; %newmask = zext <1 x i32> %mask to <1 x i64>
; %notmask = xor <1 x i64> %newmask, <i64 -1>
; %cleared_old = and <1 x i64> %0, %notmask
@@ -131,7 +134,7 @@ define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
}
define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
<1 x i32> %mask) nounwind readnone alwaysinline {
<1 x i32> %mask) nounwind readnone alwaysinline {
; %v0 = bitcast <1 x float> %0 to <1 x i32>
; %v1 = bitcast <1 x float> %1 to <1 x i32>
; %r = call <1 x i32> @__vselect_i32(<1 x i32> %v0, <1 x i32> %v1, <1 x i32> %mask)
@@ -154,23 +157,23 @@ define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>,
define void @__masked_store_blend_i8(<1 x i8>* nocapture, <1 x i8>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i8> * %0, align 4
%newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask)
store <1 x i8> %newval, <1 x i8> * %0, align 4
ret void
}
define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>,
<1 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i16(<1 x i16>* nocapture, <1 x i16>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i16> * %0, align 4
%newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
store <1 x i16> %newval, <1 x i16> * %0, align 4
ret void
}
define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
define void @__masked_store_blend_i32(<1 x i32>* nocapture, <1 x i32>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i32> * %0, align 4
%newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask)
@@ -178,20 +181,43 @@ define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
ret void
}
define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
<1 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<1 x i64>* nocapture, <1 x i64>,
<1 x i32> %mask) nounwind alwaysinline {
%val = load <1 x i64> * %0, align 4
%newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask)
store <1 x i64> %newval, <1 x i64> * %0, align 4
ret void
}
define i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
masked_store_float_double()
define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp eq i32 %v, 1
ret i1 %cmp
}
define i1 @__none(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -476,11 +502,6 @@ define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
ret i32 %r
}
define i32 @__reduce_add_uint32(<1 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<1 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
%r = extractelement <1 x i32> %0, i32 0
ret i32 %r
@@ -932,4 +953,3 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
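A small usage sketch for the mask reductions added to this file above: generated code can call __any() on the current execution mask and branch around a masked block when every program instance is off. The caller below is illustrative; only the __any() declaration mirrors the 1-wide definition from the diff.
declare i1 @__any(<1 x i32>) nounwind readnone
define void @masked_region_example(<1 x i32> %mask) nounwind {
entry:
  %go = call i1 @__any(<1 x i32> %mask)
  br i1 %go, label %body, label %done
body:
  ; masked work for the active program instances would go here
  br label %done
done:
  ret void
}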


@@ -0,0 +1,33 @@
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`32')
include(`target-generic-common.ll')
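This new file (and the 64-wide companion that follows) only pins the vector width before pulling in the shared generic target; after m4 expansion every WIDTH-parameterized declaration in target-generic-common.ll becomes concrete. One example, taken from a declaration that appears later on this page:
; with define(`WIDTH',`32') in effect, m4 turns
;   declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
; into
declare <32 x i32> @__smear_i32(i32) nounwind readnone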


@@ -0,0 +1,33 @@
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are
;; met:
;;
;; * Redistributions of source code must retain the above copyright
;; notice, this list of conditions and the following disclaimer.
;;
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;;
;; * Neither the name of Intel Corporation nor the names of its
;; contributors may be used to endorse or promote products derived from
;; this software without specific prior written permission.
;;
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`WIDTH',`64')
include(`target-generic-common.ll')


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,12 +29,18 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32";
define(`MASK',`i1')
define(`HAVE_GATHER',`1')
define(`HAVE_SCATTER',`1')
include(`util.m4')
stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; broadcast/rotate/shuffle
@@ -46,6 +52,20 @@ declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
declare <WIDTH x float> @__setzero_float() nounwind readnone
declare <WIDTH x double> @__setzero_double() nounwind readnone
declare <WIDTH x i8> @__setzero_i8() nounwind readnone
declare <WIDTH x i16> @__setzero_i16() nounwind readnone
declare <WIDTH x i32> @__setzero_i32() nounwind readnone
declare <WIDTH x i64> @__setzero_i64() nounwind readnone
declare <WIDTH x float> @__undef_float() nounwind readnone
declare <WIDTH x double> @__undef_double() nounwind readnone
declare <WIDTH x i8> @__undef_i8() nounwind readnone
declare <WIDTH x i16> @__undef_i16() nounwind readnone
declare <WIDTH x i32> @__undef_i32() nounwind readnone
declare <WIDTH x i64> @__undef_i64() nounwind readnone
declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
declare <WIDTH x i8> @__broadcast_i8(<WIDTH x i8>, i32) nounwind readnone
@@ -201,7 +221,10 @@ declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reductions
declare i32 @__movmsk(<WIDTH x i1>) nounwind readnone
declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
declare i1 @__any(<WIDTH x i1>) nounwind readnone
declare i1 @__all(<WIDTH x i1>) nounwind readnone
declare i1 @__none(<WIDTH x i1>) nounwind readnone
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
@@ -211,7 +234,6 @@ declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_add_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
@@ -223,34 +245,48 @@ declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_add_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(WIDTH, i8, 8)
load_and_broadcast(WIDTH, i16, 16)
load_and_broadcast(WIDTH, i32, 32)
load_and_broadcast(WIDTH, i64, 64)
declare <WIDTH x i8> @__masked_load_8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i8> @__masked_load_i8(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i16> @__masked_load_i16(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i32> @__masked_load_i32(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x float> @__masked_load_float(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x i64> @__masked_load_i64(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare <WIDTH x double> @__masked_load_double(i8 * nocapture, <WIDTH x i1> %mask) nounwind readonly
declare void @__masked_store_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
declare void @__masked_store_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind
declare void @__masked_store_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_double(<WIDTH x double>* nocapture, <WIDTH x double>,
<WIDTH x i1> %mask) nounwind
ifelse(LLVM_VERSION, `LLVM_3_1svn',`
define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
ifelse(LLVM_VERSION, `LLVM_3_0', `
declare void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
declare void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>,
<WIDTH x i1> %mask) nounwind
', `
define void @__masked_store_blend_i8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i8> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
@@ -258,57 +294,64 @@ define void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
ret void
}
define void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind alwaysinline {
define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i16> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
store <WIDTH x i16> %v1, <WIDTH x i16> * %0
ret void
}
define void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind alwaysinline {
define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i32> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
store <WIDTH x i32> %v1, <WIDTH x i32> * %0
ret void
}
define void @__masked_store_blend_64(<WIDTH x i64>* nocapture,
define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
<WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x float> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x float> %1, <WIDTH x float> %v
store <WIDTH x float> %v1, <WIDTH x float> * %0
ret void
}
define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
<WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x i64> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
store <WIDTH x i64> %v1, <WIDTH x i64> * %0
ret void
}
',`
declare void @__masked_store_blend_8(<WIDTH x i8>* nocapture, <WIDTH x i8>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
<WIDTH x i1>) nounwind
declare void @__masked_store_blend_64(<WIDTH x i64>* nocapture, <WIDTH x i64>,
<WIDTH x i1> %mask) nounwind
define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
<WIDTH x double>, <WIDTH x i1>) nounwind alwaysinline {
%v = load <WIDTH x double> * %0
%v1 = select <WIDTH x i1> %2, <WIDTH x double> %1, <WIDTH x double> %v
store <WIDTH x double> %v1, <WIDTH x double> * %0
ret void
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
@@ -318,7 +361,9 @@ declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,
gather_scatter(i8)
gather_scatter(i16)
gather_scatter(i32)
gather_scatter(float)
gather_scatter(i64)
gather_scatter(double)
declare i32 @__packed_load_active(i32 * nocapture, <WIDTH x i32> * nocapture,
<WIDTH x i1>) nounwind
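For one concrete instantiation of the gather_scatter macro above, with $1 bound to float and WIDTH hypothetically set to 16, the first pair of declarations expands to the post-change signatures, in which a base pointer, a scale, and raw offsets replace the old precomputed-offset form:
declare <16 x float> @__gather_base_offsets32_float(i8 * nocapture, i32, <16 x i32>,
                                                    <16 x i1>) nounwind readonly
declare <16 x float> @__gather_base_offsets64_float(i8 * nocapture, i32, <16 x i64>,
                                                    <16 x i1>) nounwind readonly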


@@ -33,6 +33,7 @@ ctlztz()
define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -295,7 +295,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -309,7 +309,62 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
define <4 x float> @__vec4_add_float(<4 x float> %v0,
@@ -360,11 +415,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
@@ -397,7 +447,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone {
}
define <4 x i64> @__add_varying_int64(<4 x i64>,
<4 x i64>) nounwind readnone alwaysinline {
<4 x i64>) nounwind readnone alwaysinline {
%r = add <4 x i64> %0, %1
ret <4 x i64> %r
}
@@ -432,28 +482,30 @@ reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(8, i32, 32, 4)
masked_load(8, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float rounding
@@ -557,23 +609,23 @@ define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alway
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(8, i32, 32)
gen_masked_store(8, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_blend_8_16_by_8()
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
%val = load <8 x i32> * %0, align 4
%newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask)
store <8 x i32> %newval, <8 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
@@ -616,6 +668,8 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void
}
masked_store_float_double()
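The masked_store_blend_* routines above (and the masked_store_float_double() macro) all follow the same load/select/store shape. Here is a minimal reference sketch written directly over an i1 mask for clarity; the 8-wide target instead derives per-lane conditions from the sign bits of an <8 x i32> mask, and the function name is illustrative only.
define void @masked_store_blend_float_sketch(<8 x float>* nocapture %ptr, <8 x float> %new,
                                             <8 x i1> %mask) nounwind alwaysinline {
  ; read the old destination, keep old lanes where the mask is off
  %old = load <8 x float> * %ptr
  %blend = select <8 x i1> %mask, <8 x float> %new, <8 x float> %old
  store <8 x float> %blend, <8 x float> * %ptr
  ret void
}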
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -239,10 +239,32 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
@@ -281,18 +303,13 @@ define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
}
define double @__reduce_add_double(<4 x double>) nounwind readnone {
@@ -349,16 +366,16 @@ reduce_equal(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
%val = load <4 x i32> * %0, align 4
%newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
store <4 x i32> %newval, <4 x i32> * %0, align 4
ret void
}
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
@@ -400,6 +417,8 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -551,35 +570,37 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
masked_load(4, i8, 8, 1)
masked_load(4, i16, 16, 2)
masked_load(4, i32, 32, 4)
masked_load(4, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)


@@ -33,6 +33,7 @@ ctlztz()
define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -221,13 +221,13 @@ define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly al
; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>,
<8 x i32>) nounwind readonly alwaysinline {
<8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %call
}
define <8 x i32> @__max_varying_uint32(<8 x i32>,
<8 x i32>) nounwind readonly alwaysinline {
<8 x i32>) nounwind readonly alwaysinline {
binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %call
}
@@ -237,7 +237,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>,
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -251,7 +251,62 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
@@ -287,11 +342,6 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
%r = call i32 @__reduce_add_int32(<8 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
}
@@ -324,7 +374,7 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone {
}
define <4 x i64> @__add_varying_int64(<4 x i64>,
<4 x i64>) nounwind readnone alwaysinline {
<4 x i64>) nounwind readnone alwaysinline {
%r = add <4 x i64> %0, %1
ret <4 x i64> %r
}
@@ -359,28 +409,30 @@ reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
masked_load(8, i8, 8, 1)
masked_load(8, i16, 16, 2)
masked_load(8, i32, 32, 4)
masked_load(8, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float rounding
@@ -443,18 +495,18 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(8, i32, 32)
gen_masked_store(8, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_blend_8_16_by_8()
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32> %mask) nounwind alwaysinline {
; do two 4-wide blends with blendvps
%mask_as_float = bitcast <8 x i32> %mask to <8 x float>
%mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
@@ -483,8 +535,8 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
ret void
}
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %mask) nounwind alwaysinline {
; implement this as 4 blends of <4 x i32>s, which are actually bitcast
; <2 x i64>s...
@@ -550,6 +602,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
ret void
}
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -271,10 +271,32 @@ define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alway
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
ret i32 %v
%v64 = zext i32 %v to i64
ret i64 %v64
}
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
@@ -312,18 +334,13 @@ define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
}
define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
%r = call i32 @__reduce_add_int32(<4 x i32> %v)
ret i32 %r
}
define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
}
define double @__reduce_add_double(<4 x double>) nounwind readnone {
@@ -383,8 +400,8 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone
define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
<4 x i32> %mask) nounwind alwaysinline {
%mask_as_float = bitcast <4 x i32> %mask to <4 x float>
%oldValue = load <4 x i32>* %0, align 4
%oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
@@ -398,8 +415,8 @@ define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
}
define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %i32mask) nounwind alwaysinline {
define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
<4 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <4 x i64>* %ptr, align 8
%mask = bitcast <4 x i32> %i32mask to <4 x float>
@@ -450,35 +467,39 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
gen_masked_store(i8)
gen_masked_store(i16)
gen_masked_store(i32)
gen_masked_store(i64)
masked_store_float_double()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
masked_load(4, i8, 8, 1)
masked_load(4, i16, 16, 2)
masked_load(4, i32, 32, 4)
masked_load(4, i64, 64, 8)
masked_load(i8, 1)
masked_load(i16, 2)
masked_load(i32, 4)
masked_load(float, 4)
masked_load(i64, 8)
masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)
gen_scatter(i8)
gen_scatter(i16)
gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -17,7 +17,7 @@ syn keyword ispcStatement cbreak ccontinue creturn launch print reference soa sy
syn keyword ispcConditional cif
syn keyword ispcRepeat cdo cfor cwhile
syn keyword ispcBuiltin programCount programIndex
syn keyword ispcType export int8 int16 int32 int64
syn keyword ispcType export uniform varying int8 int16 int32 int64
" Default highlighting
command -nargs=+ HiLink hi def link <args>

8
contrib/ispc.vim.README (new file)

@@ -0,0 +1,8 @@
To install vim syntax highlighting for ispc files:
1) Copy ispc.vim into ~/.vim/syntax/ispc.vim (creating the directory if necessary)
2) Create a filetype for ispc files to correspond to that syntax file.
To do this, create ~/.vim/ftdetect/ispc.vim and append the following line to it:
au BufRead,BufNewFile *.ispc set filetype=ispc

1753
ctx.cpp

File diff suppressed because it is too large

117
ctx.h

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -40,10 +40,20 @@
#include "ispc.h"
#include <map>
#include <llvm/InstrTypes.h>
#include <llvm/Instructions.h>
#include <llvm/Analysis/DIBuilder.h>
#include <llvm/Analysis/DebugInfo.h>
#if defined(LLVM_3_1) || defined(LLVM_3_2)
#include <llvm/InstrTypes.h>
#include <llvm/Instructions.h>
#else
#include <llvm/IR/InstrTypes.h>
#include <llvm/IR/Instructions.h>
#endif
#if defined(LLVM_3_1)
#include <llvm/Analysis/DebugInfo.h>
#include <llvm/Analysis/DIBuilder.h>
#else
#include <llvm/DebugInfo.h>
#include <llvm/DIBuilder.h>
#endif
struct CFInfo;
@@ -153,16 +163,17 @@ public:
bool uniformControlFlow);
/** Informs FunctionEmitContext of the value of the mask at the start
of a loop body. */
void SetLoopMask(llvm::Value *mask);
of a loop body or switch statement. */
void SetBlockEntryMask(llvm::Value *mask);
/** Informs FunctionEmitContext that code generation for a loop is
finished. */
void EndLoop();
/** Indicates that code generation for a 'foreach' or 'foreach_tiled'
loop is about to start. */
void StartForeach();
/** Indicates that code generation for a 'foreach', 'foreach_tiled',
'foreach_active', or 'foreach_unique' loop is about to start. */
enum ForeachType { FOREACH_REGULAR, FOREACH_ACTIVE, FOREACH_UNIQUE };
void StartForeach(ForeachType ft);
void EndForeach();
/** Emit code for a 'break' statement in a loop. If doCoherenceCheck
@@ -230,6 +241,13 @@ public:
bool InForeachLoop() const;
/** Temporarily disables emission of performance warnings from gathers
and scatters from subsequent code. */
void DisableGatherScatterWarnings();
/** Reenables emission of gather/scatter performance warnings. */
void EnableGatherScatterWarnings();
void SetContinueTarget(llvm::BasicBlock *bb) { continueTarget = bb; }
/** Step through the code and find label statements; create a basic
@@ -241,6 +259,10 @@ public:
new basic block that it starts. */
llvm::BasicBlock *GetLabeledBasicBlock(const std::string &label);
/** Returns a vector of all labels in the context. This is
simply the key set of the labelMap */
std::vector<std::string> GetLabels();
/** Called to generate code for 'return' statement; value is the
expression in the return statement (if non-NULL), and
doCoherenceCheck indicates whether instructions should be generated
@@ -265,7 +287,7 @@ public:
llvm::Value *None(llvm::Value *mask);
/** Given a boolean mask value of type LLVMTypes::MaskType, return an
i32 value wherein the i'th bit is on if and only if the i'th lane
i64 value wherein the i'th bit is on if and only if the i'th lane
of the mask is on. */
llvm::Value *LaneMask(llvm::Value *mask);
@@ -331,7 +353,7 @@ public:
/** Emits debugging information for the function parameter represented
by sym. */
void EmitFunctionParameterDebugInfo(Symbol *sym);
void EmitFunctionParameterDebugInfo(Symbol *sym, int parameterNum);
/** @} */
/** @name IR instruction emission
@@ -373,25 +395,35 @@ public:
array, for pointer types). */
llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *BitCastInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *PtrToIntInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *IntToPtrInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Instruction *TruncInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Type *type, const char *name = NULL);
llvm::Instruction *FPCastInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Instruction *SExtInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
llvm::Instruction *ZExtInst(llvm::Value *value, llvm::Type *type,
const char *name = NULL);
/** Given two integer-typed values (but possibly one vector and the
other not, and or of possibly-different bit-widths), update their
values as needed so that the two have the same (more general)
type. */
void MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1);
/** Create a new slice pointer out of the given pointer to an soa type
and an integer offset to a slice within that type. */
llvm::Value *MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset);
/** These GEP methods are generalizations of the standard ones in LLVM;
they support both uniform and varying basePtr values as well as
uniform and varying index values (arrays of indices). Varying base
@@ -412,7 +444,8 @@ public:
the type of the pointer, though it may be NULL if the base pointer
is uniform. */
llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
const Type *ptrType, const char *name = NULL);
const Type *ptrType, const char *name = NULL,
const PointerType **resultPtrType = NULL);
/** Load from the memory location(s) given by lvalue, using the given
mask. The lvalue may be varying, in which case this corresponds to
@@ -430,7 +463,7 @@ public:
instruction is added at the start of the function in the entry
basic block; if it should be added to the current basic block, then
the atEntryBlock parameter should be false. */
llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
llvm::Value *AllocaInst(llvm::Type *llvmType,
const char *name = NULL, int align = 0,
bool atEntryBlock = true);
@@ -443,7 +476,14 @@ public:
varying, the given storeMask is used to mask the stores so that
they only execute for the active program instances. */
void StoreInst(llvm::Value *value, llvm::Value *ptr,
llvm::Value *storeMask, const Type *ptrType);
llvm::Value *storeMask, const Type *valueType,
const Type *ptrType);
/** Copy count bytes of memory from the location pointed to by src to
the location pointed to by dest. (src and dest must not be
overlapping.) */
void MemcpyInst(llvm::Value *dest, llvm::Value *src, llvm::Value *count,
llvm::Value *align = NULL);
void BranchInst(llvm::BasicBlock *block);
void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
@@ -460,7 +500,7 @@ public:
llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
const char *name = NULL);
llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
llvm::PHINode *PhiNode(llvm::Type *type, int count,
const char *name = NULL);
llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
llvm::Value *val1, const char *name = NULL);
@@ -531,9 +571,9 @@ private:
for error messages and debugging symbols. */
SourcePos funcStartPos;
/** If currently in a loop body, the value of the mask at the start of
the loop. */
llvm::Value *loopMask;
/** If currently in a loop body or switch statement, the value of the
mask at the start of it. */
llvm::Value *blockEntryMask;
/** If currently in a loop body or switch statement, this is a pointer
to memory to store a mask value that represents which of the lanes
@@ -607,12 +647,12 @@ private:
std::vector<CFInfo *> controlFlowInfo;
/** DIFile object corresponding to the source file where the current
function was defined (used for debugging info0. */
function was defined (used for debugging info). */
llvm::DIFile diFile;
/** DISubprogram corresponding to this function (used for debugging
info). */
llvm::DISubprogram diFunction;
llvm::DISubprogram diSubprogram;
/** These correspond to the current set of nested scopes in the
function. */
@@ -626,6 +666,10 @@ private:
tasks launched from the current function. */
llvm::Value *launchGroupHandlePtr;
/** Nesting count of the number of times calling code has disabled (and
not yet reenabled) gather/scatter performance warnings. */
int disableGSWarningCount;
std::map<std::string, llvm::BasicBlock *> labelMap;
static bool initLabelBBlocks(ASTNode *node, void *data);
@@ -646,12 +690,19 @@ private:
CFInfo *popCFState();
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
llvm::Value *mask);
void scatter(llvm::Value *value, llvm::Value *ptr, const Type *valueType,
const Type *ptrType, llvm::Value *mask);
void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
llvm::Value *mask);
llvm::Value *gather(llvm::Value *ptr, const Type *ptrType, llvm::Value *mask,
const char *name);
void storeUniformToSOA(llvm::Value *value, llvm::Value *ptr,
llvm::Value *mask, const Type *valueType,
const PointerType *ptrType);
llvm::Value *loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask,
const PointerType *ptrType, const char *name);
llvm::Value *gather(llvm::Value *ptr, const PointerType *ptrType,
llvm::Value *mask, const char *name);
llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
};

590
decl.cpp

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -33,7 +33,7 @@
/** @file decl.cpp
@brief Implementations of classes related to turning declarations into
symbols and types.
symbol names and types.
*/
#include "decl.h"
@@ -44,6 +44,7 @@
#include "stmt.h"
#include "expr.h"
#include <stdio.h>
#include <string.h>
#include <set>
static void
@@ -55,6 +56,8 @@ lPrintTypeQualifiers(int typeQualifiers) {
if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
if (typeQualifiers & TYPEQUAL_SIGNED) printf("signed ");
if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
if (typeQualifiers & TYPEQUAL_EXPORT) printf("export ");
if (typeQualifiers & TYPEQUAL_UNMASKED) printf("unmasked ");
}
@@ -69,12 +72,21 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
if ((typeQualifiers & TYPEQUAL_CONST) != 0)
type = type->GetAsConstType();
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
type = type->GetAsUniformType();
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
type = type->GetAsVaryingType();
if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) {
if (Type::Equal(type, AtomicType::Void))
Error(pos, "\"uniform\" qualifier is illegal with \"void\" type.");
else
type = type->GetAsUniformType();
}
else if ((typeQualifiers & TYPEQUAL_VARYING) != 0) {
if (Type::Equal(type, AtomicType::Void))
Error(pos, "\"varying\" qualifier is illegal with \"void\" type.");
else
type = type->GetAsVaryingType();
}
else
type = type->GetAsUnboundVariabilityType();
if (Type::Equal(type, AtomicType::Void) == false)
type = type->GetAsUnboundVariabilityType();
if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
@@ -84,15 +96,20 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
const Type *unsignedType = type->GetAsUnsignedType();
if (unsignedType != NULL)
type = unsignedType;
else
else {
const Type *resolvedType =
type->ResolveUnboundVariability(Variability::Varying);
Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
resolvedType->GetString().c_str());
}
}
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false) {
const Type *resolvedType =
type->ResolveUnboundVariability(Variability::Varying);
Error(pos, "\"signed\" qualifier is illegal with non-integer type "
"\"%s\".",
type->ResolveUnboundVariability(Type::Varying)->GetString().c_str());
"\"%s\".", resolvedType->GetString().c_str());
}
return type;
}
@@ -112,18 +129,59 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
const Type *
DeclSpecs::GetBaseType(SourcePos pos) const {
const Type *bt = baseType;
const Type *retType = baseType;
if (retType == NULL) {
Warning(pos, "No type specified in declaration. Assuming int32.");
retType = AtomicType::UniformInt32->GetAsUnboundVariabilityType();
}
if (vectorSize > 0) {
const AtomicType *atomicType = dynamic_cast<const AtomicType *>(bt);
const AtomicType *atomicType = CastType<AtomicType>(retType);
if (atomicType == NULL) {
Error(pos, "Only atomic types (int, float, ...) are legal for vector "
"types.");
return NULL;
}
bt = new VectorType(atomicType, vectorSize);
retType = new VectorType(atomicType, vectorSize);
}
return lApplyTypeQualifiers(typeQualifiers, bt, pos);
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
if (soaWidth > 0) {
const StructType *st = CastType<StructType>(retType);
if (st == NULL) {
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
"type \"%s\".", soaWidth, retType->GetString().c_str());
return NULL;
}
else if (soaWidth <= 0 || (soaWidth & (soaWidth - 1)) != 0) {
Error(pos, "soa<%d> width illegal. Value must be positive power "
"of two.", soaWidth);
return NULL;
}
if (st->IsUniformType()) {
Error(pos, "\"uniform\" qualifier and \"soa<%d>\" qualifier can't "
"both be used in a type declaration.", soaWidth);
return NULL;
}
else if (st->IsVaryingType()) {
Error(pos, "\"varying\" qualifier and \"soa<%d>\" qualifier can't "
"both be used in a type declaration.", soaWidth);
return NULL;
}
else
retType = st->GetAsSOAType(soaWidth);
if (soaWidth < g->target.vectorWidth)
PerformanceWarning(pos, "soa<%d> width smaller than gang size %d "
"currently leads to inefficient code to access "
"soa types.", soaWidth, g->target.vectorWidth);
}
return retType;
}
@@ -133,7 +191,6 @@ lGetStorageClassName(StorageClass storageClass) {
case SC_NONE: return "";
case SC_EXTERN: return "extern";
case SC_EXTERN_C: return "extern \"C\"";
case SC_EXPORT: return "export";
case SC_STATIC: return "static";
case SC_TYPEDEF: return "typedef";
default: FATAL("Unhandled storage class in lGetStorageClassName");
@@ -162,31 +219,30 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p)
: pos(p), kind(dk) {
child = NULL;
typeQualifiers = 0;
storageClass = SC_NONE;
arraySize = -1;
sym = NULL;
type = NULL;
initExpr = NULL;
}
void
Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
const Type *t = GetType(ds);
Symbol *sym = GetSymbol();
if (sym != NULL) {
sym->type = t;
sym->storageClass = ds->storageClass;
const Type *baseType = ds->GetBaseType(pos);
InitFromType(baseType, ds);
if (type == NULL) {
AssertPos(pos, m->errorCount > 0);
return;
}
}
storageClass = ds->storageClass;
Symbol *
Declarator::GetSymbol() const {
// The symbol lives at the last child in the chain, so walk down there
// and return the one there.
const Declarator *d = this;
while (d->child != NULL)
d = d->child;
return d->sym;
if (ds->declSpecList.size() > 0 &&
CastType<FunctionType>(type) == NULL) {
Error(pos, "__declspec specifiers for non-function type \"%s\" are "
"not used.", type->GetString().c_str());
}
}
@@ -196,11 +252,11 @@ Declarator::Print(int indent) const {
pos.Print();
lPrintTypeQualifiers(typeQualifiers);
Symbol *sym = GetSymbol();
if (sym != NULL)
printf("%s", sym->name.c_str());
printf("%s ", lGetStorageClassName(storageClass));
if (name.size() > 0)
printf("%s", name.c_str());
else
printf("(null symbol)");
printf("(unnamed)");
printf(", array size = %d", arraySize);
@@ -234,132 +290,157 @@ Declarator::Print(int indent) const {
}
Symbol *
Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
const FunctionType *type =
dynamic_cast<const FunctionType *>(GetType(ds));
if (type == NULL)
return NULL;
Symbol *declSym = GetSymbol();
Assert(declSym != NULL);
// Get the symbol for the function from the symbol table. (It should
// already have been added to the symbol table by AddGlobal() by the
// time we get here.)
Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
if (funSym != NULL)
// May be NULL due to error earlier in compilation
funSym->pos = pos;
// Walk down to the declarator for the function. (We have to get past
// the stuff that specifies the function's return type before we get to
// the function's declarator.)
Declarator *d = this;
while (d != NULL && d->kind != DK_FUNCTION)
d = d->child;
Assert(d != NULL);
for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
Symbol *sym = d->GetSymbolForFunctionParameter(i);
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
funArgs->push_back(sym);
}
if (funSym != NULL)
funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
return funSym;
}
const Type *
Declarator::GetType(const Type *base, DeclSpecs *ds) const {
void
Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
bool isTask = ((typeQualifiers & TYPEQUAL_TASK) != 0);
bool isExported = ((typeQualifiers & TYPEQUAL_EXPORT) != 0);
bool isConst = ((typeQualifiers & TYPEQUAL_CONST) != 0);
bool isUnmasked = ((typeQualifiers & TYPEQUAL_UNMASKED) != 0);
if (hasUniformQual && hasVaryingQual) {
Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
return NULL;
return;
}
if (kind != DK_FUNCTION && isTask)
if (kind != DK_FUNCTION && isTask) {
Error(pos, "\"task\" qualifier illegal in variable declaration.");
Type::Variability variability = Type::Unbound;
return;
}
if (kind != DK_FUNCTION && isUnmasked) {
Error(pos, "\"unmasked\" qualifier illegal in variable declaration.");
return;
}
if (kind != DK_FUNCTION && isExported) {
Error(pos, "\"export\" qualifier illegal in variable declaration.");
return;
}
Variability variability(Variability::Unbound);
if (hasUniformQual)
variability = Type::Uniform;
variability = Variability::Uniform;
else if (hasVaryingQual)
variability = Type::Varying;
variability = Variability::Varying;
const Type *type = base;
switch (kind) {
case DK_BASE:
if (kind == DK_BASE) {
// All of the type qualifiers should be in the DeclSpecs for the
// base declarator
Assert(typeQualifiers == 0);
Assert(child == NULL);
return type;
case DK_POINTER:
type = new PointerType(type, variability, isConst);
if (child != NULL)
return child->GetType(type, ds);
AssertPos(pos, typeQualifiers == 0);
AssertPos(pos, child == NULL);
type = baseType;
}
else if (kind == DK_POINTER) {
/* For now, any pointer to an SOA type gets the slice property; if
we add the capability to declare pointers as slices or not,
we'll want to set this based on a type qualifier here. */
const Type *ptrType = new PointerType(baseType, variability, isConst,
baseType->IsSOAType());
if (child != NULL) {
child->InitFromType(ptrType, ds);
type = child->type;
name = child->name;
}
else
return type;
break;
case DK_REFERENCE:
if (hasUniformQual)
type = ptrType;
}
else if (kind == DK_REFERENCE) {
if (hasUniformQual) {
Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
if (hasVaryingQual)
return;
}
if (hasVaryingQual) {
Error(pos, "\"varying\" qualifier is illegal to apply to references.");
if (isConst)
return;
}
if (isConst) {
Error(pos, "\"const\" qualifier is illegal to apply to references.");
return;
}
// The parser should disallow this already, but double check.
if (dynamic_cast<const ReferenceType *>(type) != NULL) {
if (CastType<ReferenceType>(baseType) != NULL) {
Error(pos, "References to references are illegal.");
return NULL;
return;
}
type = new ReferenceType(type);
if (child != NULL)
return child->GetType(type, ds);
const Type *refType = new ReferenceType(baseType);
if (child != NULL) {
child->InitFromType(refType, ds);
type = child->type;
name = child->name;
}
else
return type;
break;
type = refType;
}
else if (kind == DK_ARRAY) {
if (Type::Equal(baseType, AtomicType::Void)) {
Error(pos, "Arrays of \"void\" type are illegal.");
return;
}
if (CastType<ReferenceType>(baseType)) {
Error(pos, "Arrays of references (type \"%s\") are illegal.",
baseType->GetString().c_str());
return;
}
case DK_ARRAY:
type = new ArrayType(type, arraySize);
if (child)
return child->GetType(type, ds);
const Type *arrayType = new ArrayType(baseType, arraySize);
if (child != NULL) {
child->InitFromType(arrayType, ds);
type = child->type;
name = child->name;
}
else
return type;
break;
case DK_FUNCTION: {
std::vector<const Type *> args;
std::vector<std::string> argNames;
std::vector<ConstExpr *> argDefaults;
std::vector<SourcePos> argPos;
type = arrayType;
}
else if (kind == DK_FUNCTION) {
llvm::SmallVector<const Type *, 8> args;
llvm::SmallVector<std::string, 8> argNames;
llvm::SmallVector<Expr *, 8> argDefaults;
llvm::SmallVector<SourcePos, 8> argPos;
// Loop over the function arguments and store the names, types,
// default values (if any), and source file positions each one in
// the corresponding vector.
for (unsigned int i = 0; i < functionParams.size(); ++i) {
Declaration *d = functionParams[i];
Symbol *sym = GetSymbolForFunctionParameter(i);
if (d == NULL) {
AssertPos(pos, m->errorCount > 0);
continue;
}
if (d->declarators.size() == 0) {
// function declaration like foo(float), w/o a name for the
// parameter; wire up a placeholder Declarator for it
d->declarators.push_back(new Declarator(DK_BASE, pos));
d->declarators[0]->InitFromDeclSpecs(d->declSpecs);
}
AssertPos(pos, d->declarators.size() == 1);
Declarator *decl = d->declarators[0];
if (decl == NULL || decl->type == NULL) {
AssertPos(pos, m->errorCount > 0);
continue;
}
if (decl->name == "") {
// Give a name to any anonymous parameter declarations
char buf[32];
sprintf(buf, "__anon_parameter_%d", i);
decl->name = buf;
}
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
if (d->declSpecs->storageClass != SC_NONE)
Error(sym->pos, "Storage class \"%s\" is illegal in "
Error(decl->pos, "Storage class \"%s\" is illegal in "
"function parameter declaration for parameter \"%s\".",
lGetStorageClassName(d->declSpecs->storageClass),
sym->name.c_str());
decl->name.c_str());
if (Type::Equal(decl->type, AtomicType::Void)) {
Error(decl->pos, "Parameter with type \"void\" illegal in function "
"parameter list.");
decl->type = NULL;
}
const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
const ArrayType *at = CastType<ArrayType>(decl->type);
if (at != NULL) {
// As in C, arrays are passed to functions as pointers to
// their element type. We'll just immediately make this
@@ -369,144 +450,124 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
// report this differently than it was originally declared
// in the function, but it's not clear that this is a
// significant problem.)
sym->type = PointerType::GetUniform(at->GetElementType());
const Type *targetType = at->GetElementType();
if (targetType == NULL) {
AssertPos(pos, m->errorCount > 0);
return;
}
decl->type = PointerType::GetUniform(targetType, at->IsSOAType());
// Make sure there are no unsized arrays (other than the
// first dimension) in function parameter lists.
at = dynamic_cast<const ArrayType *>(at->GetElementType());
at = CastType<ArrayType>(targetType);
while (at != NULL) {
if (at->GetElementCount() == 0)
Error(sym->pos, "Arrays with unsized dimensions in "
Error(decl->pos, "Arrays with unsized dimensions in "
"dimensions after the first one are illegal in "
"function parameter lists.");
at = dynamic_cast<const ArrayType *>(at->GetElementType());
at = CastType<ArrayType>(at->GetElementType());
}
}
args.push_back(sym->type);
argNames.push_back(sym->name);
argPos.push_back(sym->pos);
args.push_back(decl->type);
argNames.push_back(decl->name);
argPos.push_back(decl->pos);
ConstExpr *init = NULL;
if (d->declarators.size()) {
// Try to find an initializer expression; if there is one,
// it lives down to the base declarator.
Declarator *decl = d->declarators[0];
while (decl->child != NULL) {
Assert(decl->initExpr == NULL);
Expr *init = NULL;
// Try to find an initializer expression.
while (decl != NULL) {
if (decl->initExpr != NULL) {
decl->initExpr = TypeCheck(decl->initExpr);
decl->initExpr = Optimize(decl->initExpr);
if (decl->initExpr != NULL) {
init = dynamic_cast<ConstExpr *>(decl->initExpr);
if (init == NULL)
init = dynamic_cast<NullPointerExpr *>(decl->initExpr);
if (init == NULL)
Error(decl->initExpr->pos, "Default value for parameter "
"\"%s\" must be a compile-time constant.",
decl->name.c_str());
}
break;
}
else
decl = decl->child;
}
if (decl->initExpr != NULL &&
(decl->initExpr = TypeCheck(decl->initExpr)) != NULL &&
(decl->initExpr = Optimize(decl->initExpr)) != NULL &&
(init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
Error(decl->initExpr->pos, "Default value for parameter "
"\"%s\" must be a compile-time constant.",
sym->name.c_str());
}
}
argDefaults.push_back(init);
}
const Type *returnType = type;
const Type *returnType = baseType;
if (returnType == NULL) {
Error(pos, "No return type provided in function declaration.");
return NULL;
return;
}
if (CastType<FunctionType>(returnType) != NULL) {
Error(pos, "Illegal to return function type from function.");
return;
}
bool isExported = ds && (ds->storageClass == SC_EXPORT);
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
if (isExported && isTask) {
Error(pos, "Function can't have both \"task\" and \"export\" "
"qualifiers");
return NULL;
return;
}
if (isExternC && isTask) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
"qualifiers");
return NULL;
return;
}
if (isExternC && isExported) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
"qualifiers");
return NULL;
return;
}
if (isUnmasked && isExported)
Warning(pos, "\"unmasked\" qualifier is redundant for exported "
"functions.");
if (child == NULL) {
AssertPos(pos, m->errorCount > 0);
return;
}
const Type *functionType =
const FunctionType *functionType =
new FunctionType(returnType, args, argNames, argDefaults,
argPos, isTask, isExported, isExternC);
functionType = functionType->ResolveUnboundVariability(Type::Varying);
return child->GetType(functionType, ds);
}
default:
FATAL("Unexpected decl kind");
return NULL;
}
argPos, isTask, isExported, isExternC, isUnmasked);
#if 0
// Make sure we actually have an array of structs ..
const StructType *childStructType =
dynamic_cast<const StructType *>(childType);
if (childStructType == NULL) {
Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
"type \"%s\".", soaWidth, childType->GetString().c_str());
return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
// handle any explicit __declspecs on the function
if (ds != NULL) {
for (int i = 0; i < (int)ds->declSpecList.size(); ++i) {
std::string str = ds->declSpecList[i].first;
SourcePos pos = ds->declSpecList[i].second;
if (str == "safe")
(const_cast<FunctionType *>(functionType))->isSafe = true;
else if (!strncmp(str.c_str(), "cost", 4)) {
int cost = atoi(str.c_str() + 4);
if (cost < 0)
Error(pos, "Negative function cost %d is illegal.",
cost);
(const_cast<FunctionType *>(functionType))->costOverride = cost;
}
else
Error(pos, "__declspec parameter \"%s\" unknown.", str.c_str());
}
else if ((soaWidth & (soaWidth - 1)) != 0) {
Error(pos, "soa<%d> width illegal. Value must be power of two.",
soaWidth);
return NULL;
}
else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
Error(pos, "soa<%d> width must evenly divide array size %d.",
soaWidth, arraySize);
return NULL;
}
return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
soaWidth);
#endif
}
const Type *
Declarator::GetType(DeclSpecs *ds) const {
const Type *baseType = ds->GetBaseType(pos);
const Type *type = GetType(baseType, ds);
return type;
}
Symbol *
Declarator::GetSymbolForFunctionParameter(int paramNum) const {
Assert(paramNum < (int)functionParams.size());
Declaration *d = functionParams[paramNum];
char buf[32];
Symbol *sym;
if (d->declarators.size() == 0) {
// function declaration like foo(float), w/o a name for
// the parameter
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, pos);
sym->type = d->declSpecs->GetBaseType(pos);
}
else {
Assert(d->declarators.size() == 1);
sym = d->declarators[0]->GetSymbol();
if (sym == NULL) {
// Handle more complex anonymous declarations like
// float (float **).
sprintf(buf, "__anon_parameter_%d", paramNum);
sym = new Symbol(buf, d->declarators[0]->pos);
sym->type = d->declarators[0]->GetType(d->declSpecs);
}
}
return sym;
}
child->InitFromType(functionType, ds);
type = child->type;
name = child->name;
}
}
///////////////////////////////////////////////////////////////////////////
// Declaration
@@ -537,18 +598,23 @@ Declaration::GetVariableDeclarations() const {
for (unsigned int i = 0; i < declarators.size(); ++i) {
Declarator *decl = declarators[i];
if (decl == NULL)
if (decl == NULL || decl->type == NULL) {
// Ignore earlier errors
Assert(m->errorCount > 0);
continue;
}
Symbol *sym = decl->GetSymbol();
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
if (Type::Equal(decl->type, AtomicType::Void))
Error(decl->pos, "\"void\" type variable illegal in declaration.");
else if (CastType<FunctionType>(decl->type) == NULL) {
decl->type = decl->type->ResolveUnboundVariability(Variability::Varying);
Symbol *sym = new Symbol(decl->name, decl->pos, decl->type,
decl->storageClass);
m->symbolTable->AddVariable(sym);
vars.push_back(VariableDeclaration(sym, decl->initExpr));
}
}
return vars;
}
@@ -559,18 +625,19 @@ Declaration::DeclareFunctions() {
for (unsigned int i = 0; i < declarators.size(); ++i) {
Declarator *decl = declarators[i];
if (decl == NULL)
if (decl == NULL || decl->type == NULL) {
// Ignore earlier errors
Assert(m->errorCount > 0);
continue;
}
Symbol *sym = decl->GetSymbol();
sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
const FunctionType *ftype = CastType<FunctionType>(decl->type);
if (ftype == NULL)
continue;
bool isInline = (declSpecs->typeQualifiers & TYPEQUAL_INLINE);
m->AddFunctionDeclaration(sym, isInline);
m->AddFunctionDeclaration(decl->name, ftype, decl->storageClass,
isInline, decl->pos);
}
}
@@ -584,13 +651,14 @@ Declaration::Print(int indent) const {
declarators[i]->Print(indent+4);
}
///////////////////////////////////////////////////////////////////////////
void
GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
std::vector<const Type *> *elementTypes,
std::vector<std::string> *elementNames,
std::vector<SourcePos> *elementPositions) {
llvm::SmallVector<const Type *, 8> *elementTypes,
llvm::SmallVector<std::string, 8> *elementNames,
llvm::SmallVector<SourcePos, 8> *elementPositions) {
std::set<std::string> seenNames;
for (unsigned int i = 0; i < sd.size(); ++i) {
const Type *type = sd[i]->type;
@@ -600,35 +668,41 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
// FIXME: making this fake little DeclSpecs here is really
// disgusting
DeclSpecs ds(type);
if (type->IsUniformType())
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
else if (type->IsVaryingType())
ds.typeQualifiers |= TYPEQUAL_VARYING;
if (Type::Equal(type, AtomicType::Void) == false) {
if (type->IsUniformType())
ds.typeQualifiers |= TYPEQUAL_UNIFORM;
else if (type->IsVaryingType())
ds.typeQualifiers |= TYPEQUAL_VARYING;
else if (type->GetSOAWidth() != 0)
ds.soaWidth = type->GetSOAWidth();
// FIXME: ds.vectorSize?
}
for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
Declarator *d = (*sd[i]->declarators)[j];
d->InitFromDeclSpecs(&ds);
Symbol *sym = d->GetSymbol();
if (Type::Equal(d->type, AtomicType::Void))
Error(d->pos, "\"void\" type illegal for struct member.");
const ArrayType *arrayType =
dynamic_cast<const ArrayType *>(sym->type);
if (arrayType != NULL && arrayType->GetElementCount() == 0) {
Error(d->pos, "Unsized arrays aren't allowed in struct "
"definitions.");
elementTypes->push_back(NULL);
}
else
elementTypes->push_back(sym->type);
elementTypes->push_back(d->type);
if (seenNames.find(sym->name) != seenNames.end())
if (seenNames.find(d->name) != seenNames.end())
Error(d->pos, "Struct member \"%s\" has same name as a "
"previously-declared member.", sym->name.c_str());
"previously-declared member.", d->name.c_str());
else
seenNames.insert(sym->name);
seenNames.insert(d->name);
elementNames->push_back(sym->name);
elementPositions->push_back(sym->pos);
elementNames->push_back(d->name);
elementPositions->push_back(d->pos);
}
}
for (int i = 0; i < (int)elementTypes->size() - 1; ++i) {
const ArrayType *arrayType = CastType<ArrayType>((*elementTypes)[i]);
if (arrayType != NULL && arrayType->GetElementCount() == 0)
Error((*elementPositions)[i], "Unsized arrays aren't allowed except "
"for the last member in a struct definition.");
}
}

60
decl.h

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -47,30 +47,21 @@
variables--here, that the declaration has the 'static' and 'uniform'
qualifiers, and that its basic type is 'int'. Then for each variable
declaration, the Declaration class holds an instance of a Declarator,
which in turn records the per-variable information like the symbol
name, array size (if any), initializer expression, etc.
which in turn records the per-variable information like the name, array
size (if any), initializer expression, etc.
*/
#ifndef ISPC_DECL_H
#define ISPC_DECL_H
#include "ispc.h"
#include <llvm/ADT/SmallVector.h>
struct VariableDeclaration;
class Declaration;
class Declarator;
enum StorageClass {
SC_NONE,
SC_EXTERN,
SC_EXPORT,
SC_STATIC,
SC_TYPEDEF,
SC_EXTERN_C
};
/* Multiple qualifiers can be provided with types in declarations;
therefore, they are set up so that they can be ANDed together into an
int. */
@@ -82,6 +73,8 @@ enum StorageClass {
#define TYPEQUAL_SIGNED (1<<4)
#define TYPEQUAL_UNSIGNED (1<<5)
#define TYPEQUAL_INLINE (1<<6)
#define TYPEQUAL_EXPORT (1<<7)
#define TYPEQUAL_UNMASKED (1<<8)
/** @brief Representation of the declaration specifiers in a declaration.
@@ -90,7 +83,8 @@ enum StorageClass {
*/
class DeclSpecs {
public:
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE, int tq = TYPEQUAL_NONE);
DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE,
int tq = TYPEQUAL_NONE);
void Print() const;
@@ -117,6 +111,8 @@ public:
SOA width specified. Otherwise this is zero.
*/
int soaWidth;
std::vector<std::pair<std::string, SourcePos> > declSpecList;
};
@@ -138,25 +134,11 @@ public:
Declarator(DeclaratorKind dk, SourcePos p);
/** Once a DeclSpecs instance is available, this method completes the
initialization of the Symbol, setting its Type accordingly.
initialization of the type member.
*/
void InitFromDeclSpecs(DeclSpecs *ds);
/** Get the actual type of the combination of Declarator and the given
DeclSpecs. If an explicit base type is provided, the declarator is
applied to that type; otherwise the base type from the DeclSpecs is
used. */
const Type *GetType(DeclSpecs *ds) const;
const Type *GetType(const Type *base, DeclSpecs *ds) const;
/** Returns the symbol corresponding to the function declared by this
declarator and symbols for its arguments in *args. */
Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
Symbol *GetSymbolForFunctionParameter(int paramNum) const;
/** Returns the symbol associated with the declarator. */
Symbol *GetSymbol() const;
void InitFromType(const Type *base, DeclSpecs *ds);
void Print(int indent) const;
@@ -177,18 +159,24 @@ public:
/** Type qualifiers provided with the declarator. */
int typeQualifiers;
StorageClass storageClass;
/** For array declarators, this gives the declared size of the array.
Unsized arrays have arraySize == 0. */
int arraySize;
/** Symbol associated with the declarator. */
Symbol *sym;
/** Name associated with the declarator. */
std::string name;
/** Initialization expression for the variable. May be NULL. */
Expr *initExpr;
/** Type of the declarator. This is NULL until InitFromDeclSpecs() or
InitFromType() is called. */
const Type *type;
/** For function declarations, this holds the Declaration *s for the
funciton's parameters. */
function's parameters. */
std::vector<Declaration *> functionParams;
};
@@ -233,8 +221,8 @@ struct StructDeclaration {
/** Given a set of StructDeclaration instances, this returns the types of
the elements of the corresponding struct and their names. */
extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
std::vector<const Type *> *elementTypes,
std::vector<std::string> *elementNames,
std::vector<SourcePos> *elementPositions);
llvm::SmallVector<const Type *, 8> *elementTypes,
llvm::SmallVector<std::string, 8> *elementNames,
llvm::SmallVector<SourcePos, 8> *elementPositions);
#endif // ISPC_DECL_H


@@ -1,3 +1,199 @@
=== v1.3.0 === (29 June 2012)
This is a major new release of ispc, with support for more compilation
targets and a number of additions to the language. As usual, the quality
of generated code has also been improved in a number of cases and a number
of small bugs have been fixed.
New targets:
* This release provides "beta" support for compiling to Intel® Xeon
Phi™ processor, code named Knights Corner, the first processor in
the Intel® Many Integrated Core Architecture. See
http://ispc.github.com/ispc.html#compiling-for-the-intel-xeon-phi-architecture
for more details on this support.
* This release also has an "avx1.1" target, which provides support for the
new instructions in the Intel Ivy Bridge microarchitecture.
New language features (a combined sketch follows this list):
* The foreach_active statement allows iteration over the active program
instances in a gang. (See
http://ispc.github.com/ispc.html#iteration-over-active-program-instances-foreach-active)
* foreach_unique allows iterating over subsets of program instances in a
gang that share the same value of a variable. (See
http://ispc.github.com/ispc.html#iteration-over-unique-elements-foreach-unique)
* An "unmasked" function qualifier and statement in the language allow
re-activating execution of all program instances in a gang. (See
http://ispc.github.com/ispc.html#re-establishing-the-execution-mask)
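As a rough illustration (the function and variable names here are invented
for this sketch, not taken from the release), the three constructs can be
used together along these lines:

    void tally(uniform int counts[], int bin, uniform int * uniform nActive) {
        foreach_active (i) {
            // The body runs once per active program instance, serially.
            ++(*nActive);
        }
        foreach_unique (b in bin) {
            // The body runs once per distinct value of 'bin' among the
            // active program instances; 'b' is uniform inside the body.
            ++counts[b];
        }
        unmasked {
            // All program instances are active for the statements here.
        }
    }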
Standard library updates:
* The seed_rng() function has been modified to take a "varying" seed value
when a varying RNGState is being initialized.
* An isnan() function has been added, to check for floating-point "not a
number" values.
* The float_to_srgb8() routine does high performance conversion of
floating-point color values to SRGB8 format.
Other changes:
* A number of bugfixes have been made for compiler crashes with malformed
programs.
* Floating-point comparisons are now "unordered", so that any comparison
where one of the operands is a "not a number" value returns false. (This
matches standard IEEE floating-point behavior.)
* The code generated for 'break' statements in "varying" loops has been
improved for some common cases.
* Compile time and compiler memory use have both been improved,
particularly for large input programs.
* A number of bugs have been fixed in the debugging information generated
by the compiler when the "-g" command-line flag is used.
=== v1.2.2 === (20 April 2012)
This release includes a number of small additions to functionality and a
number of bugfixes. New functionality includes:
* It's now possible to forward declare structures as in C/C++: "struct
Foo;". After such a declaration, structs with pointers to "Foo" and
functions that take pointers or references to Foo structs can be declared
without the entire definition of Foo being available (a short sketch follows this list).
* New built-in types size_t, ptrdiff_t, and [u]intptr_t are now available,
corresponding to the equivalent types in C.
* The standard library now provides atomic_swap*() and
atomic_compare_exchange*() functions for void * types.
* The C++ backend has seen a number of improvements to the quality and
readability of generated code.
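For instance (the names here are illustrative), the following sequence is
now accepted:

    struct Foo;                             // forward declaration
    void consume(uniform Foo * uniform p);  // only a pointer to Foo is needed here
    struct Foo { float value; };            // the full definition can come later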
A number of bugs have been fixed in this release as well. The most
significant are:
* Fixed a bug where nested loops could cause a compiler crash in some
circumstances (issues #240, and #229)
* Gathers could access invalid memory (and cause the program to crash) in
some circumstances (#235)
* References to temporary values are now handled properly when passed to a
function that takes a reference typed parameter.
* A case where incorrect code could be generated for compile-time-constant
initializers has been fixed (#234).
=== v1.2.1 === (6 April 2012)
This release contains only minor new functionality; it mostly consists of many
small bugfixes and improvements to error handling and error reporting.
The new functionality that is present is:
* Significantly more efficient versions of the float / half conversion
routines are now available in the standard library, thanks to Fabian
Giesen.
* The last member of a struct can now be a zero-length array; this allows
the trick of dynamically allocating enough storage for the struct and
some number of array elements at the end of it.
Significant bugs fixed include:
* Issue #205: When a target ISA isn't specified, use the host system's
capabilities to choose a target for which it will be able to run the
generated code.
* Issues #215 and #217: Don't allocate storage for global variables that
are declared "extern".
* Issue #197: Allow NULL as a default argument value in a function
declaration.
* Issue #223: Fix bugs where taking the address of a function wouldn't work
as expected.
* Issue #224: When there are overloaded variants of a function that take
both reference and const reference parameters, give the non-const
reference preference when matching values of that underlying type.
* Issue #225: An error is issued when a varying lvalue is assigned to a
reference type (rather than crashing).
* Issue #193: Permit conversions from array types to void *, not just the
pointer type of the underlying array element.
* Issue #199: Still evaluate expressions that are cast to (void).
The documentation has also been improved, with FAQs added to clarify some
aspects of the ispc pointer model.
=== v1.2.0 === (20 March 2012)
This is a major new release of ispc, with a number of significant
improvements to functionality, performance, and compiler robustness. It
does, however, include three small changes to language syntax and semantics
that may require changes to existing programs:
* Syntax for the "launch" keyword has been cleaned up; it's now no longer
necessary to bracket the launched function call with angle brackets.
(In other words, now use "launch foo();", rather than "launch < foo() >;".)
* When using pointers, the pointed-to data type is now "uniform" by
default. Use the varying keyword to specify varying pointed-to types when
needed. (i.e. "float *ptr" is a varying pointer to uniform float data,
whereas previously it was a varying pointer to varying float values.)
Use "varying float *" to specify a varying pointer to varying float data,
and so forth.
* The details of "uniform" and "varying" and how they interact with struct
types have been cleaned up. Now, when a struct type is declared, if the
struct elements don't have explicit "uniform" or "varying" qualifiers,
they are said to have "unbound" variability. When a struct type is
instantiated, any unbound variability elements inherit the variability of
the parent struct type. See http://ispc.github.com/ispc.html#struct-types
for more details.
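For example, here is a small sketch of how unbound variability resolves under
these rules (the type and member names are illustrative):

    struct Params {
        float scale;        // no qualifier: unbound variability
        uniform int count;  // explicitly uniform
    };
    uniform Params up;   // up.scale becomes uniform; up.count is uniform
    varying Params vp;   // vp.scale becomes varying; vp.count stays uniform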
ispc has a new language feature that makes it much easier to use the
efficient "(array of) structure of arrays" (AoSoA, or SoA) memory layout of
data. A new "soa<n>" qualifier can be applied to structure types to
specify an n-wide SoA version of the corresponding type. Array indexing
and pointer operations with arrays of SoA types automatically handle the
two-stage indexing calculation to access the data. See
http://ispc.github.com/ispc.html#structure-of-array-types for more details.
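For example, a brief sketch of the qualifier in use (the declarations are
illustrative):

    struct Point { float x, y, z; };
    soa<4> Point pts[64];            // stored as groups of 4 x's, 4 y's, 4 z's
    float v = pts[programIndex].x;   // two-level indexing generated automatically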
For more efficient access of data that is still in "array of structures"
(AoS) format, ispc has a new "memory coalescing" optimization that
automatically detects series of strided loads and/or gathers that can be
transformed into a more efficient set of vector loads and shuffles. A
diagnostic is emitted when this optimization is successfully applied.
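Code of roughly the following shape is the kind of access pattern the
coalescing optimization targets (a sketch only, not a guarantee that the
transformation fires in every such case):

    struct Color { float r, g, b; };
    void loadPixel(uniform Color img[]) {
        // Adjacent strided loads from an "array of structures" layout...
        float r = img[programIndex].r;
        float g = img[programIndex].g;
        float b = img[programIndex].b;
        // ...may be replaced by vector loads plus shuffles, with a
        // diagnostic reported when the optimization is applied.
    }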
Smaller changes in this release:
* The standard library now provides memcpy(), memmove() and memset()
functions, as well as single-precision asin() and acos() functions.
* -I can now be specified on the command-line to specify a search path for
#include files.
* A number of improvements have been made to error reporting from the
parser, and a number of cases where malformed programs could cause the
compiler to crash have been fixed.
* A number of small improvements to the quality and performance of generated
code have been made, including finding more cases where 32-bit addressing
calculations can be safely done on 64-bit systems and generating better
code for initializer expressions.
=== v1.1.4 === (4 February 2012)
There are two major bugfixes for Windows in this release. First, a number


@@ -1,11 +1,14 @@
#!/bin/bash
for i in ispc perfguide faq; do
rst2html.py --template=template.txt --link-stylesheet \
rst2html --template=template.txt --link-stylesheet \
--stylesheet-path=css/style.css $i.rst > $i.html
done
rst2html.py --template=template-perf.txt --link-stylesheet \
rst2html --template=template-news.txt --link-stylesheet \
--stylesheet-path=css/style.css news.rst > news.html
rst2html --template=template-perf.txt --link-stylesheet \
--stylesheet-path=css/style.css perf.rst > perf.html
#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex


@@ -14,11 +14,24 @@ distribution.
+ `Why are there multiple versions of exported ispc functions in the assembly output?`_
+ `How can I more easily see gathers and scatters in generated assembly?`_
* Running The Compiler
+ `Why is it required to use one of the "generic" targets with C++ output?`_
+ `Why won't the compiler generate an object file or assembly output with the "generic" targets?`_
* Language Details
+ `What is the difference between "int *foo" and "int foo[]"?`_
+ `Why are pointed-to types "uniform" by default?`_
+ `What am I getting an error about assigning a varying lvalue to a reference type?`_
* Interoperability
+ `How can I supply an initial execution mask in the call from the application?`_
+ `How can I generate a single binary executable with support for multiple instruction sets?`_
+ `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
+ `Is it possible to inline ispc functions in C/C++ code?`_
+ `Why is it illegal to pass "varying" values from C/C++ to ispc functions?`_
* Programming Techniques
@@ -26,6 +39,8 @@ distribution.
+ `How can a gang of program instances generate variable amounts of output efficiently?`_
+ `Is it possible to use ispc for explicit vector programming?`_
+ `How can I debug my ispc programs using Valgrind?`_
+ `foreach statements generate more complex assembly than I'd expect; what's going on?`_
+ `How do I launch an individual task for each active program instance?`_
Understanding ispc's Output
===========================
@@ -212,6 +227,174 @@ easier to understand:
jmp ___pseudo_scatter_base_offsets32_32 ## TAILCALL
Running The Compiler
====================
Why is it required to use one of the "generic" targets with C++ output?
-----------------------------------------------------------------------
The C++ output option transforms the provided ``ispc`` program source into
C++ code where each basic operation in the program (addition, comparison,
etc.) is represented as a function call to an as-yet-undefined function,
chaining the results of these calls together to perform the required
computations. It is then expected that the user will provide the
implementation of these functions via a header file with ``inline``
functions defined for each of these functions and then use a C++ compiler
to generate a final object file. (Examples of these headers include
``examples/intrinsics/sse4.h`` and ``examples/intrinsics/knc.h`` in the
``ispc`` distribution.)
If a target other than one of the "generic" ones is used with C++ output,
then the compiler will transform certain operations into particular code
sequences that may not be desired for the actual final target; for example,
SSE targets that don't have hardware "gather" instructions will transform a
gather into a sequence of scalar load instructions. When this in turn is
transformed to C++ code, the fact that the loads were originally a gather
is lost, and the header file of function definitions wouldn't have a chance
to map the "gather" to a target-specific operation, as the ``knc.h`` header
does, for example. Thus, the "generic" targets exist to provide basic
targets of various vector widths, without imposing any limitations on the
final target's capabilities.
Why won't the compiler generate an object file or assembly output with the "generic" targets?
---------------------------------------------------------------------------------------------
As described in the above FAQ entry, when compiling to the "generic"
targets, ``ispc`` generates vector code for the source program that
transforms every basic operation in the program (addition, comparison,
etc.) into a separate function call.
While there is no fundamental reason that the compiler couldn't generate
target-specific object code with a function call to an undefined function
for each primitive operation, doing so wouldn't actually be useful in
practice--providing definitions of these functions in a separate object
file and actually performing function calls for each of them (versus
turning them into inline function calls) would be a highly inefficient way
to run the program.
Therefore, in the interests of encouraging the use of the system,
these types of output are disallowed.
Language Details
================
What is the difference between "int \*foo" and "int foo[]"?
-----------------------------------------------------------
In C and C++, declaring a function to take a parameter ``int *foo`` and
``int foo[]`` results in the same type for the parameter. Both are
pointers to integers. In ``ispc``, these are different types. The first
one is a varying pointer to a uniform integer value in memory, while the
second results in a uniform pointer to the start of an array of varying
integer values in memory.
To understand why the first is a varying pointer to a uniform integer,
first recall that types without explicit rate qualifiers (``uniform``,
``varying``, or ``soa<>``) are ``varying`` by default. Second, recall from
the `discussion of pointer types in the ispc User's Guide`_ that pointed-to
types without rate qualifiers are ``uniform`` by default. (This second
rule is discussed further below, in `Why are pointed-to types "uniform" by
default?`_.) The type of ``int *foo`` follows from these.
.. _discussion of pointer types in the ispc User's Guide: ispc.html#pointer-types
Conversely, in a function body, ``int foo[10]`` represents a declaration of
a 10-element array of varying ``int`` values. In that we'd certainly like
to be able to pass such an array to a function that takes an ``int []``
parameter, the natural type for an ``int []`` parameter is a uniform
pointer to varying integer values.
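As a small sketch of the resulting difference (the function and variable
names here are illustrative)::

    void f(int *p, int a[]) {
        // p: varying pointer to uniform int -- each program instance may
        // hold a different address, so this load is in general a gather.
        int x = *p;
        // a: uniform pointer to varying int -- the gang shares one base
        // address, and a[0] is a full gang-wide (varying) value that can
        // be read with a vector load.
        int y = a[0];
    }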
In terms of compatibility with C/C++, it's unfortunate that this
distinction exists, though any other set of rules seems to introduce more
awkwardness than this one. (Though we're interested to hear ideas to
improve these rules!).
Why are pointed-to types "uniform" by default?
----------------------------------------------
In ``ispc``, types without rate qualifiers are "varying" by default, but
types pointed to by pointers without rate qualifiers are "uniform" by
default. Why this difference?
::
int foo; // no rate qualifier, "varying int".
uniform int *foo; // pointer type has no rate qualifier, pointed-to does.
// "varying pointer to uniform int".
int *foo; // neither pointer type nor pointed-to type ("int") have
// rate qualifiers. Pointer type is varying by default,
// pointed-to is uniform. "varying pointer to uniform int".
varying int *foo; // varying pointer to varying int
The first rule, having types without rate qualifiers be varying by default,
is a default that keeps the number of "uniform" or "varying" qualifiers in
``ispc`` programs low. Most ``ispc`` programs use mostly "varying"
variables, so this rule allows most variables to be declared without also
requiring rate qualifiers.
On a related note, this rule allows many C/C++ functions to be used to
define equivalent functions in the SPMD execution model that ``ispc``
provides with little or no modification:
::
// scalar add in C/C++, SPMD/vector add in ispc
int add(int a, int b) { return a + b; }
This motivation also explains why ``uniform int *foo`` represents a varying
pointer; having pointers be varying by default if they don't have rate
qualifiers similarly helps with porting code from C/C++ to ``ispc``.
The trickier issue is why pointed-to types are "uniform" by default. In our
experience, data in memory that is accessed via pointers is most often
uniform; this generally includes all data that has been allocated and
initialized by the C/C++ application code. In practice, "varying" types are
more generally (but not exclusively) used for local data in ``ispc``
functions. Thus, making the pointed-to type uniform by default leads to
more concise code for the most common cases.
What am I getting an error about assigning a varying lvalue to a reference type?
--------------------------------------------------------------------------------
Given code like the following:
::
uniform float a[...];
int index = ...;
float &r = a[index];
``ispc`` issues the error "Initializer for reference-type variable "r" must
have a uniform lvalue type.". The underlying issue stems from how
references are represented in the code generated by ``ispc``. Recall that
``ispc`` supports both uniform and varying pointer types--a uniform pointer
points to the same location in memory for all program instances in the
gang, while a varying pointer allows each program instance to have its own
pointer value.
References are represented as a pointer in the code generated by ``ispc``,
though this is generally opaque to the user; in ``ispc``, they are
specifically uniform pointers. This design decision was made so that given
code like this:
::
extern void func(float &val);
float foo = ...;
func(foo);
Then the reference would be handled efficiently as a single pointer, rather
than unnecessarily being turned into a gang's worth of pointers.
However, an implication of this decision is that it's not possible for
references to refer to completely different things for each of the program
instances. (And hence the error that is issued). In cases where a unique
per-program-instance pointer is needed, a varying pointer should be used
instead of a reference.
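A minimal sketch of that workaround, following the example above::

    uniform float a[...];
    int index = ...;
    // Instead of "float &r = a[index];" (which triggers the error), take a
    // varying pointer, so each program instance refers to its own element:
    float *r = &a[index];
    *r += 1;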
Interoperability
================
@@ -346,6 +529,92 @@ In a similar fashion, it's possible to find out at run-time the value of
export uniform int width() { return programCount; }
Is it possible to inline ispc functions in C/C++ code?
------------------------------------------------------
If you're willing to use the ``clang`` C/C++ compiler that's part of the
LLVM tool suite, then it is possible to inline ``ispc`` code with C/C++
(and conversely, to inline C/C++ calls in ``ispc``). Doing so can provide
performance advantages when calling out to short functions written in the
"other" language. Note that you don't need to use ``clang`` to compile all
of your C/C++ code, but only for the files where you want to be able to
inline. In order to do this, you must have a full installation of LLVM
version 3.0 or later, including the ``clang`` compiler.
The basic approach is to have the various compilers emit LLVM intermediate
representation (IR) code and to then use tools from LLVM to link together
the IR from the compilers and then re-optimize it, which gives the LLVM
optimizer the opportunity to do additional inlining and cross-function
optimizations. If you have source files ``foo.ispc`` and ``foo.cpp``,
first emit LLVM IR:
::
ispc --emit-llvm -o foo_ispc.bc foo.ispc
clang -O2 -c -emit-llvm -o foo_cpp.bc foo.cpp
Next, link the two IR files into a single file and run the LLVM optimizer
on the result:
::
llvm-link foo_ispc.bc foo_cpp.bc -o - | opt -O3 -o foo_opt.bc
And finally, generate a native object file:
::
llc -filetype=obj foo_opt.bc -o foo.o
This file can in turn be linked in with the rest of your object files when
linking your application.
(Note that if you're using the AVX instruction set, you must provide the
``-mattr=+avx`` flag to ``llc``.)
Why is it illegal to pass "varying" values from C/C++ to ispc functions?
------------------------------------------------------------------------
If any of the types in the parameter list to an exported function is
"varying" (including recursively, and members of structure types, etc.),
then ``ispc`` will issue an error and refuse to compile the function:
::
% echo "export int add(int x) { return ++x; }" | ispc
<stdin>:1:12: Error: Illegal to return a "varying" type from exported function "foo"
<stdin>:1:20: Error: Varying parameter "x" is illegal in an exported function.
While there's no fundamental reason why this isn't possible, recall the
definition of "varying" variables: they have one value for each program
instance in the gang. As such, the number of values and amount of storage
required to represent a varying variable depends on the gang size
(i.e. ``programCount``), which can have different values depending on the
compilation target.
``ispc`` therefore prohibits passing "varying" values between the
application and the ``ispc`` program; this prevents the application-side
code from depending on a particular gang size and encourages portability
to different gang sizes. (A generally desirable programming practice.)
For cases where the size of data is actually fixed from the application
side, the value can be passed via a pointer to a short ``uniform`` array,
as follows:
::
export void add4(uniform int ptr[4]) {
foreach (i = 0 ... 4)
ptr[i]++;
}
On the 4-wide SSE instruction set, this compiles to a single vector add
instruction (and associated move instructions), while it still also
efficiently computes the correct result on 8-wide AVX targets.
Programming Techniques
======================
@@ -480,3 +749,131 @@ you can use ``--target=sse4`` when compiling to run with ``valgrind``.
Note that ``valgrind`` does not yet support programs that use the AVX
instruction set.
foreach statements generate more complex assembly than I'd expect; what's going on?
-----------------------------------------------------------------------------------
Given a simple ``foreach`` loop like the following:
::
void foo(uniform float a[], uniform int count) {
foreach (i = 0 ... count)
a[i] *= 2;
}
the ``ispc`` compiler generates approximately 40 instructions--why isn't
the generated code simpler?
There are two main components to the code: one handles
``programCount``-sized chunks of elements of the array, and the other
handles any excess elements at the end of the array that don't completely
fill a gang. The code for the main loop is essentially what one would
expect: a vector of values is loaded from the array, the multiply is done,
and the result is stored.
::
LBB0_2: ## %foreach_full_body
movslq %edx, %rdx
vmovups (%rdi,%rdx), %ymm1
vmulps %ymm0, %ymm1, %ymm1
vmovups %ymm1, (%rdi,%rdx)
addl $32, %edx
addl $8, %eax
cmpl %ecx, %eax
jl LBB0_2
Then, there is a sequence of instructions that handles any additional
elements at the end of the array. (These instructions don't execute if
there aren't any left-over values to process, but they do lengthen the
amount of generated code.)
::
## BB#4: ## %partial_inner_only
vmovd %eax, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vpermilps $0, %ymm0, %ymm0 ## ymm0 = ymm0[0,0,0,0,4,4,4,4]
vextractf128 $1, %ymm0, %xmm3
vmovd %esi, %xmm2
vmovaps LCPI0_1(%rip), %ymm1
vextractf128 $1, %ymm1, %xmm4
vpaddd %xmm4, %xmm3, %xmm3
# ....
vmulps LCPI0_0(%rip), %ymm1, %ymm1
vmaskmovps %ymm1, %ymm0, (%rdi,%rax)
If you know that the number of elements to be processed will always be an
exact multiple of 8, 16, etc., then adding a simple assignment to
``count`` like the one below gives the compiler enough information to be
able to eliminate the code for the additional array elements.
::
void foo(uniform float a[], uniform int count) {
// This assignment doesn't change the value of count
// if it's a multiple of 16, but it gives the compiler
// insight into this fact, allowing for simpler code to
// be generated for the foreach loop.
count = (count & ~(16-1));
foreach (i = 0 ... count)
a[i] *= 2;
}
With this new version of ``foo()``, only the code for the first loop above
is generated.
How do I launch an individual task for each active program instance?
--------------------------------------------------------------------
Recall from the `discussion of "launch" in the ispc User's Guide`_ that a
``launch`` statement launches a single task corresponding to a single gang
of executing program instances, where the indices of the active program
instances are the same as were active when the ``launch`` statement
executed.
.. _discussion of "launch" in the ispc User's Guide: ispc.html#task-parallelism-launch-and-sync-statements
In some situations, it's desirable to be able to launch an individual task
for each executing program instance. For example, we might be performing
an iterative computation where a subset of the program instances determine
that an item they are responsible for requires additional processing.
::
bool itemNeedsMoreProcessing(int);
int itemNum = ...;
if (itemNeedsMoreProcessing(itemNum)) {
// do additional work
}
For performance reasons, it may be desirable to apply an entire gang's
worth of computation to each item that needs additional processing;
there may be available parallelism in this computation such that we'd like
to process each of the items with SPMD computation.
In this case, the ``foreach_active`` and ``unmasked`` constructs can be
applied together to accomplish this goal.
::
// do additional work
task void doWork(uniform int index);
foreach_active (index) {
unmasked {
launch doWork(extract(itemNum, index));
}
}
Recall that the body of the ``foreach_active`` loop runs once for each
active program instance, with each active program instance's
``programIndex`` value available in ``index`` in the above. In the loop,
we can re-establish an "all on" execution mask, enabling execution in all
of the program instances in the gang, such that execution in ``doWork()``
starts with all instances running. (Alternatively, the ``unmasked`` block
could be in the definition of ``doWork()``.)

docs/news.rst Normal file

@@ -0,0 +1,71 @@
=========
ispc News
=========
ispc 1.3.0 is Released
----------------------
A major new version of ``ispc`` has been released. In addition to a number
of new language features, this release notably features initial support for
compiling to the Intel Xeon Phi (Many Integrated Core) architecture.
ispc 1.2.1 is Released
----------------------
This is a bugfix release, fixing approximately 20 bugs in the system and
improving error handling and error reporting. New functionality includes
very efficient float/half conversion routines thanks to Fabian
Giesen. See the `1.2.1 release notes`_ for details.
.. _1.2.1 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
ispc 1.2.0 is Released
-----------------------
A new major release was posted on March 20, 2012. This release includes
significant new functionality for cleanly handling "structure of arrays"
(SoA) data layout and a new model for how uniform and varying are handled
with structure types.
Paper on ispc To Appear in InPar 2012
-------------------------------------
A technical paper on ``ispc``, `ispc: A SPMD Compiler for High-Performance
CPU Programming`_, by Matt Pharr and William R. Mark, has been accepted to
the `InPar 2012`_ conference. This paper describes a number of the design
features and key characteristics of the ``ispc`` implementation.
(© 2012 IEEE. Personal use of this material is permitted. Permission from
IEEE must be obtained for all other uses, in any current or future media,
including reprinting/republishing this material for advertising or
promotional purposes, creating new collective works, for resale or
redistribution to servers or lists, or reuse of any copyrighted component
of this work in other works.).
.. _ispc\: A SPMD Compiler for High-Performance CPU Programming: https://github.com/downloads/ispc/ispc/ispc_inpar_2012.pdf
.. _InPar 2012: http://innovativeparallel.org/
ispc 1.1.4 is Released
----------------------
On February 4, 2012, the 1.1.4 release of ``ispc`` was posted; new features
include ``new`` and ``delete`` for dynamic memory allocation in ``ispc``
programs, "local" atomic operations in the standard library, and a new
scalar compilation target. See the `1.1.4 release notes`_ for details.
.. _1.1.4 release notes: https://github.com/ispc/ispc/tree/master/docs/ReleaseNotes.txt
ispc 1.1.3 is Released
----------------------
With this release, the language now supports "switch" statements, with the same semantics and syntax as in C.
This release includes fixes for two important performance related issues:
the quality of code generated for "foreach" statements has been
substantially improved, and a performance regression with code for "gathers"
that was introduced in v1.1.2 has been fixed in this release.
Thanks to Jean-Luc Duprat for a number of patches that improve support for
building on various platforms, and to Pierre-Antoine Lacaze for patches so
that ispc builds under MinGW.


@@ -13,6 +13,7 @@ the most out of ``ispc`` in practice.
+ `Improving Control Flow Coherence With "foreach_tiled"`_
+ `Using Coherent Control Flow Constructs`_
+ `Use "uniform" Whenever Appropriate`_
+ `Use "Structure of Arrays" Layout When Possible`_
* `Tips and Techniques`_
@@ -20,6 +21,7 @@ the most out of ``ispc`` in practice.
+ `Avoid 64-bit Addressing Calculations When Possible`_
+ `Avoid Computation With 8 and 16-bit Integer Types`_
+ `Implementing Reductions Efficiently`_
+ `Using "foreach_active" Effectively`_
+ `Using Low-level Vector Tricks`_
+ `The "Fast math" Option`_
+ `"inline" Aggressively`_
@@ -247,6 +249,76 @@ but it's always best to provide the compiler with as much help as possible
to understand the actual form of your computation.
Use "Structure of Arrays" Layout When Possible
----------------------------------------------
In general, memory access performance (for both reads and writes) is best
when the running program instances access a contiguous region of memory; in
this case efficient vector load and store instructions can often be used
rather than gathers and scatters. As an example of this issue, consider an
array of a simple point datatype laid out and accessed in conventional
"array of structures" (AOS) layout:
::
struct Point { float x, y, z; };
uniform Point pts[...];
float v = pts[programIndex].x;
In the above code, the access to ``pts[programIndex].x`` accesses
non-sequential memory locations, due to the ``y`` and ``z`` values between
the desired ``x`` values in memory. A "gather" is required to get the
value of ``v``, with a corresponding decrease in performance.
If ``Point`` was defined as a "structure of arrays" (SOA) type, the access
can be much more efficient:
::
struct Point8 { float x[8], y[8], z[8]; };
uniform Point8 pts8[...];
int majorIndex = programIndex / 8;
int minorIndex = programIndex % 8;
float v = pts8[majorIndex].x[minorIndex];
In this case, each ``Point8`` has 8 ``x`` values contiguous in memory
before 8 ``y`` values and then 8 ``z`` values. If the gang size is 8 or
less, the access for ``v`` will have the same value of ``majorIndex`` for
all program instances and will access consecutive elements of the ``x[8]``
array with a vector load. (For larger gang sizes, two 8-wide vector loads
would be issued, which is also quite efficient.)
However, the syntax in the above code is messy; accessing SOA data in this
fashion is much less elegant than the corresponding code for accessing the
data with AOS layout. The ``soa`` qualifier in ``ispc`` can be used to
cause the corresponding transformation to be made to the ``Point`` type,
while preserving the clean syntax for data access that comes with AOS
layout:
::
soa<8> Point pts[...];
float v = pts[programIndex].x;
Thanks to having SOA layout as a first-class concept in the language's type
system, it's easy to write functions that convert data between the
layouts. For example, the ``aos_to_soa`` function below converts ``count``
elements of the given ``Point`` type from AOS to 8-wide SOA layout. (It
assumes that the caller has pre-allocated sufficient space in the
``pts_soa`` output array.)
::
void aos_to_soa(uniform Point pts_aos[], uniform int count,
soa<8> pts_soa[]) {
foreach (i = 0 ... count)
pts_soa[i] = pts_aos[i];
}
Analogously, a function could be written to convert back from SOA to AOS if
needed.
Tips and Techniques
===================
@@ -339,6 +411,12 @@ based on the index, it can be worth doing. See the example
``examples/volume_rendering`` in the ``ispc`` distribution for the use of
this technique in an instance where it is beneficial to performance.
Understanding Memory Read Coalescing
------------------------------------
XXXX todo
Avoid 64-bit Addressing Calculations When Possible
--------------------------------------------------
@@ -433,6 +511,43 @@ values--very efficient code in the end.
return reduce_add(sum);
}
Using "foreach_active" Effectively
----------------------------------
For high-performance code, it can be worth giving the compiler extra help inside ``foreach_active`` loops.
For example, consider this segment of code, from the introduction of
``foreach_active`` in the ispc User's Guide:
::
uniform float array[...] = { ... };
int index = ...;
foreach_active (i) {
++array[index];
}
Here, ``index`` was assumed to possibly have the same value for multiple
program instances, so the updates to ``array[index]`` are serialized by the
``foreach_active`` statement in order to not have undefined results when
``index`` values do collide.
The code generated by the compiler can be improved in this case by making
it clear that only a single element of the array is accessed by
``array[index]``, and thus that a general gather or scatter isn't required.
Specifically, by using the ``extract()`` function from the standard library
to extract the current program instance's value of ``index`` into a
``uniform`` variable and then using that to index into ``array``, as below,
more efficient code is generated.
::
foreach_active (instanceNum) {
uniform int unifIndex = extract(index, instanceNum);
++array[unifIndex];
}
Using Low-level Vector Tricks
-----------------------------
@@ -547,7 +662,7 @@ gathers happen.)
extern "C" {
void ISPCInstrument(const char *fn, const char *note,
int line, int mask);
int line, uint64_t mask);
}
This function is passed the file name of the ``ispc`` file running, a short
@@ -560,7 +675,7 @@ as follows:
::
ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
ISPCInstrument("foo.ispc", "function entry", 55, 0xfull);
This call indicates that the currently executing program has just
entered the function defined at line 55 of the file ``foo.ispc``, with a

docs/template-news.txt Normal file

@@ -0,0 +1,66 @@
%(head_prefix)s
%(head)s
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-1486404-4']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
%(stylesheet)s
%(body_prefix)s
<div id="wrap">
<div id="wrap2">
<div id="header">
<h1 id="logo">Intel SPMD Program Compiler</h1>
<div id="slogan">An open-source compiler for high-performance SIMD programming on
the CPU</div>
</div>
<div id="nav">
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li id="selected"><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li><a href="documentation.html">Documentation</a></li>
<li><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
<div id="content-wrap">
<div id="sidebar">
<div class="widgetspace">
<h1>Resources</h1>
<ul class="menu">
<li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
<li><a href="http://groups.google.com/group/ispc-users/">ispc
users mailing list</a></li>
<li><a href="http://groups.google.com/group/ispc-dev/">ispc
developers mailing list</a></li>
<li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
<li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
<li><a href="doxygen/index.html">Doxygen</a></li>
</ul>
</div>
</div>
%(body_pre_docinfo)s
%(docinfo)s
<div id="content">
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>
</div>
</div>
%(body_suffix)s


@@ -26,10 +26,12 @@
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li><a href="documentation.html">Documentation</a></li>
<li id="selected"><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
@@ -55,7 +57,7 @@
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>


@@ -26,10 +26,12 @@
<div id="nbar">
<ul>
<li><a href="index.html">Overview</a></li>
<li><a href="news.html">News</a></li>
<li><a href="features.html">Features</a></li>
<li><a href="downloads.html">Downloads</a></li>
<li id="selected"><a href="documentation.html">Documentation</a></li>
<li><a href="perf.html">Performance</a></li>
<li><a href="contrib.html">Contributors</a></li>
</ul>
</div>
</div>
@@ -55,7 +57,7 @@
%(body)s
</div>
<div class="clearfix"></div>
<div id="footer"> &copy; 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<div id="footer"> &copy; 2011-2012 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue by: <a href="http://www.themebin.com/">ThemeBin</a>
<!-- Please Do Not remove this link, thank u -->
</div>
</div>


@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER = 1.1.4
PROJECT_NUMBER = 1.3.0
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.


@@ -39,9 +39,6 @@ example implementation of this function that counts the number of times the
callback is made and records some statistics about control flow coherence
is provided in the instrument.cpp file.
*** Note: on Linux, this example currently hits an assertion in LLVM during
*** compilation
Deferred
========
@@ -76,6 +73,14 @@ This directory includes three implementations of the algorithm:
light culling and shading.
GMRES
=====
An implementation of the generalized minimal residual method for solving
sparse matrix equations.
(http://en.wikipedia.org/wiki/Generalized_minimal_residual_method)
Mandelbrot
==========
@@ -110,6 +115,13 @@ This program implements both the Black-Scholes and Binomial options pricing
models in both ispc and regular serial C++ code.
Perfbench
=========
This runs a number of microbenchmarks to measure system performance and
code generation quality.
RT
==


@@ -50,7 +50,6 @@ struct Isect {
struct Sphere {
vec center;
float radius;
};
struct Plane {
@@ -83,7 +82,7 @@ static inline void vnormalize(vec &v) {
static void
ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {
float d = -dot(plane.p, plane.n);
float v = dot(ray.dir, plane.n);
@@ -103,7 +102,7 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
static inline void
ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {
vec rs = ray.org - sphere.center;
float B = dot(rs, ray.dir);
@@ -148,7 +147,7 @@ orthoBasis(vec basis[3], vec n) {
static float
ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],
RNGState &rngstate) {
float eps = 0.0001f;
vec p, n;
@@ -204,14 +203,14 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
uniform int h, uniform int nsubsamples,
uniform float image[]) {
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static Sphere spheres[3] = {
static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static uniform Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
{ { -0.5f, 0.0f, -3.0f }, 0.5f },
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;
seed_rng(&rngstate, y0);
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
float invSamples = 1.f / nsubsamples;
foreach_tiled(y = y0 ... y1, x = 0 ... w,
@@ -269,5 +268,5 @@ static void task ao_task(uniform int width, uniform int height,
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
launch[h] < ao_task(w, h, nsubsamples, image) >;
launch[h] ao_task(w, h, nsubsamples, image);
}


@@ -211,7 +211,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
{ { 1.0f, 0.0f, -2.2f }, 0.5f } };
RNGState rngstate;
seed_rng(&rngstate, y0);
seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));
// Compute the mapping between the 'programCount'-wide program
// instances running in parallel and samples in the image.
@@ -329,5 +329,5 @@ static void task ao_task(uniform int width, uniform int height,
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
launch[h] < ao_task(w, h, nsubsamples, image) >;
launch[h] ao_task(w, h, nsubsamples, image);
}


@@ -1,16 +1,22 @@
TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=tasksys.o
TASK_OBJ=objs/tasksys.o
CXX=g++
CXXFLAGS=-Iobjs/ -O2 -m64
CC=gcc
CCFLAGS=-Iobjs/ -O2 -m64
LIBS=-lm $(TASK_LIB) -lstdc++
ISPC=ispc -O2 --arch=x86-64 $(ISPC_FLAGS)
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ))
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))
CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))
OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)
default: $(EXAMPLE)
@@ -26,12 +32,15 @@ objs/%.cpp objs/%.o objs/%.h: dirs
clean:
/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16
$(EXAMPLE): $(CPP_OBJS) $(ISPC_OBJS)
$(EXAMPLE): $(OBJS)
$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
objs/%.o: %.cpp dirs $(ISPC_HEADER)
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: %.c dirs $(ISPC_HEADER)
$(CC) $< $(CCFLAGS) -c -o $@
objs/%.o: ../%.cpp dirs
$(CXX) $< $(CXXFLAGS) -c -o $@


@@ -204,6 +204,7 @@ void WriteFrame(const char *filename, const InputData *input,
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
input->header.framebufferHeight);
fwrite(framebufferAOS, imageBytes, 1, out);
fclose(out);
lAlignedFree(framebufferAOS);
}


@@ -35,35 +35,35 @@
struct InputDataArrays
{
uniform float * uniform zBuffer;
uniform unsigned int16 * uniform normalEncoded_x; // half float
uniform unsigned int16 * uniform normalEncoded_y; // half float
uniform unsigned int16 * uniform specularAmount; // half float
uniform unsigned int16 * uniform specularPower; // half float
uniform unsigned int8 * uniform albedo_x; // unorm8
uniform unsigned int8 * uniform albedo_y; // unorm8
uniform unsigned int8 * uniform albedo_z; // unorm8
uniform float * uniform lightPositionView_x;
uniform float * uniform lightPositionView_y;
uniform float * uniform lightPositionView_z;
uniform float * uniform lightAttenuationBegin;
uniform float * uniform lightColor_x;
uniform float * uniform lightColor_y;
uniform float * uniform lightColor_z;
uniform float * uniform lightAttenuationEnd;
float *zBuffer;
unsigned int16 *normalEncoded_x; // half float
unsigned int16 *normalEncoded_y; // half float
unsigned int16 *specularAmount; // half float
unsigned int16 *specularPower; // half float
unsigned int8 *albedo_x; // unorm8
unsigned int8 *albedo_y; // unorm8
unsigned int8 *albedo_z; // unorm8
float *lightPositionView_x;
float *lightPositionView_y;
float *lightPositionView_z;
float *lightAttenuationBegin;
float *lightColor_x;
float *lightColor_y;
float *lightColor_z;
float *lightAttenuationEnd;
};
struct InputHeader
{
uniform float cameraProj[4][4];
uniform float cameraNear;
uniform float cameraFar;
float cameraProj[4][4];
float cameraNear;
float cameraFar;
uniform int32 framebufferWidth;
uniform int32 framebufferHeight;
uniform int32 numLights;
uniform int32 inputDataChunkSize;
uniform int32 inputDataArrayOffsets[idaNum];
int32 framebufferWidth;
int32 framebufferHeight;
int32 numLights;
int32 inputDataChunkSize;
int32 inputDataArrayOffsets[idaNum];
};
@@ -327,8 +327,8 @@ ShadeTile(
// Reconstruct normal from G-buffer
float surface_normal_x, surface_normal_y, surface_normal_z;
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
float normal_x = half_to_float(inputData.normalEncoded_x[gBufferOffset]);
float normal_y = half_to_float(inputData.normalEncoded_y[gBufferOffset]);
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
float m = sqrt(4.0f * f - 1.0f);
@@ -339,9 +339,9 @@ ShadeTile(
// Load other G-buffer parameters
float surface_specularAmount =
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
half_to_float(inputData.specularAmount[gBufferOffset]);
float surface_specularPower =
half_to_float_fast(inputData.specularPower[gBufferOffset]);
half_to_float(inputData.specularPower[gBufferOffset]);
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
@@ -514,9 +514,9 @@ RenderStatic(uniform InputHeader &inputHeader,
// Launch a task to render each tile, each of which is MIN_TILE_WIDTH
// by MIN_TILE_HEIGHT pixels.
launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
inputHeader, inputData, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b) >;
launch[num_groups] RenderTile(num_groups_x, num_groups_y,
inputHeader, inputData, visualizeLightCount,
framebuffer_r, framebuffer_g, framebuffer_b);
}
@@ -575,8 +575,6 @@ SplitTileMinMax(
uniform float light_positionView_z_array[],
uniform float light_attenuationEnd_array[],
// Outputs
// TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the
// indexing math ourselves
uniform int32 subtileIndices[],
uniform int32 subtileIndicesPitch,
uniform int32 subtileNumLights[]


@@ -87,7 +87,7 @@ int main(int argc, char** argv) {
framebuffer.clear();
reset_and_start_timer();
for (int j = 0; j < nframes; ++j)
ispc::RenderStatic(&input->header, &input->arrays,
ispc::RenderStatic(input->header, input->arrays,
VISUALIZE_LIGHT_COUNT,
framebuffer.r, framebuffer.g, framebuffer.b);
double mcycles = get_elapsed_mcycles() / nframes;


@@ -23,6 +23,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "perfbench", "perfbench\perfbench.vcxproj", "{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
@@ -119,6 +121,14 @@ Global
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.ActiveCfg = Debug|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|Win32.Build.0 = Debug|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.ActiveCfg = Debug|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Debug|x64.Build.0 = Debug|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.ActiveCfg = Release|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|Win32.Build.0 = Release|Win32
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.ActiveCfg = Release|x64
{D923BB7E-A7C8-4850-8FCF-0EB9CE35B4E8}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

examples/gmres/Makefile Normal file

@@ -0,0 +1,8 @@
EXAMPLE=gmres
CPP_SRC=algorithm.cpp main.cpp matrix.cpp
CC_SRC=mmio.c
ISPC_SRC=matrix.ispc
ISPC_TARGETS=sse2,sse4-x2,avx-x2
include ../common.mk


@@ -0,0 +1,231 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*===========================================================================*\
|* Includes
\*===========================================================================*/
#include "algorithm.h"
#include "stdio.h"
#include "debug.h"
/*===========================================================================*\
|* GMRES
\*===========================================================================*/
/* upper_triangular_right_solve:
* ----------------------------
* Given upper triangular matrix R and rhs vector b, solve for
* x. This "solve" ignores the rows, columns of R that are greater than the
* dimensions of x.
*/
void upper_triangular_right_solve (const DenseMatrix &R, const Vector &b, Vector &x)
{
// Dimensionality check
ASSERT(R.rows() >= b.size());
ASSERT(R.cols() >= x.size());
ASSERT(b.size() >= x.size());
int max_row = x.size() - 1;
// first solve step:
x[max_row] = b[max_row] / R(max_row, max_row);
for (int row = max_row - 1; row >= 0; row--) {
double xi = b[row];
for (int col = max_row; col > row; col--)
xi -= x[col] * R(row, col);
x[row] = xi / R(row, row);
}
}
/* create_rotation (used in gmres):
* -------------------------------
* Construct a Givens rotation to zero out the lowest non-zero entry in a partially
* factored Hessenberg matrix. Note that the previous Givens rotations should be
* applied to this column before creating a new rotation.
*/
void create_rotation (const DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
{
double a = H(col, col);
double b = H(col + 1, col);
double r;
if (b == 0) {
Cn[col] = copysign(1, a);
Sn[col] = 0;
}
else if (a == 0) {
Cn[col] = 0;
Sn[col] = copysign(1, b);
}
else {
r = sqrt(a*a + b*b);
Sn[col] = -b / r;
Cn[col] = a / r;
}
}
/* Applies the 'col'th Givens rotation stored in vectors Sn and Cn to the 'col'th
* column of the DenseMatrix M. (Previous columns don't need the rotation applied b/c
* presumably, the first col-1 columns are already upper triangular, and so their
* entries in the col and col+1 rows are 0.)
*/
void apply_rotation (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
{
double c = Cn[col];
double s = Sn[col];
double tmp = c * H(col, col) - s * H(col+1, col);
H(col+1, col) = s * H(col, col) + c * H(col+1, col);
H(col, col) = tmp;
}
/* Applies the 'col'th Givens rotation to the vector.
*/
void apply_rotation (Vector &v, size_t col, Vector &Cn, Vector &Sn)
{
double a = v[col];
double b = v[col + 1];
double c = Cn[col];
double s = Sn[col];
v[col] = c * a - s * b;
v[col + 1] = s * a + c * b;
}
/* Applies the first 'col' Givens rotations to the newly-created column
* of H. (Leaves other columns alone.)
*/
void update_column (DenseMatrix &H, size_t col, Vector &Cn, Vector &Sn)
{
for (int i = 0; i < col; i++) {
double c = Cn[i];
double s = Sn[i];
double t = c * H(i,col) - s * H(i+1,col);
H(i+1, col) = s * H(i,col) + c * H(i+1,col);
H(i, col) = t;
}
}
/* After a new column has been added to the Hessenberg matrix, factor it back into
* an upper-triangular matrix by:
* - applying the previous Givens rotations to the new column
* - computing the new Givens rotation to make the column upper triangular
* - applying the new Givens rotation to the column, and
* - applying the new Givens rotation to the solution vector
*/
void update_qr_decomp (DenseMatrix &H, Vector &s, size_t col, Vector &Cn, Vector &Sn)
{
update_column( H, col, Cn, Sn);
create_rotation(H, col, Cn, Sn);
apply_rotation( H, col, Cn, Sn);
apply_rotation( s, col, Cn, Sn);
}
void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double max_err)
{
DEBUG_PRINT("gmres starting!\n");
x.zero();
ASSERT(A.rows() == A.cols());
DenseMatrix Qstar(num_iters + 1, A.rows());
DenseMatrix H(num_iters + 1, num_iters);
// arrays for storing parameters of givens rotations
Vector Sn(num_iters);
Vector Cn(num_iters);
// array for storing the rhs projected onto the Hessenberg matrix's column space
Vector G(num_iters+1);
G.zero();
double beta = b.norm();
G[0] = beta;
// temp vector, stores Aqi
Vector w(A.rows());
w.copy(b);
w.normalize();
Qstar.set_row(0, w);
int iter = 0;
Vector temp(A.rows(), false);
double rel_err;
while (iter < num_iters)
{
// w = Aqi
Qstar.row(iter, temp);
A.multiply(temp, w);
// construct ith column of H, i+1th row of Qstar:
for (int row = 0; row <= iter; row++) {
Qstar.row(row, temp);
H(row, iter) = temp.dot(w);
w.add_ax(-H(row, iter), temp);
}
H(iter+1, iter) = w.norm();
w.divide(H(iter+1, iter));
Qstar.set_row(iter+1, w);
update_qr_decomp (H, G, iter, Cn, Sn);
rel_err = fabs(G[iter+1] / beta);
if (rel_err < max_err)
break;
if (iter % 100 == 0)
DEBUG_PRINT("Iter %d: %f err\n", iter, rel_err);
iter++;
}
if (iter == num_iters) {
fprintf(stderr, "Error: gmres failed to converge in %d iterations (relative err: %f)\n", num_iters, rel_err);
exit(-1);
}
// We've reached an acceptable solution (?):
DEBUG_PRINT("gmres completed in %d iterations (rel. resid. %f, max %f)\n", num_iters, rel_err, max_err);
Vector y(iter+1);
upper_triangular_right_solve(H, G, y);
for (int i = 0; i < iter + 1; i++) {
Qstar.row(i, temp);
x.add_ax(y[i], temp);
}
}


@@ -0,0 +1,50 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ALGORITHM_H__
#define __ALGORITHM_H__
#include "matrix.h"
/* Generalized Minimal Residual Method:
* -----------------------------------
* Takes a square matrix and an rhs and uses GMRES to find an estimate for x.
* The specified error is relative.
*/
void gmres (const Matrix &A, const Vector &b, Vector &x, int num_iters, double err);
#endif


examples/gmres/debug.h Normal file

@@ -0,0 +1,55 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __DEBUG_H__
#define __DEBUG_H__
#include <cassert>
/**************************************************************\
| Macros
\**************************************************************/
#define DEBUG
#ifdef DEBUG
#define ASSERT(expr) assert(expr)
#define DEBUG_PRINT(...) printf(__VA_ARGS__)
#else
#define ASSERT(expr)
#define DEBUG_PRINT(...)
#endif
#endif

examples/gmres/main.cpp Normal file

@@ -0,0 +1,79 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "matrix.h"
#include "algorithm.h"
#include "util.h"
#include <cmath>
#include "../timing.h"
int main (int argc, char **argv)
{
if (argc < 4) {
printf("usage: %s <input-matrix> <input-rhs> <output-file>\n", argv[0]);
return -1;
}
double gmres_cycles;
DEBUG_PRINT("Loading A...\n");
Matrix *A = CRSMatrix::matrix_from_mtf(argv[1]);
if (A == NULL)
return -1;
DEBUG_PRINT("... size: %lu\n", A->cols());
DEBUG_PRINT("Loading b...\n");
Vector *b = Vector::vector_from_mtf(argv[2]);
if (b == NULL)
return -1;
Vector x(A->cols());
DEBUG_PRINT("Beginning gmres...\n");
gmres(*A, *b, x, A->cols() / 2, .01);
// Write result out to file
x.to_mtf(argv[argc-1]);
// Compute residual (double-check)
#ifdef DEBUG
Vector bprime(b->size());
A->multiply(x, bprime);
Vector resid(bprime.size(), &(bprime[0]));
resid.subtract(*b);
DEBUG_PRINT("residual error check: %lg\n", resid.norm() / b->norm());
#endif
// Print profiling results
DEBUG_PRINT("-- Total mcycles to solve : %.03f --\n", gmres_cycles);
}

246
examples/gmres/matrix.cpp Normal file
View File

@@ -0,0 +1,246 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**************************************************************\
| Includes
\**************************************************************/
#include "matrix.h"
#include "matrix_ispc.h"
extern "C" {
#include "mmio.h"
}
/**************************************************************\
| DenseMatrix methods
\**************************************************************/
void DenseMatrix::multiply (const Vector &v, Vector &r) const
{
// Dimensionality check
ASSERT(v.size() == cols());
ASSERT(r.size() == rows());
for (int i = 0; i < rows(); i++)
r[i] = v.dot(entries + i * num_cols);
}
const Vector *DenseMatrix::row (size_t row) const {
return new Vector(num_cols, entries + row * num_cols, true);
}
void DenseMatrix::row (size_t row, Vector &r) {
r.entries = entries + row * cols();
r._size = cols();
}
void DenseMatrix::set_row(size_t row, const Vector &v)
{
ASSERT(v.size() == num_cols);
memcpy(entries + row * num_cols, v.entries, num_cols * sizeof(double));
}
/**************************************************************\
| CRSMatrix Methods
\**************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <algorithm>
struct entry {
int row;
int col;
double val;
};
bool compare_entries(struct entry i, struct entry j) {
if (i.row < j.row)
return true;
if (i.row > j.row)
return false;
return i.col < j.col;
}
#define ERR_OUT(...) { fprintf(stderr, __VA_ARGS__); return NULL; }
CRSMatrix *CRSMatrix::matrix_from_mtf (char *path) {
FILE *f;
MM_typecode matcode;
int m, n, nz;
if ((f = fopen(path, "r")) == NULL)
ERR_OUT("Error: %s does not name a valid/readable file.\n", path);
if (mm_read_banner(f, &matcode) != 0)
ERR_OUT("Error: Could not process Matrix Market banner.\n");
if (mm_is_complex(matcode))
ERR_OUT("Error: Application does not support complex numbers.\n")
if (mm_is_dense(matcode))
ERR_OUT("Error: supplied matrix is dense (should be sparse.)\n");
if (!mm_is_matrix(matcode))
ERR_OUT("Error: %s does not encode a matrix.\n", path)
if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0)
ERR_OUT("Error: could not read matrix size from file.\n");
if (m != n)
ERR_OUT("Error: Application does not support non-square matrices.");
std::vector<struct entry> entries;
entries.resize(nz);
for (int i = 0; i < nz; i++) {
fscanf(f, "%d %d %lg\n", &entries[i].row, &entries[i].col, &entries[i].val);
// Adjust from 1-based to 0-based
entries[i].row--;
entries[i].col--;
}
sort(entries.begin(), entries.end(), compare_entries);
CRSMatrix *M = new CRSMatrix(m, n, nz);
int cur_row = -1;
for (int i = 0; i < nz; i++) {
while (entries[i].row > cur_row)
M->row_offsets[++cur_row] = i;
M->entries[i] = entries[i].val;
M->columns[i] = entries[i].col;
}
fclose(f);
return M;
}
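/* A worked example of the layout built above (a sketch, using the CRSMatrix
members declared in matrix.h): for the 3x3 matrix
[ 5 0 0 ]
[ 0 8 3 ]
[ 0 0 6 ]
the sorted (row, col, val) triples produce
entries     = { 5, 8, 3, 6 }
columns     = { 0, 1, 2, 2 }
row_offsets = { 0, 1, 3 }   // row i spans [row_offsets[i], row_offsets[i+1])
with _nonzeroes = 4 terminating the final row, exactly as multiply() below expects. */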
Vector *Vector::vector_from_mtf (char *path) {
FILE *f;
MM_typecode matcode;
int m, n, nz;
if ((f = fopen(path, "r")) == NULL)
ERR_OUT("Error: %s does not name a valid/readable file.\n", path);
if (mm_read_banner(f, &matcode) != 0)
ERR_OUT("Error: Could not process Matrix Market banner.\n");
if (mm_is_complex(matcode))
ERR_OUT("Error: Application does not support complex numbers.\n")
if (mm_is_dense(matcode)) {
if (mm_read_mtx_array_size(f, &m, &n) != 0)
ERR_OUT("Error: could not read matrix size from file.\n");
} else {
if (mm_read_mtx_crd_size(f, &m, &n, &nz) != 0)
ERR_OUT("Error: could not read matrix size from file.\n");
}
if (n != 1)
ERR_OUT("Error: %s does not describe a vector.\n", path);
Vector *x = new Vector(m);
if (mm_is_dense(matcode)) {
double val;
for (int i = 0; i < m; i++) {
fscanf(f, "%lg\n", &val);
(*x)[i] = val;
}
}
else {
x->zero();
double val;
int row;
int col;
for (int i = 0; i < nz; i++) {
fscanf(f, "%d %d %lg\n", &row, &col, &val);
(*x)[row-1] = val;
}
}
fclose(f);
return x;
}
#define ERR(...) { fprintf(stderr, __VA_ARGS__); exit(-1); }
void Vector::to_mtf (char *path) {
FILE *f;
MM_typecode matcode;
mm_initialize_typecode(&matcode);
mm_set_matrix(&matcode);
mm_set_real(&matcode);
mm_set_dense(&matcode);
mm_set_general(&matcode);
if ((f = fopen(path, "w")) == NULL)
ERR("Error: cannot open/write to %s\n", path);
mm_write_banner(f, matcode);
mm_write_mtx_array_size(f, size(), 1);
for (int i = 0; i < size(); i++)
fprintf(f, "%lg\n", entries[i]);
fclose(f);
}
void CRSMatrix::multiply (const Vector &v, Vector &r) const
{
ASSERT(v.size() == cols());
ASSERT(r.size() == rows());
for (int row = 0; row < rows(); row++)
{
int row_offset = row_offsets[row];
int next_offset = ((row + 1 == rows()) ? _nonzeroes : row_offsets[row + 1]);
double sum = 0;
for (int i = row_offset; i < next_offset; i++)
{
sum += v[columns[i]] * entries[i];
}
r[row] = sum;
}
}
void CRSMatrix::zero ( )
{
entries.clear();
row_offsets.clear();
columns.clear();
_nonzeroes = 0;
}

279
examples/gmres/matrix.h Normal file
View File

@@ -0,0 +1,279 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MATRIX_H__
#define __MATRIX_H__
/**************************************************************\
| Includes
\**************************************************************/
#include <cstring> // memcpy
#include <cstdlib> // malloc, free
#include <cmath> // sqrt
#include <vector>
#include "debug.h"
#include "matrix_ispc.h"
class DenseMatrix;
/**************************************************************\
| Vector class
\**************************************************************/
class Vector {
public:
static Vector *vector_from_mtf(char *path);
void to_mtf (char *path);
Vector(size_t size, bool alloc_mem=true)
{
shared_ptr = false;
_size = size;
if (alloc_mem)
entries = (double *) malloc(sizeof(double) * _size);
else {
shared_ptr = true;
entries = NULL;
}
}
Vector(size_t size, double *content, bool share_ptr=false)
{
_size = size;
if (share_ptr) {
entries = content;
shared_ptr = true;
}
else {
shared_ptr = false;
entries = (double *) malloc(sizeof(double) * _size);
memcpy(entries, content, sizeof(double) * _size);
}
}
~Vector() { if (!shared_ptr) free(entries); }
const double & operator [] (size_t index) const
{
ASSERT(index < _size);
return *(entries + index);
}
double &operator [] (size_t index)
{
ASSERT(index < _size);
return *(entries + index);
}
bool operator == (const Vector &v) const
{
if (v.size() != _size)
return false;
for (int i = 0; i < _size; i++)
if (entries[i] != v[i])
return false;
return true;
}
size_t size() const {return _size; }
double dot (const Vector &b) const
{
ASSERT(b.size() == this->size());
return ispc::vector_dot(entries, b.entries, size());
}
double dot (const double * const b) const
{
return ispc::vector_dot(entries, b, size());
}
void zero ()
{
ispc::zero(entries, size());
}
double norm () const { return sqrt(dot(entries)); }
void normalize () { this->divide(this->norm()); }
void add (const Vector &a)
{
ASSERT(size() == a.size());
ispc::vector_add(entries, a.entries, size());
}
void subtract (const Vector &s)
{
ASSERT(size() == s.size());
ispc::vector_sub(entries, s.entries, size());
}
void multiply (double scalar)
{
ispc::vector_mult(entries, scalar, size());
}
void divide (double scalar)
{
ispc::vector_div(entries, scalar, size());
}
// Note: x may be longer than *(this)
void add_ax (double a, const Vector &x) {
ASSERT(x.size() >= size());
ispc::vector_add_ax(entries, a, x.entries, size());
}
// Note that copy only copies the first size() elements of the
// supplied vector, i.e. the supplied vector can be longer than
// this one. This is useful in least squares calculations.
void copy (const Vector &other) {
ASSERT(other.size() >= size());
memcpy(entries, other.entries, size() * sizeof(double));
}
friend class DenseMatrix;
private:
size_t _size;
bool shared_ptr;
double *entries;
};
/**************************************************************\
| Matrix base class
\**************************************************************/
class Matrix {
friend class Vector;
public:
Matrix(size_t size_r, size_t size_c)
{
num_rows = size_r;
num_cols = size_c;
}
~Matrix(){}
size_t rows() const { return num_rows; }
size_t cols() const { return num_cols; }
virtual void multiply (const Vector &v, Vector &r) const = 0;
virtual void zero () = 0;
protected:
size_t num_rows;
size_t num_cols;
};
/**************************************************************\
| DenseMatrix class
\**************************************************************/
class DenseMatrix : public Matrix {
friend class Vector;
public:
DenseMatrix(size_t size_r, size_t size_c) : Matrix(size_r, size_c)
{
entries = (double *) malloc(size_r * size_c * sizeof(double));
}
DenseMatrix(size_t size_r, size_t size_c, const double *content) : Matrix (size_r, size_c)
{
entries = (double *) malloc(size_r * size_c * sizeof(double));
memcpy(entries, content, size_r * size_c * sizeof(double));
}
virtual void multiply (const Vector &v, Vector &r) const;
double &operator () (unsigned int r, unsigned int c)
{
return *(entries + r * num_cols + c);
}
const double &operator () (unsigned int r, unsigned int c) const
{
return *(entries + r * num_cols + c);
}
const Vector *row(size_t row) const;
void row(size_t row, Vector &r);
void set_row(size_t row, const Vector &v);
virtual void zero() { ispc::zero(entries, rows() * cols()); }
void copy (const DenseMatrix &other)
{
ASSERT(rows() == other.rows());
ASSERT(cols() == other.cols());
memcpy(entries, other.entries, rows() * cols() * sizeof(double));
}
private:
double *entries;
bool shared_ptr;
};
/**************************************************************\
| CRSMatrix (compressed row storage, a sparse matrix format)
\**************************************************************/
class CRSMatrix : public Matrix {
public:
CRSMatrix (size_t size_r, size_t size_c, size_t nonzeroes) :
Matrix(size_r, size_c)
{
_nonzeroes = nonzeroes;
entries.resize(nonzeroes);
columns.resize(nonzeroes);
row_offsets.resize(size_r);
}
virtual void multiply(const Vector &v, Vector &r) const;
virtual void zero();
static CRSMatrix *matrix_from_mtf (char *path);
private:
unsigned int _nonzeroes;
std::vector<double> entries;
std::vector<int> row_offsets;
std::vector<int> columns;
};
#endif

122
examples/gmres/matrix.ispc Normal file
View File

@@ -0,0 +1,122 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**************************************************************\
| General
\**************************************************************/
export void zero (uniform double data[],
uniform int size)
{
foreach (i = 0 ... size)
data[i] = 0.0;
}
/**************************************************************\
| Vector helpers
\**************************************************************/
export void vector_add (uniform double a[],
const uniform double b[],
const uniform int size)
{
foreach (i = 0 ... size)
a[i] += b[i];
}
export void vector_sub (uniform double a[],
const uniform double b[],
const uniform int size)
{
foreach (i = 0 ... size)
a[i] -= b[i];
}
export void vector_mult (uniform double a[],
const uniform double b,
const uniform int size)
{
foreach (i = 0 ... size)
a[i] *= b;
}
export void vector_div (uniform double a[],
const uniform double b,
const uniform int size)
{
foreach (i = 0 ... size)
a[i] /= b;
}
export void vector_add_ax (uniform double r[],
const uniform double a,
const uniform double x[],
const uniform int size)
{
foreach (i = 0 ... size)
r[i] += a * x[i];
}
export uniform double vector_dot (const uniform double a[],
const uniform double b[],
const uniform int size)
{
varying double sum = 0.0;
foreach (i = 0 ... size)
sum += a[i] * b[i];
return reduce_add(sum);
}
/**************************************************************\
| Matrix helpers
\**************************************************************/
export void sparse_multiply (const uniform double entries[],
const uniform double columns[],
const uniform double row_offsets[],
const uniform int rows,
const uniform int cols,
const uniform int nonzeroes,
const uniform double v[],
uniform double r[])
{
foreach (row = 0 ... rows) {
int row_offset = row_offsets[row];
int next_offset = ((row + 1 == rows) ? nonzeroes : row_offsets[row+1]);
double sum = 0;
for (int j = row_offset; j < next_offset; j++)
sum += v[columns[j]] * entries[j];
r[row] = sum;
}
}

511
examples/gmres/mmio.c Normal file
View File

@@ -0,0 +1,511 @@
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include "mmio.h"
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
double **val_, int **I_, int **J_)
{
FILE *f;
MM_typecode matcode;
int M, N, nz;
int i;
double *val;
int *I, *J;
if ((f = fopen(fname, "r")) == NULL)
return -1;
if (mm_read_banner(f, &matcode) != 0)
{
printf("mm_read_unsymetric: Could not process Matrix Market banner ");
printf(" in file [%s]\n", fname);
return -1;
}
if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
mm_is_sparse(matcode)))
{
fprintf(stderr, "Sorry, this application does not support ");
fprintf(stderr, "Market Market type: [%s]\n",
mm_typecode_to_str(matcode));
return -1;
}
/* find out size of sparse matrix: M, N, nz .... */
if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
{
fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
return -1;
}
*M_ = M;
*N_ = N;
*nz_ = nz;
/* reserve memory for matrices */
I = (int *) malloc(nz * sizeof(int));
J = (int *) malloc(nz * sizeof(int));
val = (double *) malloc(nz * sizeof(double));
*val_ = val;
*I_ = I;
*J_ = J;
/* NOTE: when reading in doubles, ANSI C requires the use of the "l" */
/* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
/* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */
for (i=0; i<nz; i++)
{
fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
I[i]--; /* adjust from 1-based to 0-based */
J[i]--;
}
fclose(f);
return 0;
}
int mm_is_valid(MM_typecode matcode)
{
if (!mm_is_matrix(matcode)) return 0;
if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0;
if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0;
if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) ||
mm_is_skew(matcode))) return 0;
return 1;
}
int mm_read_banner(FILE *f, MM_typecode *matcode)
{
char line[MM_MAX_LINE_LENGTH];
char banner[MM_MAX_TOKEN_LENGTH];
char mtx[MM_MAX_TOKEN_LENGTH];
char crd[MM_MAX_TOKEN_LENGTH];
char data_type[MM_MAX_TOKEN_LENGTH];
char storage_scheme[MM_MAX_TOKEN_LENGTH];
char *p;
mm_clear_typecode(matcode);
if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
return MM_PREMATURE_EOF;
if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type,
storage_scheme) != 5)
return MM_PREMATURE_EOF;
for (p=mtx; *p!='\0'; *p=tolower(*p),p++); /* convert to lower case */
for (p=crd; *p!='\0'; *p=tolower(*p),p++);
for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);
/* check for banner */
if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
return MM_NO_HEADER;
/* first field should be "mtx" */
if (strcmp(mtx, MM_MTX_STR) != 0)
return MM_UNSUPPORTED_TYPE;
mm_set_matrix(matcode);
/* second field describes whether this is a sparse matrix (in coordinate
storage) or a dense array */
if (strcmp(crd, MM_SPARSE_STR) == 0)
mm_set_sparse(matcode);
else
if (strcmp(crd, MM_DENSE_STR) == 0)
mm_set_dense(matcode);
else
return MM_UNSUPPORTED_TYPE;
/* third field */
if (strcmp(data_type, MM_REAL_STR) == 0)
mm_set_real(matcode);
else
if (strcmp(data_type, MM_COMPLEX_STR) == 0)
mm_set_complex(matcode);
else
if (strcmp(data_type, MM_PATTERN_STR) == 0)
mm_set_pattern(matcode);
else
if (strcmp(data_type, MM_INT_STR) == 0)
mm_set_integer(matcode);
else
return MM_UNSUPPORTED_TYPE;
/* fourth field */
if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
mm_set_general(matcode);
else
if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
mm_set_symmetric(matcode);
else
if (strcmp(storage_scheme, MM_HERM_STR) == 0)
mm_set_hermitian(matcode);
else
if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
mm_set_skew(matcode);
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
{
if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
{
char line[MM_MAX_LINE_LENGTH];
int num_items_read;
/* set return null parameter values, in case we exit with errors */
*M = *N = *nz = 0;
/* now continue scanning until you reach the end-of-comments */
do
{
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
return MM_PREMATURE_EOF;
}while (line[0] == '%');
/* line[] is either blank or has M,N, nz */
if (sscanf(line, "%d %d %d", M, N, nz) == 3)
return 0;
else
do
{
num_items_read = fscanf(f, "%d %d %d", M, N, nz);
if (num_items_read == EOF) return MM_PREMATURE_EOF;
}
while (num_items_read != 3);
return 0;
}
int mm_read_mtx_array_size(FILE *f, int *M, int *N)
{
char line[MM_MAX_LINE_LENGTH];
int num_items_read;
/* set return null parameter values, in case we exit with errors */
*M = *N = 0;
/* now continue scanning until you reach the end-of-comments */
do
{
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
return MM_PREMATURE_EOF;
}while (line[0] == '%');
/* line[] is either blank or has M, N */
if (sscanf(line, "%d %d", M, N) == 2)
return 0;
else /* we have a blank line */
do
{
num_items_read = fscanf(f, "%d %d", M, N);
if (num_items_read == EOF) return MM_PREMATURE_EOF;
}
while (num_items_read != 2);
return 0;
}
int mm_write_mtx_array_size(FILE *f, int M, int N)
{
if (fprintf(f, "%d %d\n", M, N) != 2)
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
/*-------------------------------------------------------------------------*/
/******************************************************************/
/* use when I[], J[], and val[] are already allocated */
/******************************************************************/
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode)
{
int i;
if (mm_is_complex(matcode))
{
for (i=0; i<nz; i++)
if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2*i], &val[2*i+1])
!= 4) return MM_PREMATURE_EOF;
}
else if (mm_is_real(matcode))
{
for (i=0; i<nz; i++)
{
if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i])
!= 3) return MM_PREMATURE_EOF;
}
}
else if (mm_is_pattern(matcode))
{
for (i=0; i<nz; i++)
if (fscanf(f, "%d %d", &I[i], &J[i])
!= 2) return MM_PREMATURE_EOF;
}
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
double *real, double *imag, MM_typecode matcode)
{
if (mm_is_complex(matcode))
{
if (fscanf(f, "%d %d %lg %lg", I, J, real, imag)
!= 4) return MM_PREMATURE_EOF;
}
else if (mm_is_real(matcode))
{
if (fscanf(f, "%d %d %lg\n", I, J, real)
!= 3) return MM_PREMATURE_EOF;
}
else if (mm_is_pattern(matcode))
{
if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF;
}
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
/************************************************************************
mm_read_mtx_crd() fills M, N, nz, array of values, and return
type code, e.g. 'MCRS'
if matrix is complex, values[] is of size 2*nz,
(nz pairs of real/imaginary values)
************************************************************************/
int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
double **val, MM_typecode *matcode)
{
int ret_code;
FILE *f;
if (strcmp(fname, "stdin") == 0) f=stdin;
else
if ((f = fopen(fname, "r")) == NULL)
return MM_COULD_NOT_READ_FILE;
if ((ret_code = mm_read_banner(f, matcode)) != 0)
return ret_code;
if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
mm_is_matrix(*matcode)))
return MM_UNSUPPORTED_TYPE;
if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
return ret_code;
*I = (int *) malloc(*nz * sizeof(int));
*J = (int *) malloc(*nz * sizeof(int));
*val = NULL;
if (mm_is_complex(*matcode))
{
*val = (double *) malloc(*nz * 2 * sizeof(double));
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
else if (mm_is_real(*matcode))
{
*val = (double *) malloc(*nz * sizeof(double));
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
else if (mm_is_pattern(*matcode))
{
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
if (f != stdin) fclose(f);
return 0;
}
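/* Typical use (a sketch; error handling and the pattern/complex cases omitted):
int M, N, nz, *I, *J;
double *val;
MM_typecode tc;
if (mm_read_mtx_crd(path, &M, &N, &nz, &I, &J, &val, &tc) == 0) {
    ... the nz entries are (I[k], J[k], val[k]) triples with 1-based indices ...
    free(I); free(J); free(val);
}
*/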
int mm_write_banner(FILE *f, MM_typecode matcode)
{
char *str = mm_typecode_to_str(matcode);
int ret_code;
ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
free(str);
if (ret_code < 0) /* a negative return from fprintf signals an error */
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode)
{
FILE *f;
int i;
if (strcmp(fname, "stdout") == 0)
f = stdout;
else
if ((f = fopen(fname, "w")) == NULL)
return MM_COULD_NOT_WRITE_FILE;
/* print banner followed by typecode */
fprintf(f, "%s ", MatrixMarketBanner);
fprintf(f, "%s\n", mm_typecode_to_str(matcode));
/* print matrix sizes and nonzeros */
fprintf(f, "%d %d %d\n", M, N, nz);
/* print values */
if (mm_is_pattern(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d\n", I[i], J[i]);
else
if (mm_is_real(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
else
if (mm_is_complex(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i],
val[2*i+1]);
else
{
if (f != stdout) fclose(f);
return MM_UNSUPPORTED_TYPE;
}
if (f !=stdout) fclose(f);
return 0;
}
/**
* Create a new copy of a string s. mm_strdup() is a common routine, but
* not part of ANSI C, so it is included here. Used by mm_typecode_to_str().
*
*/
char *mm_strdup(const char *s)
{
int len = strlen(s);
char *s2 = (char *) malloc((len+1)*sizeof(char));
return strcpy(s2, s);
}
char *mm_typecode_to_str(MM_typecode matcode)
{
char buffer[MM_MAX_LINE_LENGTH];
char *types[4];
char *mm_strdup(const char *);
int error =0;
/* check for MTX type */
if (mm_is_matrix(matcode))
types[0] = MM_MTX_STR;
else
error=1;
/* check for CRD or ARR matrix */
if (mm_is_sparse(matcode))
types[1] = MM_SPARSE_STR;
else
if (mm_is_dense(matcode))
types[1] = MM_DENSE_STR;
else
return NULL;
/* check for element data type */
if (mm_is_real(matcode))
types[2] = MM_REAL_STR;
else
if (mm_is_complex(matcode))
types[2] = MM_COMPLEX_STR;
else
if (mm_is_pattern(matcode))
types[2] = MM_PATTERN_STR;
else
if (mm_is_integer(matcode))
types[2] = MM_INT_STR;
else
return NULL;
/* check for symmetry type */
if (mm_is_general(matcode))
types[3] = MM_GENERAL_STR;
else
if (mm_is_symmetric(matcode))
types[3] = MM_SYMM_STR;
else
if (mm_is_hermitian(matcode))
types[3] = MM_HERM_STR;
else
if (mm_is_skew(matcode))
types[3] = MM_SKEW_STR;
else
return NULL;
sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
return mm_strdup(buffer);
}

135
examples/gmres/mmio.h Normal file
View File

@@ -0,0 +1,135 @@
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
#ifndef MM_IO_H
#define MM_IO_H
#define MM_MAX_LINE_LENGTH 1025
#define MatrixMarketBanner "%%MatrixMarket"
#define MM_MAX_TOKEN_LENGTH 64
typedef char MM_typecode[4];
#include <stdio.h>
char *mm_typecode_to_str(MM_typecode matcode);
int mm_read_banner(FILE *f, MM_typecode *matcode);
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
int mm_read_mtx_array_size(FILE *f, int *M, int *N);
int mm_write_banner(FILE *f, MM_typecode matcode);
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
int mm_write_mtx_array_size(FILE *f, int M, int N);
/********************* MM_typecode query functions ***************************/
#define mm_is_matrix(typecode) ((typecode)[0]=='M')
#define mm_is_sparse(typecode) ((typecode)[1]=='C')
#define mm_is_coordinate(typecode)((typecode)[1]=='C')
#define mm_is_dense(typecode) ((typecode)[1]=='A')
#define mm_is_array(typecode) ((typecode)[1]=='A')
#define mm_is_complex(typecode) ((typecode)[2]=='C')
#define mm_is_real(typecode) ((typecode)[2]=='R')
#define mm_is_pattern(typecode) ((typecode)[2]=='P')
#define mm_is_integer(typecode) ((typecode)[2]=='I')
#define mm_is_symmetric(typecode)((typecode)[3]=='S')
#define mm_is_general(typecode) ((typecode)[3]=='G')
#define mm_is_skew(typecode) ((typecode)[3]=='K')
#define mm_is_hermitian(typecode)((typecode)[3]=='H')
int mm_is_valid(MM_typecode matcode); /* too complex for a macro */
/********************* MM_typecode modify functions ***************************/
#define mm_set_matrix(typecode) ((*typecode)[0]='M')
#define mm_set_coordinate(typecode) ((*typecode)[1]='C')
#define mm_set_array(typecode) ((*typecode)[1]='A')
#define mm_set_dense(typecode) mm_set_array(typecode)
#define mm_set_sparse(typecode) mm_set_coordinate(typecode)
#define mm_set_complex(typecode)((*typecode)[2]='C')
#define mm_set_real(typecode) ((*typecode)[2]='R')
#define mm_set_pattern(typecode)((*typecode)[2]='P')
#define mm_set_integer(typecode)((*typecode)[2]='I')
#define mm_set_symmetric(typecode)((*typecode)[3]='S')
#define mm_set_general(typecode)((*typecode)[3]='G')
#define mm_set_skew(typecode) ((*typecode)[3]='K')
#define mm_set_hermitian(typecode)((*typecode)[3]='H')
#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
(*typecode)[2]=' ',(*typecode)[3]='G')
#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
/********************* Matrix Market error codes ***************************/
#define MM_COULD_NOT_READ_FILE 11
#define MM_PREMATURE_EOF 12
#define MM_NOT_MTX 13
#define MM_NO_HEADER 14
#define MM_UNSUPPORTED_TYPE 15
#define MM_LINE_TOO_LONG 16
#define MM_COULD_NOT_WRITE_FILE 17
/******************** Matrix Market internal definitions ********************
MM_matrix_typecode: 4-character sequence
                      object     sparse/     data        storage
                                 dense       type        scheme
string position:      [0]        [1]         [2]         [3]
Matrix typecode:      M(atrix)   C(oord)     R(eal)      G(eneral)
                                 A(rray)     C(omplex)   H(ermitian)
                                             P(attern)   S(ymmetric)
                                             I(nteger)   K(skew)
***********************************************************************/
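/* Example: the banner line "%%MatrixMarket matrix coordinate real general"
parses to the typecode 'M','C','R','G', i.e. a sparse (coordinate) matrix of
real entries with no symmetry recorded. */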
#define MM_MTX_STR "matrix"
#define MM_ARRAY_STR "array"
#define MM_DENSE_STR "array"
#define MM_COORDINATE_STR "coordinate"
#define MM_SPARSE_STR "coordinate"
#define MM_COMPLEX_STR "complex"
#define MM_REAL_STR "real"
#define MM_INT_STR "integer"
#define MM_GENERAL_STR "general"
#define MM_SYMM_STR "symmetric"
#define MM_HERM_STR "hermitian"
#define MM_SKEW_STR "skew-symmetric"
#define MM_PATTERN_STR "pattern"
/* high level routines */
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode);
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode);
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
MM_typecode matcode);
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
double **val_, int **I_, int **J_);
#endif

53
examples/gmres/util.h Normal file
View File

@@ -0,0 +1,53 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __UTIL_H__
#define __UTIL_H__
#include <stdio.h>
#include "matrix.h"
inline void printMatrix (DenseMatrix &M, const char *name) {
printf("Matrix %s:\n", name);
for (int row = 0; row < M.rows(); row++) {
printf("row %2d: ", row + 1);
for (int col = 0; col < M.cols(); col++)
printf("%6f ", M(row, col));
printf("\n");
}
printf("\n");
}
#endif

View File

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
struct __vec16_i1 {
__vec16_i1() { }
__vec16_i1(const uint16_t &vv) : v(vv) { }
__vec16_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
@@ -193,13 +194,22 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
return ret; \
}
#define CMP_OP(TYPE, CAST, NAME, OP) \
static FORCEINLINE __vec16_i1 NAME(TYPE a, TYPE b) { \
#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \
static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \
__vec16_i1 ret; \
ret.v = 0; \
for (int i = 0; i < 16; ++i) \
ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \
return ret; \
} \
static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \
__vec16_i1 mask) { \
__vec16_i1 ret; \
ret.v = 0; \
for (int i = 0; i < 16; ++i) \
ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \
ret.v &= mask.v; \
return ret; \
}
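// For instance, CMP_OP(__vec16_i32, i32, int32_t, __equal, ==) now expands to the
// pair __equal_i32() and __equal_i32_and_mask(); the _and_mask variant folds the
// execution mask into the comparison result so callers no longer need a separate
// __and() on the returned __vec16_i1 (a sketch of the expansion, matching the
// instantiations further down).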
#define INSERT_EXTRACT(VTYPE, STYPE) \
@@ -211,14 +221,16 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
}
#define LOAD_STORE(VTYPE, STYPE) \
static FORCEINLINE VTYPE __load(VTYPE *p, int align) { \
template <int ALIGN> \
static FORCEINLINE VTYPE __load(const VTYPE *p) { \
STYPE *ptr = (STYPE *)p; \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = ptr[i]; \
return ret; \
} \
static FORCEINLINE void __store(VTYPE *p, VTYPE v, int align) { \
template <int ALIGN> \
static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \
STYPE *ptr = (STYPE *)p; \
for (int i = 0; i < 16; ++i) \
ptr[i] = v.v[i]; \
@@ -259,13 +271,29 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
return ret; \
}
#define SMEAR(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = v; \
return ret; \
} \
#define SMEAR(VTYPE, NAME, STYPE) \
template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) { \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = v; \
return ret; \
}
#define SETZERO(VTYPE, NAME) \
template <class RetVecType> VTYPE __setzero_##NAME(); \
template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() { \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = 0; \
return ret; \
}
#define UNDEF(VTYPE, NAME) \
template <class RetVecType> VTYPE __undef_##NAME(); \
template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() { \
return VTYPE(); \
}
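// These are now function templates keyed on the requested return vector type, so a
// call site names the result type explicitly, e.g. (a sketch):
//     __vec16_i32 v = __smear_i32<__vec16_i32>(42);   // all 16 lanes set to 42
//     __vec16_f   z = __setzero_float<__vec16_f>();   // all 16 lanes set to 0.0f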
#define BROADCAST(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
@@ -311,11 +339,23 @@ INSERT_EXTRACT(__vec1_d, double)
///////////////////////////////////////////////////////////////////////////
// mask ops
static FORCEINLINE uint32_t __movmsk(__vec16_i1 mask) {
return mask.v;
static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
return (uint64_t)mask.v;
}
static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
static FORCEINLINE bool __any(__vec16_i1 mask) {
return (mask.v!=0);
}
static FORCEINLINE bool __all(__vec16_i1 mask) {
return (mask.v==0xFFFF);
}
static FORCEINLINE bool __none(__vec16_i1 mask) {
return (mask.v==0);
}
static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
__vec16_i1 r;
r.v = (a.v & b.v) | (~a.v & ~b.v);
return r;
@@ -339,6 +379,24 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) {
return r;
}
static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) {
__vec16_i1 r;
r.v = ~v.v;
return r;
}
static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) {
__vec16_i1 r;
r.v = ~a.v & b.v;
return r;
}
static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) {
__vec16_i1 r;
r.v = a.v & ~b.v;
return r;
}
static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a,
__vec16_i1 b) {
__vec16_i1 r;
@@ -362,18 +420,36 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
vec->v |= (1 << index);
}
static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p, int align) {
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
uint16_t *ptr = (uint16_t *)p;
__vec16_i1 r;
r.v = *ptr;
return r;
}
static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v, int align) {
template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) {
uint16_t *ptr = (uint16_t *)p;
*ptr = v.v;
}
template <class RetVecType> __vec16_i1 __smear_i1(int i);
template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int v) {
return __vec16_i1(v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v);
}
template <class RetVecType> __vec16_i1 __setzero_i1();
template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
return __vec16_i1(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
}
template <class RetVecType> __vec16_i1 __undef_i1();
template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
return __vec16_i1();
}
///////////////////////////////////////////////////////////////////////////
// int8
@@ -398,20 +474,22 @@ SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)
CMP_OP(__vec16_i8, int8_t, __equal, ==)
CMP_OP(__vec16_i8, int8_t, __not_equal, !=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i8, int8_t, __signed_less_equal, <=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i8, int8_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_than, <)
CMP_OP(__vec16_i8, int8_t, __signed_less_than, <)
CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i8, int8_t, __signed_greater_than, >)
CMP_OP(__vec16_i8, i8, int8_t, __equal, ==)
CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <)
CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >)
SELECT(__vec16_i8)
INSERT_EXTRACT(__vec16_i8, int8_t)
SMEAR(__vec16_i8, i8, int8_t)
SETZERO(__vec16_i8, i8)
UNDEF(__vec16_i8, i8)
BROADCAST(__vec16_i8, i8, int8_t)
ROTATE(__vec16_i8, i8, int8_t)
SHUFFLES(__vec16_i8, i8, int8_t)
@@ -441,20 +519,22 @@ SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)
CMP_OP(__vec16_i16, int16_t, __equal, ==)
CMP_OP(__vec16_i16, int16_t, __not_equal, !=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i16, int16_t, __signed_less_equal, <=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i16, int16_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_than, <)
CMP_OP(__vec16_i16, int16_t, __signed_less_than, <)
CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i16, int16_t, __signed_greater_than, >)
CMP_OP(__vec16_i16, i16, int16_t, __equal, ==)
CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <)
CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >)
SELECT(__vec16_i16)
INSERT_EXTRACT(__vec16_i16, int16_t)
SMEAR(__vec16_i16, i16, int16_t)
SETZERO(__vec16_i16, i16)
UNDEF(__vec16_i16, i16)
BROADCAST(__vec16_i16, i16, int16_t)
ROTATE(__vec16_i16, i16, int16_t)
SHUFFLES(__vec16_i16, i16, int16_t)
@@ -484,20 +564,22 @@ SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)
CMP_OP(__vec16_i32, int32_t, __equal, ==)
CMP_OP(__vec16_i32, int32_t, __not_equal, !=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i32, int32_t, __signed_less_equal, <=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i32, int32_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_than, <)
CMP_OP(__vec16_i32, int32_t, __signed_less_than, <)
CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i32, int32_t, __signed_greater_than, >)
CMP_OP(__vec16_i32, i32, int32_t, __equal, ==)
CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <)
CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >)
SELECT(__vec16_i32)
INSERT_EXTRACT(__vec16_i32, int32_t)
SMEAR(__vec16_i32, i32, int32_t)
SETZERO(__vec16_i32, i32)
UNDEF(__vec16_i32, i32)
BROADCAST(__vec16_i32, i32, int32_t)
ROTATE(__vec16_i32, i32, int32_t)
SHUFFLES(__vec16_i32, i32, int32_t)
@@ -527,20 +609,22 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
CMP_OP(__vec16_i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i64, int64_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_than, <)
CMP_OP(__vec16_i64, int64_t, __signed_less_than, <)
CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)
CMP_OP(__vec16_i64, i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <)
CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >)
SELECT(__vec16_i64)
INSERT_EXTRACT(__vec16_i64, int64_t)
SMEAR(__vec16_i64, i64, int64_t)
SETZERO(__vec16_i64, i64)
UNDEF(__vec16_i64, i64)
BROADCAST(__vec16_i64, i64, int64_t)
ROTATE(__vec16_i64, i64, int64_t)
SHUFFLES(__vec16_i64, i64, int64_t)
@@ -554,14 +638,14 @@ BINARY_OP(__vec16_f, __sub, -)
BINARY_OP(__vec16_f, __mul, *)
BINARY_OP(__vec16_f, __div, /)
CMP_OP(__vec16_f, float, __equal, ==)
CMP_OP(__vec16_f, float, __not_equal, !=)
CMP_OP(__vec16_f, float, __less_than, <)
CMP_OP(__vec16_f, float, __less_equal, <=)
CMP_OP(__vec16_f, float, __greater_than, >)
CMP_OP(__vec16_f, float, __greater_equal, >=)
CMP_OP(__vec16_f, float, float, __equal, ==)
CMP_OP(__vec16_f, float, float, __not_equal, !=)
CMP_OP(__vec16_f, float, float, __less_than, <)
CMP_OP(__vec16_f, float, float, __less_equal, <=)
CMP_OP(__vec16_f, float, float, __greater_than, >)
CMP_OP(__vec16_f, float, float, __greater_equal, >=)
static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
@@ -569,6 +653,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
return ret;
}
static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
return ret;
}
#if 0
case Instruction::FRem: intrinsic = "__frem"; break;
#endif
@@ -576,11 +668,128 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
SELECT(__vec16_f)
INSERT_EXTRACT(__vec16_f, float)
SMEAR(__vec16_f, float, float)
SETZERO(__vec16_f, float)
UNDEF(__vec16_f, float)
BROADCAST(__vec16_f, float, float)
ROTATE(__vec16_f, float, float)
SHUFFLES(__vec16_f, float, float)
LOAD_STORE(__vec16_f, float)
static FORCEINLINE float __exp_uniform_float(float v) {
return expf(v);
}
static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = expf(v.v[i]);
return ret;
}
static FORCEINLINE float __log_uniform_float(float v) {
return logf(v);
}
static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = logf(v.v[i]);
return ret;
}
static FORCEINLINE float __pow_uniform_float(float a, float b) {
return powf(a, b);
}
static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = powf(a.v[i], b.v[i]);
return ret;
}
static FORCEINLINE int __intbits(float v) {
union {
float f;
int i;
} u;
u.f = v;
return u.i;
}
static FORCEINLINE float __floatbits(int v) {
union {
float f;
int i;
} u;
u.i = v;
return u.f;
}
static FORCEINLINE float __half_to_float_uniform(int16_t h) {
static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits
uint32_t exp = shifted_exp & o; // just the exponent
o += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) // Inf/NaN?
o += (128 - 16) << 23; // extra exp adjust
else if (exp == 0) { // Zero/Denormal?
o += 1 << 23; // extra exp adjust
o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize
}
o |= ((int32_t)(h & 0x8000)) << 16; // sign bit
return __floatbits(o);
}
static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) {
__vec16_f ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = __half_to_float_uniform(v.v[i]);
return ret;
}
static FORCEINLINE int16_t __float_to_half_uniform(float f) {
uint32_t sign_mask = 0x80000000u;
int32_t o;
int32_t fint = __intbits(f);
int32_t sign = fint & sign_mask;
fint ^= sign;
int32_t f32infty = 255 << 23;
o = (fint > f32infty) ? 0x7e00 : 0x7c00;
// (De)normalized number or zero
// update fint unconditionally to save the blending; we don't need it
// anymore for the Inf/NaN case anyway.
const uint32_t round_mask = ~0xfffu;
const int32_t magic = 15 << 23;
const int32_t f16infty = 31 << 23;
int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask;
fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
if (fint < f32infty)
o = fint2 >> 13; // Take the bits!
return (o | (sign >> 16));
}
static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) {
__vec16_i16 ret;
for (int i = 0; i < 16; ++i)
ret.v[i] = __float_to_half_uniform(v.v[i]);
return ret;
}
///////////////////////////////////////////////////////////////////////////
// double
@@ -589,14 +798,14 @@ BINARY_OP(__vec16_d, __sub, -)
BINARY_OP(__vec16_d, __mul, *)
BINARY_OP(__vec16_d, __div, /)
CMP_OP(__vec16_d, double, __equal, ==)
CMP_OP(__vec16_d, double, __not_equal, !=)
CMP_OP(__vec16_d, double, __less_than, <)
CMP_OP(__vec16_d, double, __less_equal, <=)
CMP_OP(__vec16_d, double, __greater_than, >)
CMP_OP(__vec16_d, double, __greater_equal, >=)
CMP_OP(__vec16_d, double, double, __equal, ==)
CMP_OP(__vec16_d, double, double, __not_equal, !=)
CMP_OP(__vec16_d, double, double, __less_than, <)
CMP_OP(__vec16_d, double, double, __less_equal, <=)
CMP_OP(__vec16_d, double, double, __greater_than, >)
CMP_OP(__vec16_d, double, double, __greater_equal, >=)
static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
@@ -604,6 +813,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
return ret;
}
static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.v = 0;
for (int i = 0; i < 16; ++i)
ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
return ret;
}
#if 0
case Instruction::FRem: intrinsic = "__frem"; break;
#endif
@@ -611,6 +828,8 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
SELECT(__vec16_d)
INSERT_EXTRACT(__vec16_d, double)
SMEAR(__vec16_d, double, double)
SETZERO(__vec16_d, double)
UNDEF(__vec16_d, double)
BROADCAST(__vec16_d, double, double)
ROTATE(__vec16_d, double, double)
SHUFFLES(__vec16_d, double, double)
@@ -962,8 +1181,8 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
///////////////////////////////////////////////////////////////////////////
// masked load/store
static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_i8 __masked_load_i8(void *p,
__vec16_i1 mask) {
__vec16_i8 ret;
int8_t *ptr = (int8_t *)p;
for (int i = 0; i < 16; ++i)
@@ -972,8 +1191,8 @@ static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
return ret;
}
static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_i16 __masked_load_i16(void *p,
__vec16_i1 mask) {
__vec16_i16 ret;
int16_t *ptr = (int16_t *)p;
for (int i = 0; i < 16; ++i)
@@ -982,8 +1201,8 @@ static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
return ret;
}
static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_i32 __masked_load_i32(void *p,
__vec16_i1 mask) {
__vec16_i32 ret;
int32_t *ptr = (int32_t *)p;
for (int i = 0; i < 16; ++i)
@@ -992,8 +1211,18 @@ static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
return ret;
}
static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
__vec16_i1 mask) {
static FORCEINLINE __vec16_f __masked_load_float(void *p,
__vec16_i1 mask) {
__vec16_f ret;
float *ptr = (float *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ret.v[i] = ptr[i];
return ret;
}
static FORCEINLINE __vec16_i64 __masked_load_i64(void *p,
__vec16_i1 mask) {
__vec16_i64 ret;
int64_t *ptr = (int64_t *)p;
for (int i = 0; i < 16; ++i)
@@ -1002,31 +1231,49 @@ static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
return ret;
}
static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
static FORCEINLINE __vec16_d __masked_load_double(void *p,
__vec16_i1 mask) {
__vec16_d ret;
double *ptr = (double *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ret.v[i] = ptr[i];
return ret;
}
static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
int8_t *ptr = (int8_t *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val,
__vec16_i1 mask) {
static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val,
__vec16_i1 mask) {
int16_t *ptr = (int16_t *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
int32_t *ptr = (int32_t *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
static FORCEINLINE void __masked_store_float(void *p, __vec16_f val,
__vec16_i1 mask) {
float *ptr = (float *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
int64_t *ptr = (int64_t *)p;
for (int i = 0; i < 16; ++i)
@@ -1034,24 +1281,42 @@ static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
__masked_store_8(p, val, mask);
static FORCEINLINE void __masked_store_double(void *p, __vec16_d val,
__vec16_i1 mask) {
double *ptr = (double *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val,
static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
__masked_store_16(p, val, mask);
__masked_store_i8(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
__masked_store_32(p, val, mask);
static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val,
__vec16_i1 mask) {
__masked_store_i16(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
__masked_store_64(p, val, mask);
static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val,
__vec16_i1 mask) {
__masked_store_i32(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
__vec16_i1 mask) {
__masked_store_float(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
__masked_store_i64(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
__vec16_i1 mask) {
__masked_store_double(p, val, mask);
}
///////////////////////////////////////////////////////////////////////////
@@ -1060,29 +1325,31 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
}
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double)
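For reference, here is roughly what one of these instantiations expands to under the new (scale, offset) calling convention; this is a sketch assuming the __vec16_f / __vec16_i32 wrappers hold plain v[16] arrays, as elsewhere in this header. The effective byte address for lane i is base + scale * offset.v[i].

static FORCEINLINE __vec16_f
__gather_base_offsets32_float(unsigned char *b, uint32_t scale,
                              __vec16_i32 offset, __vec16_i1 mask) {
    __vec16_f ret;
    int8_t *base = (int8_t *)b;
    for (int i = 0; i < 16; ++i)
        if ((mask.v & (1 << i)) != 0) {
            // each active lane reads one float from base + scale * offset, in bytes
            float *ptr = (float *)(base + scale * offset.v[i]);
            ret.v[i] = *ptr;
        }
    return ret;
}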
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
@@ -1095,39 +1362,46 @@ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
return ret; \
}
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __gather32_i8)
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8)
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __gather32_i8)
GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8)
GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __gather32_i16)
GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
GATHER_GENERAL(__vec16_f, float, __vec16_i32, __gather32_float)
GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
GATHER_GENERAL(__vec16_d, double, __vec16_i32, __gather32_double)
GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, \
__vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double)
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
@@ -1139,14 +1413,18 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
} \
}
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16)
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32)
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32)
SCATTER_GENERAL(__vec16_f, float, __vec16_i32, __scatter32_float)
SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float)
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64)
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64)
SCATTER_GENERAL(__vec16_d, double, __vec16_i32, __scatter32_double)
SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double)
///////////////////////////////////////////////////////////////////////////
// packed load/store

[Several file diffs suppressed because they are too large, among them two new files:
examples/intrinsics/knc.h (2800 lines) and examples/intrinsics/knc2x.h (2058 lines).]

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -40,8 +40,10 @@ static inline int mandel(float c_re, float c_im, int count) {
float new_re = z_re*z_re - z_im*z_im;
float new_im = 2.f * z_re * z_im;
z_re = c_re + new_re;
z_im = c_im + new_im;
unmasked {
z_re = c_re + new_re;
z_im = c_im + new_im;
}
}
return i;


@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -41,8 +41,10 @@ mandel(float c_re, float c_im, int count) {
float new_re = z_re*z_re - z_im*z_im;
float new_im = 2.f * z_re * z_im;
z_re = c_re + new_re;
z_im = c_im + new_im;
unmasked {
z_re = c_re + new_re;
z_im = c_im + new_im;
}
}
return i;
@@ -79,6 +81,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
uniform float dy = (y1 - y0) / height;
uniform int span = 4;
launch[height/span] < mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
maxIterations, output) >;
launch[height/span] mandelbrot_scanline(x0, dx, y0, dy, width, height, span,
maxIterations, output);
}


@@ -77,7 +77,7 @@ black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float T
uniform float ra[], uniform float va[],
uniform float result[], uniform int count) {
uniform int nTasks = max((int)64, (int)count/16384);
launch[nTasks] < bs_task(Sa, Xa, Ta, ra, va, result, count) >;
launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);
}
@@ -150,5 +150,5 @@ binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[],
uniform float va[], uniform float result[],
uniform int count) {
uniform int nTasks = max((int)64, (int)count/16384);
launch[nTasks] < binomial_task(Sa, Xa, Ta, ra, va, result, count) >;
launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);
}


@@ -0,0 +1,7 @@
EXAMPLE=perfbench
CPP_SRC=perfbench.cpp perfbench_serial.cpp
ISPC_SRC=perfbench.ispc
ISPC_TARGETS=sse2,sse4,avx
include ../common.mk


@@ -0,0 +1,108 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <stdio.h>
#include <algorithm>
#include "../timing.h"
#include "perfbench_ispc.h"
typedef void (FuncType)(float *, int, float *, float *);
struct PerfTest {
FuncType *aFunc;
const char *aName;
FuncType *bFunc;
const char *bName;
const char *testName;
};
extern void xyzSumAOS(float *a, int count, float *zeros, float *result);
extern void xyzSumSOA(float *a, int count, float *zeros, float *result);
static void
lInitData(float *ptr, int count) {
for (int i = 0; i < count; ++i)
ptr[i] = float(i) / (1024.f * 1024.f);
}
static PerfTest tests[] = {
{ xyzSumAOS, "serial", ispc::xyzSumAOS, "ispc", "AOS vector element sum (with coalescing)" },
{ xyzSumAOS, "serial", ispc::xyzSumAOSStdlib, "ispc", "AOS vector element sum (stdlib swizzle)" },
{ xyzSumAOS, "serial", ispc::xyzSumAOSNoCoalesce, "ispc", "AOS vector element sum (no coalescing)" },
{ xyzSumSOA, "serial", ispc::xyzSumSOA, "ispc", "SOA vector element sum" },
{ ispc::gathers, "gather", ispc::loads, "vector load", "Memory reads" },
{ ispc::scatters, "scatter", ispc::stores, "vector store", "Memory writes" },
};
int main() {
int count = 3*64*1024;
float *a = new float[count];
float zeros[32] = { 0 };
int nTests = sizeof(tests) / sizeof(tests[0]);
for (int i = 0; i < nTests; ++i) {
lInitData(a, count);
reset_and_start_timer();
float resultA[3] = { 0, 0, 0 };
for (int j = 0; j < 100; ++j)
tests[i].aFunc(a, count, zeros, resultA);
double aTime = get_elapsed_mcycles();
lInitData(a, count);
reset_and_start_timer();
float resultB[3] = { 0, 0, 0 };
for (int j = 0; j < 100; ++j)
tests[i].bFunc(a, count, zeros, resultB);
double bTime = get_elapsed_mcycles();
printf("%-40s: [%.2f] M cycles %s, [%.2f] M cycles %s (%.2fx speedup).\n",
tests[i].testName, aTime, tests[i].aName, bTime, tests[i].bName,
aTime/bTime);
#if 0
printf("\t(%f %f %f) - (%f %f %f)\n", resultSerial[0], resultSerial[1],
resultSerial[2], resultISPC[0], resultISPC[1], resultISPC[2]);
#endif
}
return 0;
}


@@ -0,0 +1,170 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
export void xyzSumAOS(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float xsum = 0, ysum = 0, zsum = 0;
foreach (i = 0 ... count/3) {
float x = array[3*i];
float y = array[3*i+1];
float z = array[3*i+2];
xsum += x;
ysum += y;
zsum += z;
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void xyzSumAOSStdlib(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float xsum = 0, ysum = 0, zsum = 0;
for (uniform int i = 0; i < 64*1024 /*count/3*/; i += programCount) {
float x, y, z;
aos_to_soa3(&array[3*i], &x, &y, &z);
xsum += x;
ysum += y;
zsum += z;
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void xyzSumAOSNoCoalesce(uniform float array[], uniform int count,
uniform float zerosArray[], uniform float result[]) {
int zeros = zerosArray[programIndex];
float xsum = 0, ysum = 0, zsum = 0;
foreach (i = 0 ... count/3) {
float x = array[3*i+zeros];
float y = array[3*i+1+zeros];
float z = array[3*i+2+zeros];
xsum += x;
ysum += y;
zsum += z;
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void xyzSumSOA(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float xsum = 0, ysum = 0, zsum = 0;
uniform float * uniform ap = array;
assert(programCount <= 8);
for (uniform int i = 0; i < count/3; i += 8, ap += 24) {
for (uniform int j = 0; j < 8; j += programCount) {
float x = ap[j + programIndex];
float y = ap[8 + j + programIndex];
float z = ap[16 + j + programIndex];
xsum += x;
ysum += y;
zsum += z;
}
}
result[0] = reduce_add(xsum);
result[1] = reduce_add(ysum);
result[2] = reduce_add(zsum);
}
export void gathers(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float sum = 0;
int zero = zeros[programIndex];
foreach (i = 0 ... count)
sum += array[i + zero];
result[0] = reduce_add(sum);
}
export void loads(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
float sum = 0;
foreach (i = 0 ... count)
sum += array[i];
result[0] = reduce_add(sum);
}
export void scatters(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
int zero = zeros[programIndex];
foreach (i = 0 ... count)
array[i + zero] = zero;
}
export void stores(uniform float array[], uniform int count,
uniform float zeros[], uniform float result[]) {
int zero = zeros[programIndex];
foreach (i = 0 ... count)
array[i] = zero;
}
export void normalizeAOSNoCoalesce(uniform float array[], uniform int count,
uniform float zeroArray[]) {
float zeros = zeroArray[programIndex];
foreach (i = 0 ... count/3) {
float x = array[3*i+zeros];
float y = array[3*i+1+zeros];
float z = array[3*i+2+zeros];
float l2 = x*x + y*y + z*z;
array[3*i] /= l2;
array[3*i+1] /= l2;
array[3*i+2] /= l2;
}
}
export void normalizeSOA(uniform float array[], uniform int count,
uniform float zeros[]) {
foreach (i = 0 ... count/3) {
float x = array[3*i];
float y = array[3*i+1];
float z = array[3*i+2];
float l2 = x*x + y*y + z*z;
array[3*i] /= l2;
array[3*i+1] /= l2;
array[3*i+2] /= l2;
}
}


@@ -0,0 +1,175 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{d923bb7e-a7c8-4850-8fcf-0eb9ce35b4e8}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>perfbench</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="perfbench.cpp" />
<ClCompile Include="perfbench_serial.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="perfbench.ispc">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>


@@ -0,0 +1,61 @@
/*
Copyright (c) 2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <math.h>
void
xyzSumAOS(float *a, int count, float *zeros, float *result) {
float xsum = 0, ysum = 0, zsum = 0;
for (int i = 0; i < count; i += 3) {
xsum += a[i];
ysum += a[i+1];
zsum += a[i+2];
}
result[0] = xsum;
result[1] = ysum;
result[2] = zsum;
}
void
xyzSumSOA(float *a, int count, float *zeros, float *result) {
float xsum = 0, ysum = 0, zsum = 0;
for (int i = 0; i < count/3; ++i) {
float *p = a + (i >> 3) * 24 + (i & 7);
xsum += p[0];
ysum += p[8];
zsum += p[16];
}
result[0] = xsum;
result[1] = ysum;
result[2] = zsum;
}
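The indexing above assumes a hybrid SOA layout with blocks of eight points: each block stores 8 x values, then 8 y values, then 8 z values (24 floats total), matching the ispc xyzSumSOA kernel earlier in the diff. A minimal standalone sketch of that addressing (hypothetical helper, for illustration only):

#include <cstddef>

// For point i in an SOA-of-8 layout: block = i / 8, lane = i % 8, with
// x at a[block*24 + lane], y at a[block*24 + 8 + lane], z at a[block*24 + 16 + lane].
inline void soa8_xyz(const float *a, std::size_t i, float &x, float &y, float &z) {
    const float *p = a + (i >> 3) * 24 + (i & 7);
    x = p[0];
    y = p[8];
    z = p[16];
}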


@@ -43,17 +43,17 @@ struct Ray {
};
struct Triangle {
uniform float p[3][4];
uniform int id;
uniform int pad[3];
float p[3][4];
int id;
int pad[3];
};
struct LinearBVHNode {
uniform float bounds[2][3];
uniform unsigned int offset; // num primitives for leaf, second child for interior
uniform unsigned int8 nPrimitives;
uniform unsigned int8 splitAxis;
uniform unsigned int16 pad;
float bounds[2][3];
unsigned int offset; // num primitives for leaf, second child for interior
unsigned int8 nPrimitives;
unsigned int8 splitAxis;
unsigned int16 pad;
};
static inline float3 Cross(const float3 v1, const float3 v2) {
@@ -88,9 +88,12 @@ static void generateRay(uniform const float raster2camera[4][4],
camy /= camw;
camz /= camw;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
camera2world[2][2] * camz;
ray.origin.x = camera2world[0][3] / camera2world[3][3];
ray.origin.y = camera2world[1][3] / camera2world[3][3];
@@ -143,7 +146,7 @@ static bool BBoxIntersect(const uniform float bounds[2][3],
static bool TriIntersect(const Triangle &tri, Ray &ray) {
static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
@@ -183,8 +186,8 @@ static bool TriIntersect(const Triangle &tri, Ray &ray) {
}
bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
Ray &r) {
bool BVHIntersect(const uniform LinearBVHNode nodes[],
const uniform Triangle tris[], Ray &r) {
Ray ray = r;
bool hit = false;
// Follow ray through BVH nodes to find primitive intersections
@@ -193,7 +196,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
while (true) {
// Check ray against BVH node
LinearBVHNode node = nodes[nodeNum];
uniform LinearBVHNode node = nodes[nodeNum];
if (any(BBoxIntersect(node.bounds, ray))) {
uniform unsigned int nPrimitives = node.nPrimitives;
if (nPrimitives > 0) {
@@ -239,8 +242,8 @@ static void raytrace_tile(uniform int x0, uniform int x1,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
const uniform LinearBVHNode nodes[],
const uniform Triangle triangles[]) {
uniform float widthScale = (float)(baseWidth) / (float)(width);
uniform float heightScale = (float)(baseHeight) / (float)(height);
@@ -262,8 +265,8 @@ export void raytrace_ispc(uniform int width, uniform int height,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
const uniform LinearBVHNode nodes[],
const uniform Triangle triangles[]) {
raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
raster2camera, camera2world, image,
id, nodes, triangles);
@@ -275,8 +278,8 @@ task void raytrace_tile_task(uniform int width, uniform int height,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
const uniform LinearBVHNode nodes[],
const uniform Triangle triangles[]) {
uniform int dx = 16, dy = 16; // must match dx, dy below
uniform int xBuckets = (width + (dx-1)) / dx;
uniform int x0 = (taskIndex % xBuckets) * dx;
@@ -295,14 +298,14 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
const uniform LinearBVHNode nodes[],
const uniform Triangle triangles[]) {
uniform int dx = 16, dy = 16;
uniform int xBuckets = (width + (dx-1)) / dx;
uniform int yBuckets = (height + (dy-1)) / dy;
uniform int nTasks = xBuckets * yBuckets;
launch[nTasks] < raytrace_tile_task(width, height, baseWidth, baseHeight,
raster2camera, camera2world,
image, id, nodes, triangles) >;
launch[nTasks] raytrace_tile_task(width, height, baseWidth, baseHeight,
raster2camera, camera2world,
image, id, nodes, triangles);
}


@@ -123,9 +123,12 @@ static void generateRay(const float raster2camera[4][4],
camy /= camw;
camz /= camw;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy +
camera2world[0][2] * camz;
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy +
camera2world[1][2] * camz;
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy +
camera2world[2][2] * camz;
ray.origin.x = camera2world[0][3] / camera2world[3][3];
ray.origin.y = camera2world[1][3] / camera2world[3][3];


@@ -88,11 +88,11 @@ loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
// Parallelize across cores as well: each task will work on a slice
// of 1 in the z extent of the volume.
if ((t & 1) == 0)
launch[z1-z0] < stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd) >;
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aeven, Aodd);
else
launch[z1-z0] < stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven) >;
launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz,
coef, vsq, Aodd, Aeven);
// We need to wait for all of the launched tasks to finish before
// starting the next iteration.


@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -40,21 +40,68 @@
Runtime Requirements" for information about the task-related entrypoints
that are implemented here.
There are three task systems in this file: one built using Microsoft's
Concurrency Runtime, one built with Apple's Grand Central Dispatch, and
one built on top of bare pthreads.
There are several task systems in this file, built using:
- Microsoft's Concurrency Runtime (ISPC_USE_CONCRT)
- Apple's Grand Central Dispatch (ISPC_USE_GCD)
- bare pthreads (ISPC_USE_PTHREADS, ISPC_USE_PTHREADS_FULLY_SUBSCRIBED)
- Cilk Plus (ISPC_USE_CILK)
- TBB (ISPC_USE_TBB_TASK_GROUP, ISPC_USE_TBB_PARALLEL_FOR)
- OpenMP (ISPC_USE_OMP)
The task system implementation can be selected at compile time by defining
the appropriate preprocessor symbol on the command line (e.g., -D ISPC_USE_TBB).
Not all combinations of platform and task system are meaningful.
If no task system is requested, a reasonable default task system for the platform
is selected. Here are the task systems that can be selected:
#define ISPC_USE_GCD
#define ISPC_USE_CONCRT
#define ISPC_USE_PTHREADS
#define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#define ISPC_USE_CILK
#define ISPC_USE_OMP
#define ISPC_USE_TBB_TASK_GROUP
#define ISPC_USE_TBB_PARALLEL_FOR
The ISPC_USE_PTHREADS_FULLY_SUBSCRIBED model essentially takes over the machine
by assigning one pthread to each hyper-thread, and then uses spinlocks and atomics
for task management. This model is useful for KNC where tasks can take over
the machine, but less so when there are other tasks that need running on the machine.
#define ISPC_USE_CREW
*/
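Since the block below only chooses a default when none of these symbols has already been defined, the selection is normally made with -D on the compiler command line, as the comment above says. As a minimal sketch, the same choice could also be pinned in a hypothetical wrapper translation unit (illustration only, not part of this file):

// select_tasksys.cpp -- hypothetical wrapper that fixes the task system choice.
// The usual route is simply -DISPC_USE_TBB_TASK_GROUP when compiling tasksys.cpp,
// plus linking against TBB.
#define ISPC_USE_TBB_TASK_GROUP
#include "tasksys.cpp"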
#if !(defined ISPC_USE_CONCRT || defined ISPC_USE_GCD || \
defined ISPC_USE_PTHREADS || defined ISPC_USE_PTHREADS_FULLY_SUBSCRIBED || \
defined ISPC_USE_TBB_TASK_GROUP || defined ISPC_USE_TBB_PARALLEL_FOR || \
defined ISPC_USE_OMP || defined ISPC_USE_CILK )
// If no task model chosen from the compiler cmdline, pick a reasonable default
#if defined(_WIN32) || defined(_WIN64)
#define ISPC_USE_CONCRT
#elif defined(__linux__)
#define ISPC_USE_PTHREADS
#elif defined(__APPLE__)
#define ISPC_USE_GCD
#endif
#if defined(__KNC__)
#define ISPC_USE_PTHREADS
#endif
#endif // No task model specified on compiler cmdline
#if defined(_WIN32) || defined(_WIN64)
#define ISPC_IS_WINDOWS
#define ISPC_USE_CONCRT
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#define ISPC_USE_PTHREADS
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#define ISPC_USE_GCD
#define ISPC_IS_APPLE
#endif
#if defined(__KNC__)
#define ISPC_IS_KNC
#endif
#define DBG(x)
@@ -83,9 +130,37 @@
#include <vector>
#include <algorithm>
#endif // ISPC_USE_PTHREADS
#ifdef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#include <pthread.h>
#include <semaphore.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <sys/sysctl.h>
#include <vector>
#include <algorithm>
//#include <stdexcept>
#include <stack>
#endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#ifdef ISPC_USE_TBB_PARALLEL_FOR
#include <tbb/parallel_for.h>
#endif // ISPC_USE_TBB_PARALLEL_FOR
#ifdef ISPC_USE_TBB_TASK_GROUP
#include <tbb/task_group.h>
#endif // ISPC_USE_TBB_TASK_GROUP
#ifdef ISPC_USE_CILK
#include <cilk/cilk.h>
#endif // ISPC_USE_CILK
#ifdef ISPC_USE_OMP
#include <omp.h>
#endif // ISPC_USE_OMP
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif // ISPC_IS_LINUX
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
@@ -107,6 +182,13 @@ struct TaskInfo {
#endif
};
// ispc expects these functions to have C linkage / not be mangled
extern "C" {
void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
void ISPCSync(void *handle);
}
///////////////////////////////////////////////////////////////////////////
// TaskGroupBase
@@ -181,7 +263,7 @@ inline TaskGroupBase::~TaskGroupBase() {
// Note: don't delete memBuffers[0], since it points to the start of
// the "mem" member!
for (int i = 1; i < NUM_MEM_BUFFERS; ++i)
delete[] memBuffers[i];
delete[](memBuffers[i]);
}
@@ -224,10 +306,10 @@ TaskGroupBase::GetTaskInfo(int index) {
inline void *
TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
char *basePtr = memBuffers[curMemBuffer];
int64_t iptr = (int64_t)(basePtr + curMemBufferOffset);
intptr_t iptr = (intptr_t)(basePtr + curMemBufferOffset);
iptr = (iptr + (alignment-1)) & ~(alignment-1);
int newOffset = int(iptr + size - (int64_t)basePtr);
int newOffset = int(iptr - (intptr_t)basePtr + size);
if (newOffset < memBufferSize[curMemBuffer]) {
curMemBufferOffset = newOffset;
return (char *)iptr;
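The int64_t-to-intptr_t change above is the usual align-up-on-a-pointer idiom. A self-contained sketch of the same computation (hypothetical helper, assuming the alignment is a power of two):

#include <cstdint>
#include <cstddef>

// Round ptr up to the next multiple of alignment (alignment must be a power of two).
static inline char *alignUp(char *ptr, std::size_t alignment) {
    std::uintptr_t p = reinterpret_cast<std::uintptr_t>(ptr);
    p = (p + (alignment - 1)) & ~static_cast<std::uintptr_t>(alignment - 1);
    return reinterpret_cast<char *>(p);
}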
@@ -249,14 +331,6 @@ TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
///////////////////////////////////////////////////////////////////////////
// Atomics and the like
#ifndef ISPC_IS_WINDOWS
static inline void
lMemFence() {
__asm__ __volatile__("mfence":::"memory");
}
#endif // !ISPC_IS_WINDOWS
#if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || defined(_WIN32)
#define ISPC_POINTER_BYTES 4
#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
@@ -266,6 +340,15 @@ lMemFence() {
#endif // __SIZEOF_POINTER__
static inline void
lMemFence() {
// Windows atomic functions already contain the fence
// KNC doesn't need the memory barrier
#if !defined ISPC_IS_KNC && !defined ISPC_IS_WINDOWS
__asm__ __volatile__("mfence":::"memory");
#endif
}
static void *
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
#ifdef ISPC_IS_WINDOWS
@@ -288,11 +371,11 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
#endif // ISPC_IS_WINDOWS
}
#ifndef ISPC_IS_WINDOWS
static int32_t
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
#ifdef ISPC_IS_WINDOWS
return InterlockedCompareExchange(v, newValue, oldValue);
#else
int32_t result;
__asm__ __volatile__("lock\ncmpxchgl %2,%1"
: "=a"(result), "=m"(*v)
@@ -300,9 +383,22 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue)
: "memory");
lMemFence();
return result;
#endif // ISPC_IS_WINDOWS
}
#endif // !ISPC_IS_WINDOWS
static inline int32_t
lAtomicAdd(volatile int32_t *v, int32_t delta) {
#ifdef ISPC_IS_WINDOWS
return InterlockedAdd(v, delta);
#else
int32_t origValue;
__asm__ __volatile__("lock\n"
"xaddl %0,%1"
: "=r"(origValue), "=m"(*v) : "0"(delta)
: "memory");
return origValue;
#endif
}
///////////////////////////////////////////////////////////////////////////
@@ -366,6 +462,50 @@ private:
#endif // ISPC_USE_PTHREADS
#ifdef ISPC_USE_CILK
class TaskGroup : public TaskGroupBase {
public:
void Launch(int baseIndex, int count);
void Sync();
};
#endif // ISPC_USE_CILK
#ifdef ISPC_USE_OMP
class TaskGroup : public TaskGroupBase {
public:
void Launch(int baseIndex, int count);
void Sync();
};
#endif // ISPC_USE_OMP
#ifdef ISPC_USE_TBB_PARALLEL_FOR
class TaskGroup : public TaskGroupBase {
public:
void Launch(int baseIndex, int count);
void Sync();
};
#endif // ISPC_USE_TBB_PARALLEL_FOR
#ifdef ISPC_USE_TBB_TASK_GROUP
class TaskGroup : public TaskGroupBase {
public:
void Launch(int baseIndex, int count);
void Sync();
private:
tbb::task_group tbbTaskGroup;
};
#endif // ISPC_USE_TBB_TASK_GROUP
///////////////////////////////////////////////////////////////////////////
// Grand Central Dispatch
@@ -487,18 +627,6 @@ static pthread_mutex_t taskSysMutex;
static std::vector<TaskGroup *> activeTaskGroups;
static sem_t *workerSemaphore;
static inline int32_t
lAtomicAdd(int32_t *v, int32_t delta) {
int32_t origValue;
__asm__ __volatile__("lock\n"
"xaddl %0,%1"
: "=r"(origValue), "=m"(*v) : "0"(delta)
: "memory");
return origValue;
}
static void *
lTaskEntry(void *arg) {
int threadIndex = (int)((int64_t)arg);
@@ -724,11 +852,15 @@ TaskGroup::Sync() {
exit(1);
}
// FIXME: We basically end up busy-waiting here, which is
// extra wasteful in a world with hyperthreading. It would
// extra wasteful in a world with hyper-threading. It would
// be much better to put this thread to sleep on a
// condition variable that was signaled when the last task
// in this group was finished.
sleep(0);
#ifndef ISPC_IS_KNC
usleep(1);
#else
_mm_delay_32(8);
#endif
continue;
}
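The FIXME above describes replacing this busy-wait with a sleep on a condition variable that is signaled when the last task in the group completes. A rough sketch of that pattern (hypothetical, not what the file implements):

#include <condition_variable>
#include <mutex>

// Each task group would hold one of these; workers call taskDone() as they
// finish, and Sync() calls waitAll() instead of spinning.
struct WaitableCounter {
    std::mutex m;
    std::condition_variable cv;
    int remaining = 0;

    void taskDone() {
        std::lock_guard<std::mutex> lock(m);
        if (--remaining == 0)
            cv.notify_all();
    }
    void waitAll() {
        std::unique_lock<std::mutex> lock(m);
        cv.wait(lock, [this] { return remaining == 0; });
    }
};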
@@ -772,6 +904,124 @@ TaskGroup::Sync() {
#endif // ISPC_USE_PTHREADS
///////////////////////////////////////////////////////////////////////////
// Cilk Plus
#ifdef ISPC_USE_CILK
static void
InitTaskSystem() {
// No initialization needed
}
inline void
TaskGroup::Launch(int baseIndex, int count) {
cilk_for(int i = 0; i < count; i++) {
TaskInfo *ti = GetTaskInfo(baseIndex + i);
// Actually run the task.
// Cilk does not expose the task -> thread mapping so we pretend it's 1:1
ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount);
}
}
inline void
TaskGroup::Sync() {
}
#endif // ISPC_USE_CILK
///////////////////////////////////////////////////////////////////////////
// OpenMP
#ifdef ISPC_USE_OMP
static void
InitTaskSystem() {
// No initialization needed
}
inline void
TaskGroup::Launch(int baseIndex, int count) {
#pragma omp parallel for
for(int i = 0; i < count; i++) {
TaskInfo *ti = GetTaskInfo(baseIndex + i);
// Actually run the task.
int threadIndex = omp_get_thread_num();
int threadCount = omp_get_num_threads();
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
}
}
inline void
TaskGroup::Sync() {
}
#endif // ISPC_USE_OMP
///////////////////////////////////////////////////////////////////////////
// Thread Building Blocks
#ifdef ISPC_USE_TBB_PARALLEL_FOR
static void
InitTaskSystem() {
// No initialization needed by default
//tbb::task_scheduler_init();
}
inline void
TaskGroup::Launch(int baseIndex, int count) {
tbb::parallel_for(0, count, [=](int i) {
TaskInfo *ti = GetTaskInfo(baseIndex + i);
// Actually run the task.
// TBB does not expose the task -> thread mapping so we pretend it's 1:1
int threadIndex = ti->taskIndex;
int threadCount = ti->taskCount;
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
});
}
inline void
TaskGroup::Sync() {
}
#endif // ISPC_USE_TBB_PARALLEL_FOR
#ifdef ISPC_USE_TBB_TASK_GROUP
static void
InitTaskSystem() {
// No initialization needed by default
//tbb::task_scheduler_init();
}
inline void
TaskGroup::Launch(int baseIndex, int count) {
for (int i = 0; i < count; i++) {
tbbTaskGroup.run([=]() {
TaskInfo *ti = GetTaskInfo(baseIndex + i);
// TBB does not expose the task -> thread mapping so we pretend it's 1:1
int threadIndex = ti->taskIndex;
int threadCount = ti->taskCount;
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
});
}
}
inline void
TaskGroup::Sync() {
tbbTaskGroup.wait();
}
#endif // ISPC_USE_TBB_TASK_GROUP
///////////////////////////////////////////////////////////////////////////
#ifndef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#define MAX_FREE_TASK_GROUPS 64
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
@@ -783,7 +1033,6 @@ AllocTaskGroup() {
if (tg != NULL) {
void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
if (ptr != NULL) {
assert(ptr == tg);
return (TaskGroup *)ptr;
}
}
@@ -810,13 +1059,6 @@ FreeTaskGroup(TaskGroup *tg) {
///////////////////////////////////////////////////////////////////////////
// ispc expects these functions to have C linkage / not be mangled
extern "C" {
void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
void ISPCSync(void *handle);
}
void
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
TaskGroup *taskGroup;
@@ -863,3 +1105,250 @@ ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) {
return taskGroup->AllocMemory(size, alignment);
}
#else // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#define MAX_LIVE_TASKS 1024
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
// Small structure used to hold the data for each task
struct Task {
public:
TaskFuncType func;
void *data;
volatile int32_t taskIndex;
int taskCount;
volatile int numDone;
int liveIndex; // index in live task queue
inline int noMoreWork() { return taskIndex >= taskCount; }
/*! given thread is done working on this task --> decrease num locks */
// inline void lock() { lAtomicAdd(&locks,1); }
// inline void unlock() { lAtomicAdd(&locks,-1); }
inline int nextJob() { return lAtomicAdd(&taskIndex,1); }
inline int numJobs() { return taskCount; }
inline void schedule(int idx) { taskIndex = 0; numDone = 0; liveIndex = idx; }
inline void run(int idx, int threadIdx);
inline void markOneDone() { lAtomicAdd(&numDone,1); }
inline void wait()
{
while (!noMoreWork()) {
int next = nextJob();
if (next < numJobs()) run(next, 0);
}
while (numDone != taskCount) {
#ifndef ISPC_IS_KNC
usleep(1);
#else
_mm_delay_32(8);
#endif
}
}
};
///////////////////////////////////////////////////////////////////////////
class TaskSys {
static int numThreadsRunning;
struct LiveTask
{
volatile int locks; /*!< num locks on this task. gets
initialized to NUM_THREADS+1, then counted
down by every thread that sees this. this
value is only valid when 'active' is set
to true */
volatile int active; /*! workers will spin on this until it
becomes active */
Task *task;
inline void doneWithThis() { lAtomicAdd(&locks,-1); }
LiveTask() : active(0), locks(-1) {}
};
public:
volatile int nextScheduleIndex; /*! next index in the task queue
where we'll insert a live task */
// inline int inc_begin() { int old = begin; begin = (begin+1)%MAX_TASKS; return old; }
// inline int inc_end() { int old = end; end = (end+1)%MAX_TASKS; return old; }
LiveTask taskQueue[MAX_LIVE_TASKS];
std::stack<Task *> taskMem;
static TaskSys *global;
TaskSys() : nextScheduleIndex(0)
{
TaskSys::global = this;
Task *mem = new Task[MAX_LIVE_TASKS]; //< could actually be more than _live_ tasks
for (int i=0;i<MAX_LIVE_TASKS;i++) {
taskMem.push(mem+i);
}
createThreads();
}
inline Task *allocOne()
{
pthread_mutex_lock(&mutex);
if (taskMem.empty()) {
fprintf(stderr, "Too many live tasks. "
"Change the value of MAX_LIVE_TASKS and recompile.\n");
exit(1);
}
Task *task = taskMem.top();
taskMem.pop();
pthread_mutex_unlock(&mutex);
return task;
}
static inline void init()
{
if (global) return;
pthread_mutex_lock(&mutex);
if (global == NULL) global = new TaskSys;
pthread_mutex_unlock(&mutex);
}
void createThreads();
int nThreads;
pthread_t *thread;
void threadFct();
inline void schedule(Task *t)
{
pthread_mutex_lock(&mutex);
int liveIndex = nextScheduleIndex;
nextScheduleIndex = (nextScheduleIndex+1)%MAX_LIVE_TASKS;
if (taskQueue[liveIndex].active) {
fprintf(stderr, "Out of task queue resources. "
"Change the value of MAX_LIVE_TASKS and recompile.\n");
exit(1);
}
taskQueue[liveIndex].task = t;
t->schedule(liveIndex);
taskQueue[liveIndex].locks = numThreadsRunning+1; // num _worker_ threads plus creator
taskQueue[liveIndex].active = true;
pthread_mutex_unlock(&mutex);
}
void sync(Task *task)
{
task->wait();
int liveIndex = task->liveIndex;
while (taskQueue[liveIndex].locks > 1) {
#ifndef ISPC_IS_KNC
usleep(1);
#else
_mm_delay_32(8);
#endif
}
_mm_free(task->data);
pthread_mutex_lock(&mutex);
taskMem.push(task); // recycle task index
taskQueue[liveIndex].active = false;
pthread_mutex_unlock(&mutex);
}
};
void TaskSys::threadFct()
{
int myIndex = 0; //lAtomicAdd(&threadIdx,1);
while (1) {
while (!taskQueue[myIndex].active) {
#ifndef ISPC_IS_KNC
usleep(4);
#else
_mm_delay_32(32);
#endif
continue;
}
Task *mine = taskQueue[myIndex].task;
while (!mine->noMoreWork()) {
int job = mine->nextJob();
if (job >= mine->numJobs()) break;
mine->run(job,myIndex);
}
taskQueue[myIndex].doneWithThis();
myIndex = (myIndex+1)%MAX_LIVE_TASKS;
}
}
inline void Task::run(int idx, int threadIdx) {
(*this->func)(data,threadIdx,TaskSys::global->nThreads,idx,taskCount);
markOneDone();
}
void *_threadFct(void *data) {
((TaskSys*)data)->threadFct();
return NULL;
}
void TaskSys::createThreads()
{
init();
int reserved = 4;
int minid = 2;
nThreads = sysconf(_SC_NPROCESSORS_ONLN) - reserved;
thread = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
numThreadsRunning = 0;
for (int i = 0; i < nThreads; ++i) {
pthread_attr_t attr;
pthread_attr_init(&attr);
pthread_attr_setstacksize(&attr, 2*1024 * 1024);
int threadID = minid+i;
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(threadID,&cpuset);
int ret = pthread_attr_setaffinity_np(&attr,sizeof(cpuset),&cpuset);
int err = pthread_create(&thread[i], &attr, &_threadFct, this);
++numThreadsRunning;
if (err != 0) {
fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
exit(1);
}
}
}
TaskSys * TaskSys::global = NULL;
int TaskSys::numThreadsRunning = 0;
///////////////////////////////////////////////////////////////////////////
void ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count)
{
Task *ti = *(Task**)taskGroupPtr;
ti->func = (TaskFuncType)func;
ti->data = data;
ti->taskIndex = 0;
ti->taskCount = count;
TaskSys::global->schedule(ti);
}
void ISPCSync(void *h)
{
Task *task = (Task *)h;
assert(task);
TaskSys::global->sync(task);
}
void *ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment)
{
TaskSys::init();
Task *task = TaskSys::global->allocOne();
*taskGroupPtr = task;
task->data = _mm_malloc(size,alignment);
return task->data;//*taskGroupPtr;
}
#endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED


@@ -43,9 +43,15 @@ extern "C" {
#endif /* __cplusplus */
__inline__ uint64_t rdtsc() {
uint32_t low, high;
#ifdef __x86_64
__asm__ __volatile__ (
"xorl %%eax,%%eax \n cpuid"
::: "%rax", "%rbx", "%rcx", "%rdx" );
#else
__asm__ __volatile__ (
"xorl %%eax,%%eax \n cpuid"
::: "%eax", "%ebx", "%ecx", "%edx" );
#endif
__asm__ __volatile__ (
"rdtsc" : "=a" (low), "=d" (high));
return (uint64_t)high << 32 | low;
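The cpuid instruction before rdtsc serializes the pipeline so the timestamp is not read before earlier instructions retire. A minimal usage sketch (hypothetical example; in practice you would just include this header, whose helpers such as get_elapsed_mcycles() build on the same primitive):

#include <cstdint>
#include <cstdio>

extern "C" uint64_t rdtsc();   // declaration standing in for the inline rdtsc() above

void timeRegion() {
    uint64_t start = rdtsc();
    // ... code under measurement ...
    uint64_t cycles = rdtsc() - start;
    printf("%llu cycles\n", (unsigned long long)cycles);
}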


@@ -336,6 +336,6 @@ volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
// Launch tasks to work on (dx,dy)-sized tiles of the image
uniform int dx = 8, dy = 8;
uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world,
width, height, image) >;
launch[nTasks] volume_task(density, nVoxels, raster2camera, camera2world,
width, height, image);
}

expr.cpp (3371 changed lines; diff suppressed because it is too large)

expr.h (71 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -284,6 +284,10 @@ public:
int EstimateCost() const;
Expr *baseExpr, *index;
private:
mutable const Type *type;
mutable const PointerType *lvalueType;
};
@@ -299,7 +303,6 @@ public:
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
const Type *GetType() const;
const Type *GetLValueType() const;
Symbol *GetBaseSymbol() const;
void Print() const;
Expr *Optimize();
@@ -321,6 +324,9 @@ public:
member is found. (i.e. this is true if the MemberExpr was a '->'
operator, and is false if it was a '.' operator. */
bool dereferenceExpr;
protected:
mutable const Type *type, *lvalueType;
};
@@ -531,26 +537,48 @@ public:
};
/** @brief Expression that represents dereferencing a reference to get its
value. */
class DereferenceExpr : public Expr {
/** @brief Common base class that provides shared functionality for
PtrDerefExpr and RefDerefExpr. */
class DerefExpr : public Expr {
public:
DereferenceExpr(Expr *e, SourcePos p);
DerefExpr(Expr *e, SourcePos p);
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
const Type *GetType() const;
const Type *GetLValueType() const;
Symbol *GetBaseSymbol() const;
void Print() const;
Expr *TypeCheck();
Expr *Optimize();
int EstimateCost() const;
Expr *expr;
};
/** @brief Expression that represents dereferencing a pointer to get its
value. */
class PtrDerefExpr : public DerefExpr {
public:
PtrDerefExpr(Expr *e, SourcePos p);
const Type *GetType() const;
void Print() const;
Expr *TypeCheck();
int EstimateCost() const;
};
/** @brief Expression that represents dereferencing a reference to get its
value. */
class RefDerefExpr : public DerefExpr {
public:
RefDerefExpr(Expr *e, SourcePos p);
const Type *GetType() const;
void Print() const;
Expr *TypeCheck();
int EstimateCost() const;
};
/** Expression that represents taking the address of an expression. */
class AddressOfExpr : public Expr {
public:
@@ -563,6 +591,7 @@ public:
Expr *TypeCheck();
Expr *Optimize();
int EstimateCost() const;
llvm::Constant *GetConstant(const Type *type) const;
Expr *expr;
};
@@ -630,20 +659,26 @@ public:
function overloading, this method resolves which actual function
the arguments match best. If the argCouldBeNULL parameter is
non-NULL, each element indicates whether the corresponding argument
is the number zero, indicating that it could be a NULL pointer.
This parameter may be NULL (for cases where overload resolution is
being done just given type information without the parameter
argument expressions being available. It returns true on success.
is the number zero, indicating that it could be a NULL pointer, and
if argIsConstant is non-NULL, each element indicates whether the
corresponding argument is a compile-time constant value. Both of
these parameters may be NULL (for cases where overload resolution
is being done just given type information without the parameter
argument expressions being available). This function returns true
on success.
*/
bool ResolveOverloads(SourcePos argPos,
const std::vector<const Type *> &argTypes,
const std::vector<bool> *argCouldBeNULL = NULL);
const std::vector<bool> *argCouldBeNULL = NULL,
const std::vector<bool> *argIsConstant = NULL);
Symbol *GetMatchingFunction();
private:
bool tryResolve(int (*matchFunc)(const Type *, const Type *),
SourcePos argPos, const std::vector<const Type *> &argTypes,
const std::vector<bool> *argCouldBeNULL);
std::vector<Symbol *> getCandidateFunctions(int argCount) const;
static int computeOverloadCost(const FunctionType *ftype,
const std::vector<const Type *> &argTypes,
const std::vector<bool> *argCouldBeNULL,
const std::vector<bool> *argIsConstant);
/** Name of the function that is being called. */
std::string name;

func.cpp (140 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -46,12 +46,21 @@
#include "util.h"
#include <stdio.h>
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
#include <llvm/DerivedTypes.h>
#include <llvm/Instructions.h>
#include <llvm/Intrinsics.h>
#if defined(LLVM_3_1) || defined(LLVM_3_2)
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
#include <llvm/Instructions.h>
#include <llvm/Intrinsics.h>
#include <llvm/DerivedTypes.h>
#else
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Type.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/DerivedTypes.h>
#endif
#include <llvm/PassManager.h>
#include <llvm/PassRegistry.h>
#include <llvm/Transforms/IPO.h>
@@ -59,16 +68,14 @@
#include <llvm/Support/FileUtilities.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/Target/TargetOptions.h>
#include <llvm/Target/TargetData.h>
#include <llvm/PassManager.h>
#include <llvm/Analysis/Verifier.h>
#include <llvm/Support/CFG.h>
#include <llvm/Support/ToolOutputFile.h>
#include <llvm/Assembly/PrintModulePass.h>
Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
Function::Function(Symbol *s, Stmt *c) {
sym = s;
args = a;
code = c;
maskSymbol = m->symbolTable->LookupVariable("__mask");
@@ -101,12 +108,20 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
printf("\n\n\n");
}
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
for (unsigned int i = 0; i < args.size(); ++i)
if (dynamic_cast<const ReferenceType *>(args[i]->type) == NULL)
args[i]->parentFunction = this;
for (int i = 0; i < type->GetNumParameters(); ++i) {
const char *paramName = type->GetParameterName(i).c_str();
Symbol *sym = m->symbolTable->LookupVariable(paramName);
if (sym == NULL)
Assert(strncmp(paramName, "__anon_parameter_", 17) == 0);
args.push_back(sym);
const Type *t = type->GetParameterType(i);
if (sym != NULL && CastType<ReferenceType>(t) == NULL)
sym->parentFunction = this;
}
if (type->isTask) {
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
@@ -125,7 +140,7 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
const Type *
Function::GetReturnType() const {
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
return type->GetReturnType();
}
@@ -133,7 +148,7 @@ Function::GetReturnType() const {
const FunctionType *
Function::GetType() const {
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
return type;
}
@@ -145,7 +160,8 @@ Function::GetType() const {
'mem2reg' pass will in turn promote to SSA registers.
*/
static void
lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol *> &args,
lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const
std::vector<Symbol *> &args,
FunctionEmitContext *ctx) {
// We expect the argument structure to come in as a pointer to a
// structure. Confirm and figure out its type here.
@@ -157,9 +173,13 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
llvm::dyn_cast<const llvm::StructType>(pt->getElementType());
// Get the type of the argument we're copying in and its Symbol pointer
LLVM_TYPE_CONST llvm::Type *argType = argStructType->getElementType(i);
llvm::Type *argType = argStructType->getElementType(i);
Symbol *sym = args[i];
if (sym == NULL)
// anonymous parameter, so don't worry about it
return;
// allocate space to copy the parameter in to
sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());
@@ -170,7 +190,7 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
// memory
llvm::Value *ptrval = ctx->LoadInst(ptr, sym->name.c_str());
ctx->StoreInst(ptrval, sym->storagePtr);
ctx->EmitFunctionParameterDebugInfo(sym);
ctx->EmitFunctionParameterDebugInfo(sym, i);
}
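For orientation, a hedged sketch of the argument structure this helper indexes into, as implied by the per-element offsets here and the mask copy in emitCode() below; the example signature, member names, and mask representation are illustrative only:
// Hypothetical `task void f(uniform float a, uniform int b)`: the callee is
// assumed to receive a pointer to something laid out like this, with the
// caller's execution mask appended as the final member.
typedef int32_t IllustrativeMask[8];   // the real mask layout is target-dependent
struct f_task_args_t {
    float            a;     // element 0: first declared parameter
    int32_t          b;     // element 1: second declared parameter
    IllustrativeMask mask;  // element nArgs: execution mask (see emitCode below)
};
// lCopyInTaskParameter(i, structArgPtr, args, ctx) then loads element i and
// stores it into a fresh alloca so the parameter acts like a normal local.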
@@ -186,14 +206,14 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
// value
maskSymbol->storagePtr = ctx->GetFullMaskPointer();
// add debugging info for __mask, programIndex, ...
// add debugging info for __mask
maskSymbol->pos = firstStmtPos;
ctx->EmitVariableDebugInfo(maskSymbol);
#if 0
llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
#endif
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isTask == true) {
// For tasks, there should always be three parameters: the
@@ -211,13 +231,15 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
for (unsigned int i = 0; i < args.size(); ++i)
lCopyInTaskParameter(i, structParamPtr, args, ctx);
// Copy in the mask as well.
int nArgs = (int)args.size();
// The mask is the last parameter in the argument structure
llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
"task_struct_mask");
llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
ctx->SetFunctionMask(ptrval);
if (type->isUnmasked == false) {
// Copy in the mask as well.
int nArgs = (int)args.size();
// The mask is the last parameter in the argument structure
llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
"task_struct_mask");
llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
ctx->SetFunctionMask(ptrval);
}
// Copy threadIndex and threadCount into stack-allocated storage so
// that their symbols point to something reasonable.
@@ -240,13 +262,17 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
llvm::Function::arg_iterator argIter = function->arg_begin();
for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
Symbol *sym = args[i];
if (sym == NULL)
// anonymous function parameter
continue;
argIter->setName(sym->name.c_str());
// Allocate stack storage for the parameter and emit code
// to store its value there.
sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
ctx->StoreInst(argIter, sym->storagePtr);
ctx->EmitFunctionParameterDebugInfo(sym);
ctx->EmitFunctionParameterDebugInfo(sym, i);
}
// If the number of actual function arguments is equal to the
@@ -254,9 +280,13 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
// don't have a mask parameter, so set it to be all on. This
// happens for example with 'export'ed functions that the app
// calls.
if (argIter == function->arg_end())
if (argIter == function->arg_end()) {
Assert(type->isUnmasked || type->isExported);
ctx->SetFunctionMask(LLVMMaskAllOn);
}
else {
Assert(type->isUnmasked == false);
// Otherwise use the mask to set the entry mask value
argIter->setName("__mask");
Assert(argIter->getType() == LLVMTypes::MaskType);
@@ -279,21 +309,30 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
// on, all off, or mixed. If this is a simple function, then this
// isn't worth the code bloat / overhead.
bool checkMask = (type->isTask == true) ||
((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
(
#if defined(LLVM_3_1)
(function->hasFnAttr(llvm::Attribute::AlwaysInline) == false)
#elif defined(LLVM_3_2)
(function->getFnAttributes().hasAttribute(llvm::Attributes::AlwaysInline) == false)
#else // LLVM 3.3+
(function->getAttributes().getFnAttributes().hasAttribute(llvm::Attribute::AlwaysInline) == false)
#endif
&&
costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
checkMask &= (type->isUnmasked == false);
checkMask &= (g->target.maskingIsFree == false);
checkMask &= (g->opt.disableCoherentControlFlow == false);
if (checkMask) {
llvm::Value *mask = ctx->GetFunctionMask();
llvm::Value *allOn = ctx->All(mask);
llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on");
llvm::BasicBlock *bbNotAll = ctx->CreateBasicBlock("not_all_on");
llvm::BasicBlock *bbSomeOn = ctx->CreateBasicBlock("some_on");
// Set up basic blocks for goto targets
ctx->InitializeLabelMap(code);
ctx->BranchInst(bbAllOn, bbNotAll, allOn);
ctx->BranchInst(bbAllOn, bbSomeOn, allOn);
// all on: we've determined dynamically that the mask is all
// on. Set the current mask to "all on" explicitly so that
// codegen for this path can be improved with this knowledge in
@@ -305,23 +344,11 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
if (ctx->GetCurrentBasicBlock())
ctx->ReturnInst();
// not all on: figure out if no instances are running, or if
// some of them are
ctx->SetCurrentBasicBlock(bbNotAll);
ctx->SetFunctionMask(mask);
llvm::BasicBlock *bbNoneOn = ctx->CreateBasicBlock("none_on");
llvm::BasicBlock *bbSomeOn = ctx->CreateBasicBlock("some_on");
llvm::Value *anyOn = ctx->Any(mask);
ctx->BranchInst(bbSomeOn, bbNoneOn, anyOn);
// Everyone is off; get out of here.
ctx->SetCurrentBasicBlock(bbNoneOn);
ctx->ReturnInst();
// some on: reset the mask to the value it had at function
// entry and emit the code. Resetting the mask here is
// important, due to the "all on" setting of it for the path
// above
// not all on: however, at least one lane must be running,
// since we should never run with all off... some on: reset
// the mask to the value it had at function entry and emit the
// code. Resetting the mask here is important, due to the "all
// on" setting of it for the path above.
ctx->SetCurrentBasicBlock(bbSomeOn);
ctx->SetFunctionMask(mask);
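In plain C-like terms, a hedged sketch of the entry-point control flow emitted above, after this change collapses the old all_on/not_all/none_on/some_on chain down to two blocks (the helper names stand in for generated basic blocks, not real functions):
// Illustrative only; mirrors the bbAllOn / bbSomeOn blocks created above.
if (lanes_all_on(entry_mask)) {
    // all_on: codegen on this path may assume an all-on mask, enabling the
    // cheaper unmasked load/store sequences.
    run_body(ALL_ON_MASK);
}
else {
    // some_on: the function is assumed never to be entered with an all-off
    // mask, so at least one lane is live; restore the incoming mask and run.
    run_body(entry_mask);
}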
@@ -355,7 +382,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
// issue a warning. Also need to warn if it's the entry block for
// the function (in which case it will not have predecessors but is
// still reachable.)
if (type->GetReturnType() != AtomicType::Void &&
if (Type::Equal(type->GetReturnType(), AtomicType::Void) == false &&
(pred_begin(ec.bblock) != pred_end(ec.bblock) || (ec.bblock == entryBBlock)))
Warning(sym->pos, "Missing return statement in function returning \"%s\".",
type->rType->GetString().c_str());
@@ -415,19 +442,22 @@ Function::GenerateIR() {
// If the function is 'export'-qualified, emit a second version of
// it without a mask parameter and without name mangling so that
// the application can call it
const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isExported) {
if (!type->isTask) {
LLVM_TYPE_CONST llvm::FunctionType *ftype =
type->LLVMFunctionType(g->ctx);
llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true);
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
std::string functionName = sym->name;
if (g->mangleFunctionsWithTarget)
functionName += std::string("_") + g->target.GetISAString();
llvm::Function *appFunction =
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
#if defined(LLVM_3_1)
appFunction->setDoesNotThrow(true);
#else
appFunction->setDoesNotThrow();
#endif
if (appFunction->getName() != functionName) {
// this was a redefinition for which we already emitted an

func.h (4 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, Intel Corporation
Copyright (c) 2011-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -43,7 +43,7 @@
class Function {
public:
Function(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code);
Function(Symbol *sym, Stmt *code);
const Type *GetReturnType() const;
const FunctionType *GetType() const;

ispc.cpp (412 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -41,27 +41,41 @@
#include "llvmutil.h"
#include <stdio.h>
#ifdef ISPC_IS_WINDOWS
#include <windows.h>
#include <direct.h>
#define strcasecmp stricmp
#include <windows.h>
#include <direct.h>
#define strcasecmp stricmp
#else
#include <sys/types.h>
#include <unistd.h>
#endif
#if defined(LLVM_3_1) || defined(LLVM_3_2)
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Instructions.h>
#else
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Instructions.h>
#endif
#if defined(LLVM_3_1)
#include <llvm/Analysis/DebugInfo.h>
#include <llvm/Analysis/DIBuilder.h>
#else
#include <llvm/DebugInfo.h>
#include <llvm/DIBuilder.h>
#endif
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Analysis/DIBuilder.h>
#include <llvm/Analysis/DebugInfo.h>
#include <llvm/Support/Dwarf.h>
#include <llvm/Instructions.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/Target/TargetOptions.h>
#include <llvm/Target/TargetData.h>
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
#include <llvm/Support/TargetRegistry.h>
#include <llvm/Support/TargetSelect.h>
#else
#include <llvm/Target/TargetRegistry.h>
#include <llvm/Target/TargetSelect.h>
#include <llvm/Target/SubtargetFeature.h>
#if defined(LLVM_3_1)
#include <llvm/Target/TargetData.h>
#elif defined(LLVM_3_2)
#include <llvm/DataLayout.h>
#else // LLVM 3.3+
#include <llvm/IR/DataLayout.h>
#endif
#include <llvm/Support/TargetRegistry.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/Host.h>
Globals *g;
@@ -70,31 +84,124 @@ Module *m;
///////////////////////////////////////////////////////////////////////////
// Target
#ifndef ISPC_IS_WINDOWS
static void __cpuid(int info[4], int infoType) {
__asm__ __volatile__ ("cpuid"
: "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
: "0" (infoType));
}
/* Save %ebx in case it's the PIC register */
static void __cpuidex(int info[4], int level, int count) {
__asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
"cpuid\n\t"
"xchg{l}\t{%%}ebx, %1\n\t"
: "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
: "0" (level), "2" (count));
}
#endif // ISPC_IS_WINDOWS
static const char *
lGetSystemISA() {
int info[4];
__cpuid(info, 1);
if ((info[2] & (1 << 28)) != 0) { // AVX
// AVX1 for sure....
// Ivy Bridge?
if ((info[2] & (1 << 29)) != 0 && // F16C
(info[2] & (1 << 30)) != 0) { // RDRAND
// So far, so good. AVX2?
// Call cpuid with eax=7, ecx=0
int info2[4];
__cpuidex(info2, 7, 0);
if ((info2[1] & (1 << 5)) != 0)
return "avx2";
else
return "avx1.1";
}
// Regular AVX
return "avx";
}
else if ((info[2] & (1 << 19)) != 0)
return "sse4";
else if ((info[3] & (1 << 26)) != 0)
return "sse2";
else {
fprintf(stderr, "Unable to detect supported SSE/AVX ISA. Exiting.\n");
exit(1);
}
}
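For reference, the CPUID feature bits tested above; these are the standard Intel feature-flag positions, summarized here for readability rather than taken from this change:
// CPUID.(EAX=1): ECX bit 28 = AVX, ECX bit 29 = F16C, ECX bit 30 = RDRAND,
//                ECX bit 19 = SSE4.1, EDX bit 26 = SSE2.
// CPUID.(EAX=7, ECX=0): EBX bit 5 = AVX2.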
static const char *supportedCPUs[] = {
"atom", "penryn", "core2", "corei7", "corei7-avx"
#if defined(LLVM_3_2) || defined(LLVM_3_3)
, "core-avx-i", "core-avx2"
#endif // LLVM_3_2 or LLVM_3_3
};
bool
Target::GetTarget(const char *arch, const char *cpu, const char *isa,
bool pic, Target *t) {
if (isa == NULL) {
if (cpu != NULL) {
// If a CPU was specified explicitly, try to pick the best
// possible ISA based on that.
if (!strcmp(cpu, "core-avx2"))
isa = "avx2";
else if (!strcmp(cpu, "core-avx-i"))
isa = "avx1.1";
else if (!strcmp(cpu, "sandybridge") ||
!strcmp(cpu, "corei7-avx"))
isa = "avx";
else if (!strcmp(cpu, "corei7") ||
!strcmp(cpu, "penryn"))
isa = "sse4";
else
isa = "sse2";
Warning(SourcePos(), "No --target specified on command-line. "
"Using ISA \"%s\" based on specified CPU \"%s\".", isa,
cpu);
}
else {
// No CPU and no ISA, so use CPUID to figure out what this CPU
// supports.
isa = lGetSystemISA();
Warning(SourcePos(), "No --target specified on command-line. "
"Using system ISA \"%s\".", isa);
}
}
if (cpu == NULL) {
std::string hostCPU = llvm::sys::getHostCPUName();
if (hostCPU.size() > 0)
cpu = strdup(hostCPU.c_str());
else {
fprintf(stderr, "Warning: unable to determine host CPU!\n");
Warning(SourcePos(), "Unable to determine host CPU!\n");
cpu = "generic";
}
}
else {
bool foundCPU = false;
for (int i = 0; i < int(sizeof(supportedCPUs) / sizeof(supportedCPUs[0]));
++i) {
if (!strcmp(cpu, supportedCPUs[i])) {
foundCPU = true;
break;
}
}
if (foundCPU == false) {
fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: "
"%s.\n", cpu, SupportedTargetCPUs().c_str());
return false;
}
}
t->cpu = cpu;
if (isa == NULL) {
if (!strcasecmp(cpu, "atom"))
isa = "sse2";
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
else if (!strcasecmp(cpu, "sandybridge") ||
!strcasecmp(cpu, "corei7-avx"))
isa = "avx";
#endif // LLVM_3_0
else
isa = "sse4";
}
if (arch == NULL)
arch = "x86-64";
@@ -125,13 +232,16 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->arch = arch;
}
// This is the case for most of them
t->hasHalf = t->hasRand = t->hasTranscendentals = false;
t->hasGather = t->hasScatter = false;
if (!strcasecmp(isa, "sse2")) {
t->isa = Target::SSE2;
t->nativeVectorWidth = 4;
t->vectorWidth = 4;
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "sse2-x2")) {
@@ -140,7 +250,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->vectorWidth = 8;
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "sse4")) {
@@ -149,7 +258,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->vectorWidth = 4;
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
@@ -158,7 +266,6 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->vectorWidth = 8;
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "generic-4")) {
@@ -166,73 +273,136 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->nativeVectorWidth = 4;
t->vectorWidth = 4;
t->maskingIsFree = true;
t->allOffMaskIsSafe = true;
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-8")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->maskingIsFree = true;
t->allOffMaskIsSafe = true;
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-16")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 16;
t->vectorWidth = 16;
t->maskingIsFree = true;
t->allOffMaskIsSafe = true;
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-32")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 32;
t->vectorWidth = 32;
t->maskingIsFree = true;
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-64")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 64;
t->vectorWidth = 64;
t->maskingIsFree = true;
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-1")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 1;
t->vectorWidth = 1;
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
else if (!strcasecmp(isa, "avx")) {
else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1")) {
t->isa = Target::AVX;
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->attributes = "+avx,+popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
else if (!strcasecmp(isa, "avx-x2")) {
else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2")) {
t->isa = Target::AVX;
t->nativeVectorWidth = 8;
t->vectorWidth = 16;
t->attributes = "+avx,+popcnt,+cmov";
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
}
#endif // LLVM 3.0+
#if defined(LLVM_3_1svn)
else if (!strcasecmp(isa, "avx1.1")) {
t->isa = Target::AVX11;
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
t->maskingIsFree = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
#endif
}
else if (!strcasecmp(isa, "avx1.1-x2")) {
t->isa = Target::AVX11;
t->nativeVectorWidth = 8;
t->vectorWidth = 16;
t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
t->maskingIsFree = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
#endif
}
else if (!strcasecmp(isa, "avx2")) {
t->isa = Target::AVX2;
t->nativeVectorWidth = 8;
t->vectorWidth = 8;
t->attributes = "+avx2,+popcnt,+cmov,+f16c";
t->attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand"
#ifndef LLVM_3_1
",+fma"
#endif // !LLVM_3_1
;
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
t->hasGather = true;
#endif
}
else if (!strcasecmp(isa, "avx2-x2")) {
t->isa = Target::AVX2;
t->nativeVectorWidth = 16;
t->vectorWidth = 16;
t->attributes = "+avx2,+popcnt,+cmov,+f16c";
t->attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand"
#ifndef LLVM_3_1
",+fma"
#endif // !LLVM_3_1
;
t->maskingIsFree = false;
t->allOffMaskIsSafe = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
t->hasGather = true;
#endif
}
#endif // LLVM 3.1
else {
fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n",
isa, SupportedTargetISAs());
@@ -241,25 +411,31 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
if (!error) {
llvm::TargetMachine *targetMachine = t->GetTargetMachine();
#if defined(LLVM_3_1)
const llvm::TargetData *targetData = targetMachine->getTargetData();
t->is32Bit = (targetData->getPointerSize() == 4);
#else
int addressSpace = 0;
const llvm::DataLayout *dataLayout = targetMachine->getDataLayout();
t->is32Bit = (dataLayout->getPointerSize(addressSpace) == 4);
#endif
Assert(t->vectorWidth <= ISPC_MAX_NVEC);
}
return !error;
}
const char *
std::string
Target::SupportedTargetCPUs() {
return "atom, barcelona, core2, corei7, "
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
"corei7-avx, "
#endif
"istanbul, nocona, penryn, "
#ifdef LLVM_2_9
"sandybridge, "
#endif
"westmere";
std::string ret;
int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]);
for (int i = 0; i < count; ++i) {
ret += supportedCPUs[i];
if (i != count - 1)
ret += ", ";
}
return ret;
}
@@ -271,14 +447,9 @@ Target::SupportedTargetArchs() {
const char *
Target::SupportedTargetISAs() {
return "sse2, sse2-x2, sse4, sse4-x2"
#ifndef LLVM_2_9
", avx, avx-x2"
#endif // !LLVM_2_9
#ifdef LLVM_3_1svn
", avx2, avx2-x2"
#endif // LLVM_3_1svn
", generic-4, generic-8, generic-16, generic-1";
return "sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2"
", avx1.1, avx1.1-x2, avx2, avx2-x2"
", generic-1, generic-4, generic-8, generic-16, generic-32";
}
@@ -286,11 +457,7 @@ std::string
Target::GetTripleString() const {
llvm::Triple triple;
// Start with the host triple as the default
#if defined(LLVM_3_1) || defined(LLVM_3_1svn)
triple.setTriple(llvm::sys::getDefaultTargetTriple());
#else
triple.setTriple(llvm::sys::getHostTriple());
#endif
// And override the arch in the host triple based on what the user
// specified. Here we need to deal with the fact that LLVM uses one
@@ -315,30 +482,15 @@ Target::GetTargetMachine() const {
llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ :
llvm::Reloc::Default;
#if defined(LLVM_3_1svn)
std::string featuresString = attributes;
llvm::TargetOptions options;
if (g->opt.fastMath == true)
options.UnsafeFPMath = 1;
#if !defined(LLVM_3_1)
if (g->opt.disableFMA == false)
options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
#endif // !LLVM_3_1
llvm::TargetMachine *targetMachine =
target->createTargetMachine(triple, cpu, featuresString, options,
relocModel);
#elif defined(LLVM_3_0)
std::string featuresString = attributes;
llvm::TargetMachine *targetMachine =
target->createTargetMachine(triple, cpu, featuresString, relocModel);
#else // LLVM 2.9
#ifdef ISPC_IS_APPLE
relocModel = llvm::Reloc::PIC_;
#endif // ISPC_IS_APPLE
std::string featuresString = cpu + std::string(",") + attributes;
llvm::TargetMachine *targetMachine =
target->createTargetMachine(triple, featuresString);
#ifndef ISPC_IS_WINDOWS
targetMachine->setRelocationModel(relocModel);
#endif // !ISPC_IS_WINDOWS
#endif // LLVM_2_9
Assert(targetMachine != NULL);
targetMachine->setAsmVerbosityDefault(true);
@@ -355,6 +507,8 @@ Target::GetISAString() const {
return "sse4";
case Target::AVX:
return "avx";
case Target::AVX11:
return "avx11";
case Target::AVX2:
return "avx2";
case Target::GENERIC:
@@ -367,7 +521,7 @@ Target::GetISAString() const {
static bool
lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
lGenericTypeLayoutIndeterminate(llvm::Type *type) {
if (type->isPrimitiveType() || type->isIntegerTy())
return false;
@@ -376,18 +530,18 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
type == LLVMTypes::Int1VectorType)
return true;
LLVM_TYPE_CONST llvm::ArrayType *at =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
llvm::ArrayType *at =
llvm::dyn_cast<llvm::ArrayType>(type);
if (at != NULL)
return lGenericTypeLayoutIndeterminate(at->getElementType());
LLVM_TYPE_CONST llvm::PointerType *pt =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
llvm::PointerType *pt =
llvm::dyn_cast<llvm::PointerType>(type);
if (pt != NULL)
return false;
LLVM_TYPE_CONST llvm::StructType *st =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
llvm::StructType *st =
llvm::dyn_cast<llvm::StructType>(type);
if (st != NULL) {
for (int i = 0; i < (int)st->getNumElements(); ++i)
if (lGenericTypeLayoutIndeterminate(st->getElementType(i)))
@@ -395,29 +549,24 @@ lGenericTypeLayoutIndeterminate(LLVM_TYPE_CONST llvm::Type *type) {
return false;
}
Assert(llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(type));
Assert(llvm::isa<llvm::VectorType>(type));
return true;
}
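A hedged example of the case this predicate exists for, assuming the LLVMTypes helpers declared elsewhere in the tree (the struct assembled here is illustrative):
// Under the generic target the execution mask is an i1 vector whose storage
// size is left to the eventual backend, so an aggregate containing it has no
// fixed layout and SizeOf()/StructOffset() below must fall back to the
// GEP-on-null-pointer computation instead of consulting the DataLayout.
llvm::Type *members[] = { LLVMTypes::Int32Type, LLVMTypes::Int1VectorType };
llvm::StructType *st = llvm::StructType::get(*g->ctx, members);
// lGenericTypeLayoutIndeterminate(st) is expected to return true here.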
llvm::Value *
Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
Target::SizeOf(llvm::Type *type,
llvm::BasicBlock *insertAtEnd) {
if (isa == Target::GENERIC &&
lGenericTypeLayoutIndeterminate(type)) {
llvm::Value *index[1] = { LLVMInt32(1) };
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "sizeof_gep",
insertAtEnd);
#else
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, &index[0], &index[1],
"sizeof_gep", insertAtEnd);
#endif
if (is32Bit || g->opt.force32BitAddressing)
return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type,
"sizeof_int", insertAtEnd);
@@ -426,9 +575,18 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
"sizeof_int", insertAtEnd);
}
#if defined(LLVM_3_1)
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
Assert(td != NULL);
uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
uint64_t bitSize = td->getTypeSizeInBits(type);
#else
const llvm::DataLayout *dl = GetTargetMachine()->getDataLayout();
Assert(dl != NULL);
uint64_t bitSize = dl->getTypeSizeInBits(type);
#endif
Assert((bitSize % 8) == 0);
uint64_t byteSize = bitSize / 8;
if (is32Bit || g->opt.force32BitAddressing)
return LLVMInt32((int32_t)byteSize);
else
@@ -437,23 +595,18 @@ Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *
Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
Target::StructOffset(llvm::Type *type, int element,
llvm::BasicBlock *insertAtEnd) {
if (isa == Target::GENERIC &&
lGenericTypeLayoutIndeterminate(type) == true) {
llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(element) };
LLVM_TYPE_CONST llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::PointerType *ptrType = llvm::PointerType::get(type, 0);
llvm::Value *voidPtr = llvm::ConstantPointerNull::get(ptrType);
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, arrayRef, "offset_gep",
insertAtEnd);
#else
llvm::Instruction *gep =
llvm::GetElementPtrInst::Create(voidPtr, &indices[0], &indices[2],
"offset_gep", insertAtEnd);
#endif
if (is32Bit || g->opt.force32BitAddressing)
return new llvm::PtrToIntInst(gep, LLVMTypes::Int32Type,
"offset_int", insertAtEnd);
@@ -462,12 +615,22 @@ Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element,
"offset_int", insertAtEnd);
}
llvm::StructType *structType =
llvm::dyn_cast<llvm::StructType>(type);
if (structType == NULL || structType->isSized() == false) {
Assert(m->errorCount > 0);
return NULL;
}
#if defined(LLVM_3_1)
const llvm::TargetData *td = GetTargetMachine()->getTargetData();
Assert(td != NULL);
LLVM_TYPE_CONST llvm::StructType *structType =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
Assert(structType != NULL);
const llvm::StructLayout *sl = td->getStructLayout(structType);
#else
const llvm::DataLayout *dl = GetTargetMachine()->getDataLayout();
Assert(dl != NULL);
const llvm::StructLayout *sl = dl->getStructLayout(structType);
#endif
Assert(sl != NULL);
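// StructLayout::getElementOffset() returns the element's byte offset within
// the struct, with the target ABI's alignment padding already applied.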
uint64_t offset = sl->getElementOffset(element);
@@ -488,6 +651,8 @@ Opt::Opt() {
force32BitAddressing = true;
unrollLoops = true;
disableAsserts = false;
disableFMA = false;
forceAlignedMemory = false;
disableMaskAllOnOptimizations = false;
disableHandlePseudoMemoryOps = false;
disableBlendedMaskedStores = false;
@@ -497,6 +662,7 @@ Opt::Opt() {
disableMaskedStoreToStore = false;
disableGatherScatterFlattening = false;
disableUniformMemoryOptimizations = false;
disableCoalescing = false;
}
///////////////////////////////////////////////////////////////////////////
@@ -510,12 +676,16 @@ Globals::Globals() {
debugPrint = false;
disableWarnings = false;
warningsAsErrors = false;
quiet = false;
forceColoredOutput = false;
disableLineWrap = false;
emitPerfWarnings = true;
emitInstrumentation = false;
generateDebuggingSymbols = false;
enableFuzzTest = false;
fuzzTestSeed = -1;
mangleFunctionsWithTarget = false;
ctx = new llvm::LLVMContext;
#ifdef ISPC_IS_WINDOWS
@@ -548,7 +718,9 @@ llvm::DIFile
SourcePos::GetDIFile() const {
std::string directory, filename;
GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
return m->diBuilder->createFile(filename, directory);
llvm::DIFile ret = m->diBuilder->createFile(filename, directory);
Assert(ret.Verify());
return ret;
}

ispc.h (120 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -38,8 +38,10 @@
#ifndef ISPC_H
#define ISPC_H
#if !defined(LLVM_2_9) && !defined(LLVM_3_0) && !defined(LLVM_3_0svn) && !defined(LLVM_3_1svn)
#error "Only LLVM 2.9, 3.0, and the 3.1 development branch are supported"
#define ISPC_VERSION "1.3.1dev"
#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3)
#error "Only LLVM 3.1, 3.2 and the 3.3 development branch are supported"
#endif
#if defined(_WIN32) || defined(_WIN64)
@@ -49,6 +51,9 @@
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#if defined(__KNC__)
#define ISPC_IS_KNC
#endif
#include <stdint.h>
#include <stdlib.h>
@@ -56,20 +61,10 @@
#include <vector>
#include <string>
#define Assert(expr) \
((void)((expr) ? 0 : __Assert (#expr, __FILE__, __LINE__)))
#define __Assert(expr, file, line) \
((void)fprintf(stderr, "%s:%u: Assertion failed: \"%s\"\n" \
"***\n*** Please file a bug report at " \
"https://github.com/ispc/ispc/issues\n*** (Including as much " \
"information as you can about how to reproduce this error).\n" \
"*** You have apparently encountered a bug in the compiler that " \
"we'd like to fix!\n***\n", file, line, expr), abort(), 0)
/** @def ISPC_MAX_NVEC maximum vector size of any of the compilation
targets.
*/
#define ISPC_MAX_NVEC 16
#define ISPC_MAX_NVEC 64
// Forward declarations of a number of widely-used LLVM types
namespace llvm {
@@ -90,12 +85,6 @@ namespace llvm {
class Value;
}
// llvm::Type *s are no longer const in llvm 3.0
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
#define LLVM_TYPE_CONST
#else
#define LLVM_TYPE_CONST const
#endif
class ArrayType;
class AST;
@@ -107,12 +96,22 @@ class ExprList;
class Function;
class FunctionType;
class Module;
class PointerType;
class Stmt;
class Symbol;
class SymbolTable;
class Type;
struct VariableDeclaration;
enum StorageClass {
SC_NONE,
SC_EXTERN,
SC_STATIC,
SC_TYPEDEF,
SC_EXTERN_C
};
/** @brief Representation of a range of positions in a source file.
This class represents a range of characters in a source file
@@ -139,11 +138,25 @@ struct SourcePos {
bool operator==(const SourcePos &p2) const;
};
/** Returns a SourcePos that encompasses the extent of both of the given
extents. */
SourcePos Union(const SourcePos &p1, const SourcePos &p2);
// Assert
extern void DoAssert(const char *file, int line, const char *expr);
extern void DoAssertPos(SourcePos pos, const char *file, int line, const char *expr);
#define Assert(expr) \
((void)((expr) ? 0 : ((void)DoAssert (__FILE__, __LINE__, #expr), 0)))
#define AssertPos(pos, expr) \
((void)((expr) ? 0 : ((void)DoAssertPos (pos, __FILE__, __LINE__, #expr), 0)))
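A hedged usage note: the two macros are meant as drop-in replacements for the old inline Assert, with AssertPos additionally carrying a source position so the failure report can point at the offending ispc input. Illustrative call sites (the surrounding objects are assumptions):
Assert(type != NULL);                            // internal invariant, no user position
AssertPos(expr->pos, expr->GetType() != NULL);   // ties the failure to expr's location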
/** @brief Structure that defines a compilation target
This structure defines a compilation target for the ispc compiler.
@@ -161,7 +174,7 @@ struct Target {
/** Returns a comma-delimited string giving the names of the currently
supported target CPUs. */
static const char *SupportedTargetCPUs();
static std::string SupportedTargetCPUs();
/** Returns a comma-delimited string giving the names of the currently
supported target architectures. */
@@ -179,13 +192,13 @@ struct Target {
const char *GetISAString() const;
/** Returns the size of the given type */
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *SizeOf(llvm::Type *type,
llvm::BasicBlock *insertAtEnd);
/** Given a structure type and an element number in the structure,
returns a value corresponding to the number of bytes from the start
of the structure where the element is located. */
llvm::Value *StructOffset(LLVM_TYPE_CONST llvm::Type *type,
llvm::Value *StructOffset(llvm::Type *type,
int element, llvm::BasicBlock *insertAtEnd);
/** llvm Target object representing this target. */
@@ -197,7 +210,7 @@ struct Target {
flexible/performant of them will appear last in the enumerant. Note
also that __best_available_isa() needs to be updated if ISAs are
added or the enumerant values are reordered. */
enum ISA { SSE2, SSE4, AVX, AVX2, GENERIC, NUM_ISAS };
enum ISA { SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS };
/** Instruction set being compiled to. */
ISA isa;
@@ -233,16 +246,27 @@ struct Target {
natively. */
bool maskingIsFree;
/** Is it safe to run code with the mask all off? e.g. on SSE, the fast
gather trick assumes that at least one program instance is running
(so that it can safely assume that the array base pointer is
valid). */
bool allOffMaskIsSafe;
/** How many bits are used to store each element of the mask: e.g. this
is 32 on SSE/AVX, since that matches the HW better, but it's 1 for
the generic target. */
int maskBitCount;
/** Indicates whether the target has native support for float/half
conversions. */
bool hasHalf;
/** Indicates whether there is an ISA random number instruction. */
bool hasRand;
/** Indicates whether the target has a native gather instruction */
bool hasGather;
/** Indicates whether the target has a native scatter instruction */
bool hasScatter;
/** Indicates whether the target has support for transcendentals (beyond
sqrt, which we assume that all of them handle). */
bool hasTranscendentals;
};
@@ -283,6 +307,16 @@ struct Opt {
performance in the generated code). */
bool disableAsserts;
/** Indicates whether FMA instructions should be disabled (on targets
that support them). */
bool disableFMA;
/** Always generate aligned vector load/store instructions; this
implies a guarantee that all dynamic access through pointers that
becomes a vector load/store will be a cache-aligned sequence of
locations. */
bool forceAlignedMemory;
/** If enabled, disables the various optimizations that kick in when
the execution mask can be determined to be "all on" at compile
time. */
@@ -339,6 +373,10 @@ struct Opt {
than gathers/scatters. This is likely only useful for measuring
the impact of this optimization. */
bool disableUniformMemoryOptimizations;
/** Disables optimizations that coalesce incoherent scalar memory
access from gathers into wider vector operations, when possible. */
bool disableCoalescing;
};
/** @brief This structure collects together a number of global variables.
@@ -388,6 +426,13 @@ struct Globals {
possible performance pitfalls. */
bool emitPerfWarnings;
/** Indicates whether all printed output should be suppressed. */
bool quiet;
/** Always use ANSI escape sequences to colorize warning and error
messages, even if piping output to a file, etc. */
bool forceColoredOutput;
/** Indicates whether calls should be emitted in the program to an
externally-defined program instrumentation function. (See the
"Instrumenting your ispc programs" section in the user's
@@ -402,6 +447,14 @@ struct Globals {
vector width to them. */
bool mangleFunctionsWithTarget;
/** If enabled, the lexer will randomly replace some tokens returned
with other tokens, in order to test error condition handling in the
compiler. */
bool enableFuzzTest;
/** Seed for random number generator used for fuzz testing. */
int fuzzTestSeed;
/** Global LLVMContext object */
llvm::LLVMContext *ctx;
@@ -412,11 +465,14 @@ struct Globals {
/** Arguments to pass along to the C pre-processor, if it is run on the
program before compilation. */
std::vector<std::string> cppArgs;
/** Additional user-provided directories to search when processing
#include directives in the preprocessor. */
std::vector<std::string> includePath;
};
enum {
COST_ASSIGN = 1,
COST_COHERENT_BREAK_CONTINE = 4,
COST_COMPLEX_ARITH_OP = 4,
COST_DELETE = 32,
COST_DEREF = 4,
@@ -427,7 +483,7 @@ enum {
COST_GOTO = 4,
COST_LOAD = 2,
COST_NEW = 32,
COST_REGULAR_BREAK_CONTINUE = 2,
COST_BREAK_CONTINUE = 3,
COST_RETURN = 4,
COST_SELECT = 4,
COST_SIMPLE_ARITH_LOGIC_OP = 1,

(Visual Studio project file; name not shown)

@@ -20,6 +20,8 @@
<ClCompile Include="func.cpp" />
<ClCompile Include="gen-bitcode-avx1.cpp" />
<ClCompile Include="gen-bitcode-avx1-x2.cpp" />
<ClCompile Include="gen-bitcode-avx11.cpp" />
<ClCompile Include="gen-bitcode-avx11-x2.cpp" />
<ClCompile Include="gen-bitcode-avx2.cpp" />
<ClCompile Include="gen-bitcode-avx2-x2.cpp" />
<ClCompile Include="gen-bitcode-c-32.cpp" />
@@ -29,6 +31,8 @@
<ClCompile Include="gen-bitcode-generic-4.cpp" />
<ClCompile Include="gen-bitcode-generic-8.cpp" />
<ClCompile Include="gen-bitcode-generic-16.cpp" />
<ClCompile Include="gen-bitcode-generic-32.cpp" />
<ClCompile Include="gen-bitcode-generic-64.cpp" />
<ClCompile Include="gen-bitcode-sse2.cpp" />
<ClCompile Include="gen-bitcode-sse2-x2.cpp" />
<ClCompile Include="gen-bitcode-sse4.cpp" />
@@ -186,6 +190,32 @@
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx1-x2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx11.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll &gt; gen-bitcode-avx11.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx11.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll &gt; gen-bitcode-avx11.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx11.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx11.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx11.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx11-x2.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll &gt; gen-bitcode-avx11-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx11-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll &gt; gen-bitcode-avx11-x2.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx11-x2.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx11-x2.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx11-x2.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-avx2.ll">
<FileType>Document</FileType>
@@ -264,6 +294,32 @@
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-16.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-generic-32.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll &gt; gen-bitcode-generic-32.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-32.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll &gt; gen-bitcode-generic-32.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-32.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-32.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-32.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-generic-64.ll">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll &gt; gen-bitcode-generic-64.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-64.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll &gt; gen-bitcode-generic-64.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-64.cpp</Outputs>
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-64.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-64.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="lex.ll">
<FileType>Document</FileType>
@@ -324,7 +380,7 @@
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
<PreprocessorDefinitions>NOMINMAX;%LLVM_VERSION%</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
</ClCompile>
@@ -332,7 +388,7 @@
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -342,7 +398,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
<PreprocessorDefinitions>NOMINMAX;%LLVM_VERSION%</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
</ClCompile>
@@ -352,7 +408,7 @@
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangEdit.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

lex.ll (652 changed lines)

@@ -1,5 +1,5 @@
/*
Copyright (c) 2010-2011, Intel Corporation
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -43,31 +43,294 @@
#include <stdint.h>
static uint64_t lParseBinary(const char *ptr, SourcePos pos, char **endPtr);
static int lParseInteger(bool dotdotdot);
static void lCComment(SourcePos *);
static void lCppComment(SourcePos *);
static void lHandleCppHash(SourcePos *);
static void lStringConst(YYSTYPE *, SourcePos *);
static double lParseHexFloat(const char *ptr);
extern void RegisterDependency(const std::string &fileName);
#define YY_USER_ACTION \
yylloc->first_line = yylloc->last_line; \
yylloc->first_column = yylloc->last_column; \
yylloc->last_column += yyleng;
yylloc.first_line = yylloc.last_line; \
yylloc.first_column = yylloc.last_column; \
yylloc.last_column += yyleng;
#ifdef ISPC_IS_WINDOWS
inline int isatty(int) { return 0; }
#else
#include <unistd.h>
#endif // ISPC_IS_WINDOWS
static int allTokens[] = {
TOKEN_ASSERT, TOKEN_BOOL, TOKEN_BREAK, TOKEN_CASE,
TOKEN_CDO, TOKEN_CFOR, TOKEN_CIF, TOKEN_CWHILE,
TOKEN_CONST, TOKEN_CONTINUE, TOKEN_DEFAULT, TOKEN_DO,
TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM,
TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR,
TOKEN_FOREACH, TOKEN_FOREACH_ACTIVE, TOKEN_FOREACH_TILED,
TOKEN_FOREACH_UNIQUE, TOKEN_GOTO, TOKEN_IF, TOKEN_IN, TOKEN_INLINE,
TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH,
TOKEN_NEW, TOKEN_NULL, TOKEN_PRINT, TOKEN_RETURN, TOKEN_SOA, TOKEN_SIGNED,
TOKEN_SIZEOF, TOKEN_STATIC, TOKEN_STRUCT, TOKEN_SWITCH, TOKEN_SYNC,
TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED,
TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE,
TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT,
TOKEN_FLOAT_CONSTANT,
TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT,
TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT,
TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP,
TOKEN_GE_OP, TOKEN_EQ_OP, TOKEN_NE_OP, TOKEN_AND_OP, TOKEN_OR_OP,
TOKEN_MUL_ASSIGN, TOKEN_DIV_ASSIGN, TOKEN_MOD_ASSIGN, TOKEN_ADD_ASSIGN,
TOKEN_SUB_ASSIGN, TOKEN_LEFT_ASSIGN, TOKEN_RIGHT_ASSIGN, TOKEN_AND_ASSIGN,
TOKEN_XOR_ASSIGN, TOKEN_OR_ASSIGN, TOKEN_PTR_OP,
';', '{', '}', ',', ':', '=', '(', ')', '[', ']', '.', '&', '!', '~', '-',
'+', '*', '/', '%', '<', '>', '^', '|', '?',
};
std::map<int, std::string> tokenToName;
std::map<std::string, std::string> tokenNameRemap;
void ParserInit() {
tokenToName[TOKEN_ASSERT] = "assert";
tokenToName[TOKEN_BOOL] = "bool";
tokenToName[TOKEN_BREAK] = "break";
tokenToName[TOKEN_CASE] = "case";
tokenToName[TOKEN_CDO] = "cdo";
tokenToName[TOKEN_CFOR] = "cfor";
tokenToName[TOKEN_CIF] = "cif";
tokenToName[TOKEN_CWHILE] = "cwhile";
tokenToName[TOKEN_CONST] = "const";
tokenToName[TOKEN_CONTINUE] = "continue";
tokenToName[TOKEN_DEFAULT] = "default";
tokenToName[TOKEN_DO] = "do";
tokenToName[TOKEN_DELETE] = "delete";
tokenToName[TOKEN_DOUBLE] = "double";
tokenToName[TOKEN_ELSE] = "else";
tokenToName[TOKEN_ENUM] = "enum";
tokenToName[TOKEN_EXPORT] = "export";
tokenToName[TOKEN_EXTERN] = "extern";
tokenToName[TOKEN_FALSE] = "false";
tokenToName[TOKEN_FLOAT] = "float";
tokenToName[TOKEN_FOR] = "for";
tokenToName[TOKEN_FOREACH] = "foreach";
tokenToName[TOKEN_FOREACH_ACTIVE] = "foreach_active";
tokenToName[TOKEN_FOREACH_TILED] = "foreach_tiled";
tokenToName[TOKEN_FOREACH_UNIQUE] = "foreach_unique";
tokenToName[TOKEN_GOTO] = "goto";
tokenToName[TOKEN_IF] = "if";
tokenToName[TOKEN_IN] = "in";
tokenToName[TOKEN_INLINE] = "inline";
tokenToName[TOKEN_INT] = "int";
tokenToName[TOKEN_INT8] = "int8";
tokenToName[TOKEN_INT16] = "int16";
tokenToName[TOKEN_INT] = "int";
tokenToName[TOKEN_INT64] = "int64";
tokenToName[TOKEN_LAUNCH] = "launch";
tokenToName[TOKEN_NEW] = "new";
tokenToName[TOKEN_NULL] = "NULL";
tokenToName[TOKEN_PRINT] = "print";
tokenToName[TOKEN_RETURN] = "return";
tokenToName[TOKEN_SOA] = "soa";
tokenToName[TOKEN_SIGNED] = "signed";
tokenToName[TOKEN_SIZEOF] = "sizeof";
tokenToName[TOKEN_STATIC] = "static";
tokenToName[TOKEN_STRUCT] = "struct";
tokenToName[TOKEN_SWITCH] = "switch";
tokenToName[TOKEN_SYNC] = "sync";
tokenToName[TOKEN_TASK] = "task";
tokenToName[TOKEN_TRUE] = "true";
tokenToName[TOKEN_TYPEDEF] = "typedef";
tokenToName[TOKEN_UNIFORM] = "uniform";
tokenToName[TOKEN_UNMASKED] = "unmasked";
tokenToName[TOKEN_UNSIGNED] = "unsigned";
tokenToName[TOKEN_VARYING] = "varying";
tokenToName[TOKEN_VOID] = "void";
tokenToName[TOKEN_WHILE] = "while";
tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\"";
tokenToName[TOKEN_DOTDOTDOT] = "...";
tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT";
tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT";
tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT";
tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT";
tokenToName[TOKEN_UINT64_CONSTANT] = "TOKEN_UINT64_CONSTANT";
tokenToName[TOKEN_INC_OP] = "++";
tokenToName[TOKEN_DEC_OP] = "--";
tokenToName[TOKEN_LEFT_OP] = "<<";
tokenToName[TOKEN_RIGHT_OP] = ">>";
tokenToName[TOKEN_LE_OP] = "<=";
tokenToName[TOKEN_GE_OP] = ">=";
tokenToName[TOKEN_EQ_OP] = "==";
tokenToName[TOKEN_NE_OP] = "!=";
tokenToName[TOKEN_AND_OP] = "&&";
tokenToName[TOKEN_OR_OP] = "||";
tokenToName[TOKEN_MUL_ASSIGN] = "*=";
tokenToName[TOKEN_DIV_ASSIGN] = "/=";
tokenToName[TOKEN_MOD_ASSIGN] = "%=";
tokenToName[TOKEN_ADD_ASSIGN] = "+=";
tokenToName[TOKEN_SUB_ASSIGN] = "-=";
tokenToName[TOKEN_LEFT_ASSIGN] = "<<=";
tokenToName[TOKEN_RIGHT_ASSIGN] = ">>=";
tokenToName[TOKEN_AND_ASSIGN] = "&=";
tokenToName[TOKEN_XOR_ASSIGN] = "^=";
tokenToName[TOKEN_OR_ASSIGN] = "|=";
tokenToName[TOKEN_PTR_OP] = "->";
tokenToName[';'] = ";";
tokenToName['{'] = "{";
tokenToName['}'] = "}";
tokenToName[','] = ",";
tokenToName[':'] = ":";
tokenToName['='] = "=";
tokenToName['('] = "(";
tokenToName[')'] = ")";
tokenToName['['] = "[";
tokenToName[']'] = "]";
tokenToName['.'] = ".";
tokenToName['&'] = "&";
tokenToName['!'] = "!";
tokenToName['~'] = "~";
tokenToName['-'] = "-";
tokenToName['+'] = "+";
tokenToName['*'] = "*";
tokenToName['/'] = "/";
tokenToName['%'] = "%";
tokenToName['<'] = "<";
tokenToName['>'] = ">";
tokenToName['^'] = "^";
tokenToName['|'] = "|";
tokenToName['?'] = "?";
tokenToName[';'] = ";";
tokenNameRemap["TOKEN_ASSERT"] = "\'assert\'";
tokenNameRemap["TOKEN_BOOL"] = "\'bool\'";
tokenNameRemap["TOKEN_BREAK"] = "\'break\'";
tokenNameRemap["TOKEN_CASE"] = "\'case\'";
tokenNameRemap["TOKEN_CDO"] = "\'cdo\'";
tokenNameRemap["TOKEN_CFOR"] = "\'cfor\'";
tokenNameRemap["TOKEN_CIF"] = "\'cif\'";
tokenNameRemap["TOKEN_CWHILE"] = "\'cwhile\'";
tokenNameRemap["TOKEN_CONST"] = "\'const\'";
tokenNameRemap["TOKEN_CONTINUE"] = "\'continue\'";
tokenNameRemap["TOKEN_DEFAULT"] = "\'default\'";
tokenNameRemap["TOKEN_DO"] = "\'do\'";
tokenNameRemap["TOKEN_DELETE"] = "\'delete\'";
tokenNameRemap["TOKEN_DOUBLE"] = "\'double\'";
tokenNameRemap["TOKEN_ELSE"] = "\'else\'";
tokenNameRemap["TOKEN_ENUM"] = "\'enum\'";
tokenNameRemap["TOKEN_EXPORT"] = "\'export\'";
tokenNameRemap["TOKEN_EXTERN"] = "\'extern\'";
tokenNameRemap["TOKEN_FALSE"] = "\'false\'";
tokenNameRemap["TOKEN_FLOAT"] = "\'float\'";
tokenNameRemap["TOKEN_FOR"] = "\'for\'";
tokenNameRemap["TOKEN_FOREACH"] = "\'foreach\'";
tokenNameRemap["TOKEN_FOREACH_ACTIVE"] = "\'foreach_active\'";
tokenNameRemap["TOKEN_FOREACH_TILED"] = "\'foreach_tiled\'";
tokenNameRemap["TOKEN_FOREACH_UNIQUE"] = "\'foreach_unique\'";
tokenNameRemap["TOKEN_GOTO"] = "\'goto\'";
tokenNameRemap["TOKEN_IDENTIFIER"] = "identifier";
tokenNameRemap["TOKEN_IF"] = "\'if\'";
tokenNameRemap["TOKEN_IN"] = "\'in\'";
tokenNameRemap["TOKEN_INLINE"] = "\'inline\'";
tokenNameRemap["TOKEN_INT"] = "\'int\'";
tokenNameRemap["TOKEN_INT8"] = "\'int8\'";
tokenNameRemap["TOKEN_INT16"] = "\'int16\'";
tokenNameRemap["TOKEN_INT"] = "\'int\'";
tokenNameRemap["TOKEN_INT64"] = "\'int64\'";
tokenNameRemap["TOKEN_LAUNCH"] = "\'launch\'";
tokenNameRemap["TOKEN_NEW"] = "\'new\'";
tokenNameRemap["TOKEN_NULL"] = "\'NULL\'";
tokenNameRemap["TOKEN_PRINT"] = "\'print\'";
tokenNameRemap["TOKEN_RETURN"] = "\'return\'";
tokenNameRemap["TOKEN_SOA"] = "\'soa\'";
tokenNameRemap["TOKEN_SIGNED"] = "\'signed\'";
tokenNameRemap["TOKEN_SIZEOF"] = "\'sizeof\'";
tokenNameRemap["TOKEN_STATIC"] = "\'static\'";
tokenNameRemap["TOKEN_STRUCT"] = "\'struct\'";
tokenNameRemap["TOKEN_SWITCH"] = "\'switch\'";
tokenNameRemap["TOKEN_SYNC"] = "\'sync\'";
tokenNameRemap["TOKEN_TASK"] = "\'task\'";
tokenNameRemap["TOKEN_TRUE"] = "\'true\'";
tokenNameRemap["TOKEN_TYPEDEF"] = "\'typedef\'";
tokenNameRemap["TOKEN_UNIFORM"] = "\'uniform\'";
tokenNameRemap["TOKEN_UNMASKED"] = "\'unmasked\'";
tokenNameRemap["TOKEN_UNSIGNED"] = "\'unsigned\'";
tokenNameRemap["TOKEN_VARYING"] = "\'varying\'";
tokenNameRemap["TOKEN_VOID"] = "\'void\'";
tokenNameRemap["TOKEN_WHILE"] = "\'while\'";
tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\"";
tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'";
tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant";
tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant";
tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant";
tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant";
tokenNameRemap["TOKEN_UINT64_CONSTANT"] = "unsigned int64 constant";
tokenNameRemap["TOKEN_INC_OP"] = "\'++\'";
tokenNameRemap["TOKEN_DEC_OP"] = "\'--\'";
tokenNameRemap["TOKEN_LEFT_OP"] = "\'<<\'";
tokenNameRemap["TOKEN_RIGHT_OP"] = "\'>>\'";
tokenNameRemap["TOKEN_LE_OP"] = "\'<=\'";
tokenNameRemap["TOKEN_GE_OP"] = "\'>=\'";
tokenNameRemap["TOKEN_EQ_OP"] = "\'==\'";
tokenNameRemap["TOKEN_NE_OP"] = "\'!=\'";
tokenNameRemap["TOKEN_AND_OP"] = "\'&&\'";
tokenNameRemap["TOKEN_OR_OP"] = "\'||\'";
tokenNameRemap["TOKEN_MUL_ASSIGN"] = "\'*=\'";
tokenNameRemap["TOKEN_DIV_ASSIGN"] = "\'/=\'";
tokenNameRemap["TOKEN_MOD_ASSIGN"] = "\'%=\'";
tokenNameRemap["TOKEN_ADD_ASSIGN"] = "\'+=\'";
tokenNameRemap["TOKEN_SUB_ASSIGN"] = "\'-=\'";
tokenNameRemap["TOKEN_LEFT_ASSIGN"] = "\'<<=\'";
tokenNameRemap["TOKEN_RIGHT_ASSIGN"] = "\'>>=\'";
tokenNameRemap["TOKEN_AND_ASSIGN"] = "\'&=\'";
tokenNameRemap["TOKEN_XOR_ASSIGN"] = "\'^=\'";
tokenNameRemap["TOKEN_OR_ASSIGN"] = "\'|=\'";
tokenNameRemap["TOKEN_PTR_OP"] = "\'->\'";
tokenNameRemap["$end"] = "end of file";
}
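/* Editor's sketch, not part of lex.ll: tokenNameRemap presumably exists so that
 * bison's internal token names (e.g. "TOKEN_INT") can be rewritten into the
 * user-facing spellings above when reporting syntax errors.  The helper below
 * shows one plausible way such a table could be consulted; the function name
 * lTokenDisplayName is invented for illustration. */
#include <map>
#include <string>

static std::string
lTokenDisplayName(const std::string &internalName,
                  const std::map<std::string, std::string> &remap) {
    // Return the friendly spelling if the table knows this token, else the raw name.
    std::map<std::string, std::string>::const_iterator it = remap.find(internalName);
    return (it != remap.end()) ? it->second : internalName;
}
/* e.g. lTokenDisplayName("TOKEN_INT", tokenNameRemap) would yield "'int'". */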
inline int ispcRand() {
#ifdef ISPC_IS_WINDOWS
return rand();
#else
return lrand48();
#endif
}
#define RT \
if (g->enableFuzzTest) { \
int r = ispcRand() % 40; \
if (r == 0) { \
Warning(yylloc, "Fuzz test dropping token"); \
} \
else if (r == 1) { \
Assert (tokenToName.size() > 0); \
int nt = sizeof(allTokens) / sizeof(allTokens[0]); \
int tn = ispcRand() % nt; \
yylval.stringVal = new std::string(yytext); /* just in case */\
Warning(yylloc, "Fuzz test replaced token with \"%s\"", tokenToName[allTokens[tn]].c_str()); \
return allTokens[tn]; \
} \
else if (r == 2) { \
Symbol *sym = m->symbolTable->RandomSymbol(); \
if (sym != NULL) { \
yylval.stringVal = new std::string(sym->name); \
Warning(yylloc, "Fuzz test replaced with identifier \"%s\".", sym->name.c_str()); \
return TOKEN_IDENTIFIER; \
} \
} \
/* TOKEN_TYPE_NAME */ \
} else /* swallow semicolon */
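/* Editor's sketch, not part of lex.ll: the RT macro above implements token-level
 * fuzz testing -- roughly 1 token in 40 is reported as dropped, replaced with a
 * random token from allTokens, or replaced with a random identifier pulled from
 * the symbol table.  The enum and function below restate that decision logic in
 * isolation; FuzzAction and lFuzzDecision are invented names for illustration. */
enum FuzzAction { FUZZ_KEEP, FUZZ_DROP, FUZZ_RANDOM_TOKEN, FUZZ_RANDOM_IDENT };

static FuzzAction
lFuzzDecision(bool fuzzEnabled) {
    if (!fuzzEnabled)
        return FUZZ_KEEP;
    switch (ispcRand() % 40) {
    case 0:  return FUZZ_DROP;          // warn "Fuzz test dropping token"
    case 1:  return FUZZ_RANDOM_TOKEN;  // substitute a random entry from allTokens
    case 2:  return FUZZ_RANDOM_IDENT;  // substitute a random symbol-table identifier
    default: return FUZZ_KEEP;          // the other 37 times out of 40, pass through
    }
}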
%}
%option nounput
%option noyywrap
%option bison-bridge
%option bison-locations
%option nounistd
WHITESPACE [ \t\r]+
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[kMG]?
INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*
INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\.
FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)
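/* Editor's note, illustrative only (not part of lex.ll): sample literals the
 * numeric patterns above accept.  INT_NUMBER covers decimal, hex (0x...) and
 * binary (0b...) forms with optional u/U/l/L suffixes and an optional k/M/G
 * scaling suffix; INT_NUMBER_DOTDOTDOT is the same form followed by "...";
 * FLOAT_NUMBER and HEX_FLOAT_NUMBER cover the usual decimal and hex float
 * spellings.
 *
 *   INT_NUMBER:            42   0xff   0b1011   16k   64M   10uL
 *   INT_NUMBER_DOTDOTDOT:  0...      16k...
 *   FLOAT_NUMBER:          1.0   .5f   2e10   3.25e-2f
 *   HEX_FLOAT_NUMBER:      0x1.8p3   0x1p-4f
 */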
@@ -75,200 +338,167 @@ IDENT [a-zA-Z_][a-zA-Z_0-9]*
ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+
%%
"/*" { lCComment(yylloc); }
"//" { lCppComment(yylloc); }
"/*" { lCComment(&yylloc); }
"//" { lCppComment(&yylloc); }
__assert { return TOKEN_ASSERT; }
bool { return TOKEN_BOOL; }
break { return TOKEN_BREAK; }
case { return TOKEN_CASE; }
cbreak { return TOKEN_CBREAK; }
ccontinue { return TOKEN_CCONTINUE; }
cdo { return TOKEN_CDO; }
cfor { return TOKEN_CFOR; }
cif { return TOKEN_CIF; }
cwhile { return TOKEN_CWHILE; }
const { return TOKEN_CONST; }
continue { return TOKEN_CONTINUE; }
creturn { return TOKEN_CRETURN; }
default { return TOKEN_DEFAULT; }
do { return TOKEN_DO; }
delete { return TOKEN_DELETE; }
delete\[\] { return TOKEN_DELETE; }
double { return TOKEN_DOUBLE; }
else { return TOKEN_ELSE; }
enum { return TOKEN_ENUM; }
export { return TOKEN_EXPORT; }
extern { return TOKEN_EXTERN; }
false { return TOKEN_FALSE; }
float { return TOKEN_FLOAT; }
for { return TOKEN_FOR; }
foreach { return TOKEN_FOREACH; }
foreach_tiled { return TOKEN_FOREACH_TILED; }
goto { return TOKEN_GOTO; }
if { return TOKEN_IF; }
inline { return TOKEN_INLINE; }
int { return TOKEN_INT; }
int8 { return TOKEN_INT8; }
int16 { return TOKEN_INT16; }
int32 { return TOKEN_INT; }
int64 { return TOKEN_INT64; }
launch { return TOKEN_LAUNCH; }
new { return TOKEN_NEW; }
NULL { return TOKEN_NULL; }
print { return TOKEN_PRINT; }
reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
"please use C++-style '&' syntax for references "
"instead."); }
return { return TOKEN_RETURN; }
soa { return TOKEN_SOA; }
signed { return TOKEN_SIGNED; }
sizeof { return TOKEN_SIZEOF; }
static { return TOKEN_STATIC; }
struct { return TOKEN_STRUCT; }
switch { return TOKEN_SWITCH; }
sync { return TOKEN_SYNC; }
task { return TOKEN_TASK; }
true { return TOKEN_TRUE; }
typedef { return TOKEN_TYPEDEF; }
uniform { return TOKEN_UNIFORM; }
unsigned { return TOKEN_UNSIGNED; }
varying { return TOKEN_VARYING; }
void { return TOKEN_VOID; }
while { return TOKEN_WHILE; }
\"C\" { return TOKEN_STRING_C_LITERAL; }
\.\.\. { return TOKEN_DOTDOTDOT; }
__assert { RT; return TOKEN_ASSERT; }
bool { RT; return TOKEN_BOOL; }
break { RT; return TOKEN_BREAK; }
case { RT; return TOKEN_CASE; }
cbreak { RT; Warning(yylloc, "\"cbreak\" is deprecated. Use \"break\"."); return TOKEN_BREAK; }
ccontinue { RT; Warning(yylloc, "\"ccontinue\" is deprecated. Use \"continue\"."); return TOKEN_CONTINUE; }
cdo { RT; return TOKEN_CDO; }
cfor { RT; return TOKEN_CFOR; }
cif { RT; return TOKEN_CIF; }
cwhile { RT; return TOKEN_CWHILE; }
const { RT; return TOKEN_CONST; }
continue { RT; return TOKEN_CONTINUE; }
creturn { RT; Warning(yylloc, "\"creturn\" is deprecated. Use \"return\"."); return TOKEN_RETURN; }
__declspec { RT; return TOKEN_DECLSPEC; }
default { RT; return TOKEN_DEFAULT; }
do { RT; return TOKEN_DO; }
delete { RT; return TOKEN_DELETE; }
delete\[\] { RT; return TOKEN_DELETE; }
double { RT; return TOKEN_DOUBLE; }
else { RT; return TOKEN_ELSE; }
enum { RT; return TOKEN_ENUM; }
export { RT; return TOKEN_EXPORT; }
extern { RT; return TOKEN_EXTERN; }
false { RT; return TOKEN_FALSE; }
float { RT; return TOKEN_FLOAT; }
for { RT; return TOKEN_FOR; }
foreach { RT; return TOKEN_FOREACH; }
foreach_active { RT; return TOKEN_FOREACH_ACTIVE; }
foreach_tiled { RT; return TOKEN_FOREACH_TILED; }
foreach_unique { RT; return TOKEN_FOREACH_UNIQUE; }
goto { RT; return TOKEN_GOTO; }
if { RT; return TOKEN_IF; }
in { RT; return TOKEN_IN; }
inline { RT; return TOKEN_INLINE; }
int { RT; return TOKEN_INT; }
int8 { RT; return TOKEN_INT8; }
int16 { RT; return TOKEN_INT16; }
int32 { RT; return TOKEN_INT; }
int64 { RT; return TOKEN_INT64; }
launch { RT; return TOKEN_LAUNCH; }
new { RT; return TOKEN_NEW; }
NULL { RT; return TOKEN_NULL; }
print { RT; return TOKEN_PRINT; }
return { RT; return TOKEN_RETURN; }
soa { RT; return TOKEN_SOA; }
signed { RT; return TOKEN_SIGNED; }
sizeof { RT; return TOKEN_SIZEOF; }
static { RT; return TOKEN_STATIC; }
struct { RT; return TOKEN_STRUCT; }
switch { RT; return TOKEN_SWITCH; }
sync { RT; return TOKEN_SYNC; }
task { RT; return TOKEN_TASK; }
true { RT; return TOKEN_TRUE; }
typedef { RT; return TOKEN_TYPEDEF; }
uniform { RT; return TOKEN_UNIFORM; }
unmasked { RT; return TOKEN_UNMASKED; }
unsigned { RT; return TOKEN_UNSIGNED; }
varying { RT; return TOKEN_VARYING; }
void { RT; return TOKEN_VOID; }
while { RT; return TOKEN_WHILE; }
\"C\" { RT; return TOKEN_STRING_C_LITERAL; }
\.\.\. { RT; return TOKEN_DOTDOTDOT; }
L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }
L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERAL; }
{IDENT} {
RT;
/* We have an identifier--is it a type name or an identifier?
The symbol table will straighten us out... */
yylval->stringVal = new std::string(yytext);
yylval.stringVal = new std::string(yytext);
if (m->symbolTable->LookupType(yytext) != NULL)
return TOKEN_TYPE_NAME;
else
return TOKEN_IDENTIFIER;
}
{INT_NUMBER}+(u|U|l|L)*? {
    int ls = 0, us = 0;
    char *endPtr = NULL;
    if (yytext[0] == '0' && yytext[1] == 'b')
        yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr);
    else {
#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
        yylval->intVal = _strtoui64(yytext, &endPtr, 0);
#else
        // FIXME: should use strtouq and then issue an error if we can't
        // fit into 64 bits...
        yylval->intVal = strtoull(yytext, &endPtr, 0);
#endif
    }
    bool kilo = false, mega = false, giga = false;
    for (; *endPtr; endPtr++) {
        if (*endPtr == 'k')
            kilo = true;
        else if (*endPtr == 'M')
            mega = true;
        else if (*endPtr == 'G')
            giga = true;
        else if (*endPtr == 'l' || *endPtr == 'L')
            ls++;
        else if (*endPtr == 'u' || *endPtr == 'U')
            us++;
    }
    if (kilo)
        yylval->intVal *= 1024;
    if (mega)
        yylval->intVal *= 1024*1024;
    if (giga)
        yylval->intVal *= 1024*1024*1024;
    if (ls >= 2)
        return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
    else if (ls == 1)
        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
    // See if we can fit this into a 32-bit integer...
    if ((yylval->intVal & 0xffffffff) == yylval->intVal)
        return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
    else
        return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
}
{INT_NUMBER} {
    RT;
    return lParseInteger(false);
}
{INT_NUMBER_DOTDOTDOT} {
    RT;
    return lParseInteger(true);
}
{FLOAT_NUMBER} {
yylval->floatVal = (float)atof(yytext);
RT;
yylval.floatVal = (float)atof(yytext);
return TOKEN_FLOAT_CONSTANT;
}
{HEX_FLOAT_NUMBER} {
yylval->floatVal = (float)lParseHexFloat(yytext);
RT;
yylval.floatVal = (float)lParseHexFloat(yytext);
return TOKEN_FLOAT_CONSTANT;
}
"++" { return TOKEN_INC_OP; }
"--" { return TOKEN_DEC_OP; }
"<<" { return TOKEN_LEFT_OP; }
">>" { return TOKEN_RIGHT_OP; }
"<=" { return TOKEN_LE_OP; }
">=" { return TOKEN_GE_OP; }
"==" { return TOKEN_EQ_OP; }
"!=" { return TOKEN_NE_OP; }
"&&" { return TOKEN_AND_OP; }
"||" { return TOKEN_OR_OP; }
"*=" { return TOKEN_MUL_ASSIGN; }
"/=" { return TOKEN_DIV_ASSIGN; }
"%=" { return TOKEN_MOD_ASSIGN; }
"+=" { return TOKEN_ADD_ASSIGN; }
"-=" { return TOKEN_SUB_ASSIGN; }
"<<=" { return TOKEN_LEFT_ASSIGN; }
">>=" { return TOKEN_RIGHT_ASSIGN; }
"&=" { return TOKEN_AND_ASSIGN; }
"^=" { return TOKEN_XOR_ASSIGN; }
"|=" { return TOKEN_OR_ASSIGN; }
"->" { return TOKEN_PTR_OP; }
";" { return ';'; }
("{"|"<%") { return '{'; }
("}"|"%>") { return '}'; }
"," { return ','; }
":" { return ':'; }
"=" { return '='; }
"(" { return '('; }
")" { return ')'; }
("["|"<:") { return '['; }
("]"|":>") { return ']'; }
"." { return '.'; }
"&" { return '&'; }
"!" { return '!'; }
"~" { return '~'; }
"-" { return '-'; }
"+" { return '+'; }
"*" { return '*'; }
"/" { return '/'; }
"%" { return '%'; }
"<" { return '<'; }
">" { return '>'; }
"^" { return '^'; }
"|" { return '|'; }
"?" { return '?'; }
"++" { RT; return TOKEN_INC_OP; }
"--" { RT; return TOKEN_DEC_OP; }
"<<" { RT; return TOKEN_LEFT_OP; }
">>" { RT; return TOKEN_RIGHT_OP; }
"<=" { RT; return TOKEN_LE_OP; }
">=" { RT; return TOKEN_GE_OP; }
"==" { RT; return TOKEN_EQ_OP; }
"!=" { RT; return TOKEN_NE_OP; }
"&&" { RT; return TOKEN_AND_OP; }
"||" { RT; return TOKEN_OR_OP; }
"*=" { RT; return TOKEN_MUL_ASSIGN; }
"/=" { RT; return TOKEN_DIV_ASSIGN; }
"%=" { RT; return TOKEN_MOD_ASSIGN; }
"+=" { RT; return TOKEN_ADD_ASSIGN; }
"-=" { RT; return TOKEN_SUB_ASSIGN; }
"<<=" { RT; return TOKEN_LEFT_ASSIGN; }
">>=" { RT; return TOKEN_RIGHT_ASSIGN; }
"&=" { RT; return TOKEN_AND_ASSIGN; }
"^=" { RT; return TOKEN_XOR_ASSIGN; }
"|=" { RT; return TOKEN_OR_ASSIGN; }
"->" { RT; return TOKEN_PTR_OP; }
";" { RT; return ';'; }
("{"|"<%") { RT; return '{'; }
("}"|"%>") { RT; return '}'; }
"," { RT; return ','; }
":" { RT; return ':'; }
"=" { RT; return '='; }
"(" { RT; return '('; }
")" { RT; return ')'; }
("["|"<:") { RT; return '['; }
("]"|":>") { RT; return ']'; }
"." { RT; return '.'; }
"&" { RT; return '&'; }
"!" { RT; return '!'; }
"~" { RT; return '~'; }
"-" { RT; return '-'; }
"+" { RT; return '+'; }
"*" { RT; return '*'; }
"/" { RT; return '/'; }
"%" { RT; return '%'; }
"<" { RT; return '<'; }
">" { RT; return '>'; }
"^" { RT; return '^'; }
"|" { RT; return '|'; }
"?" { RT; return '?'; }
{WHITESPACE} { }
\n {
yylloc->last_line++;
yylloc->last_column = 1;
yylloc.last_line++;
yylloc.last_column = 1;
}
#(line)?[ ][0-9]+[ ]\"(\\.|[^\\"])*\"[^\n]* {
lHandleCppHash(yylloc);
lHandleCppHash(&yylloc);
}
. {
Error(*yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0]));
Error(yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0]));
YY_USER_ACTION
}
@@ -304,13 +534,94 @@ lParseBinary(const char *ptr, SourcePos pos, char **endPtr) {
}
static int
lParseInteger(bool dotdotdot) {
int ls = 0, us = 0;
char *endPtr = NULL;
if (yytext[0] == '0' && yytext[1] == 'b')
yylval.intVal = lParseBinary(yytext+2, yylloc, &endPtr);
else {
#if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
yylval.intVal = _strtoui64(yytext, &endPtr, 0);
#else
// FIXME: should use strtouq and then issue an error if we can't
// fit into 64 bits...
yylval.intVal = strtoull(yytext, &endPtr, 0);
#endif
}
bool kilo = false, mega = false, giga = false;
for (; *endPtr; endPtr++) {
if (*endPtr == 'k')
kilo = true;
else if (*endPtr == 'M')
mega = true;
else if (*endPtr == 'G')
giga = true;
else if (*endPtr == 'l' || *endPtr == 'L')
ls++;
else if (*endPtr == 'u' || *endPtr == 'U')
us++;
else
Assert(dotdotdot && *endPtr == '.');
}
if (kilo)
yylval.intVal *= 1024;
if (mega)
yylval.intVal *= 1024*1024;
if (giga)
yylval.intVal *= 1024*1024*1024;
if (dotdotdot) {
if (ls >= 2)
return us ? TOKEN_UINT64DOTDOTDOT_CONSTANT : TOKEN_INT64DOTDOTDOT_CONSTANT;
else if (ls == 1)
return us ? TOKEN_UINT32DOTDOTDOT_CONSTANT : TOKEN_INT32DOTDOTDOT_CONSTANT;
// See if we can fit this into a 32-bit integer...
if ((yylval.intVal & 0xffffffff) == yylval.intVal)
return us ? TOKEN_UINT32DOTDOTDOT_CONSTANT : TOKEN_INT32DOTDOTDOT_CONSTANT;
else
return us ? TOKEN_UINT64DOTDOTDOT_CONSTANT : TOKEN_INT64DOTDOTDOT_CONSTANT;
}
else {
if (ls >= 2)
return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
else if (ls == 1)
return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
else if (us) {
// u suffix only
if (yylval.intVal <= 0xffffffffL)
return TOKEN_UINT32_CONSTANT;
else
return TOKEN_UINT64_CONSTANT;
}
else {
// No u or l suffix
// First, see if we can fit this into a 32-bit integer...
if (yylval.intVal <= 0x7fffffffULL)
return TOKEN_INT32_CONSTANT;
else if (yylval.intVal <= 0xffffffffULL)
return TOKEN_UINT32_CONSTANT;
else if (yylval.intVal <= 0x7fffffffffffffffULL)
return TOKEN_INT64_CONSTANT;
else
return TOKEN_UINT64_CONSTANT;
}
}
}
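/* Editor's note, illustrative only (not part of lex.ll): how lParseInteger's
 * suffix and magnitude rules play out for a few sample literals (no "..."):
 *
 *   literal      deciding rule                      token returned
 *   100          no suffix, fits in int32           TOKEN_INT32_CONSTANT
 *   3000000000   no suffix, > INT32_MAX, fits u32   TOKEN_UINT32_CONSTANT
 *   100u         'u' suffix, fits in uint32         TOKEN_UINT32_CONSTANT
 *   100ll        two 'l' suffixes, no 'u'           TOKEN_INT64_CONSTANT
 *   16k          scaled to 16384, fits in int32     TOKEN_INT32_CONSTANT
 */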
/** Handle a C-style comment in the source.
*/
static void
lCComment(SourcePos *pos) {
char c, prev = 0;
while ((c = yyinput()) != 0) {
++pos->last_column;
if (c == '\n') {
pos->last_line++;
pos->last_column = 1;
@@ -373,6 +684,7 @@ static void lHandleCppHash(SourcePos *pos) {
++src;
}
pos->name = strdup(filename.c_str());
RegisterDependency(filename);
}
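/* Editor's sketch, not part of lex.ll: minimal parsing of a cpp line marker of
 * the form  # 12 "foo.ispc"  (the input handled by lHandleCppHash above) into a
 * line number and file name -- the two pieces lHandleCppHash stores in the
 * SourcePos and, as of this change, passes to RegisterDependency().  The name
 * lParseLineMarker is invented, and escaped characters in the file name are not
 * handled here. */
#include <cstdlib>
#include <cstring>
#include <string>

static bool
lParseLineMarker(const char *text, int *line, std::string *file) {
    const char *p = text + 1;                // skip the leading '#'
    if (strncmp(p, "line", 4) == 0)          // optional "line" keyword
        p += 4;
    char *endp = NULL;
    *line = (int)strtol(p, &endp, 10);       // the line number
    const char *q = strchr(endp, '"');       // opening quote of the file name
    const char *end = q ? strchr(q + 1, '"') : NULL;   // closing quote
    if (q == NULL || end == NULL)
        return false;
    file->assign(q + 1, end - (q + 1));      // file name without the quotes
    return true;
}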
@@ -415,7 +727,7 @@ lEscapeChar(char *str, char *pChar, SourcePos *pos)
str = tail - 1;
break;
default:
Error(*pos, "Bad character escape sequence: '%s'\n.", str);
Error(*pos, "Bad character escape sequence: '%s'.", str);
break;
}
}
@@ -435,7 +747,7 @@ lStringConst(YYSTYPE *yylval, SourcePos *pos)
std::string str;
p = strchr(yytext, '"') + 1;
while (*p != '\"') {
char cval;
char cval = '\0';
p = lEscapeChar(p, &cval, pos);
str.push_back(cval);
}
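/* Editor's sketch, not part of lex.ll: lStringConst walks the characters between
 * the quotes and lets lEscapeChar decode escape sequences one at a time.  The
 * helper below is a simplified decoder in that spirit, covering only the common
 * single-character escapes; it omits the numeric escapes and error reporting of
 * the original, and lDecodeEscape is an invented name. */
static const char *
lDecodeEscape(const char *p, char *out) {
    if (*p != '\\') {            // ordinary character: copy it through
        *out = *p;
        return p + 1;
    }
    switch (p[1]) {              // escaped character
    case 'n':  *out = '\n'; break;
    case 't':  *out = '\t'; break;
    case 'r':  *out = '\r'; break;
    case '\\': *out = '\\'; break;
    case '\'': *out = '\''; break;
    case '"':  *out = '"';  break;
    case '0':  *out = '\0'; break;
    default:   *out = p[1]; break;   // unrecognized escape: pass the character through
    }
    return p + 2;
}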
