commit 18af5226ba8ddbcd9bb9f8eebbbc7d5f515d71b6 Author: Matt Pharr Date: Tue Jun 21 06:23:29 2011 -0700 Initial commit. diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..f3d74a9a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +*~ diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000..918a5b57 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,116 @@ +Copyright (c) 2010-2011, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +=========================================================================== +Copyrights and Licenses for Third Party Software Distrubted with +The Intel(r) SPMD Program Compiler +=========================================================================== + +ISPC incorporates code from the Syrah library, which is covered by the +following license: + +Copyright (c) 2009, Stanford University, and authors listed below. +All rights reserved. + +Original authors: + Solomon Boulos + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +Neither the name of Stanford University nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +--------------------------------------------------------------------------- + +Binary distributions of ISPC are linked with the LLVM libraries, which are +covered by the following license: + +University of Illinois/NCSA +Open Source License + +Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign. +All rights reserved. + +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. 
diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..cc9ee60c --- /dev/null +++ b/Makefile @@ -0,0 +1,118 @@ +# +# ispc Makefile +# + +ARCH = $(shell uname) + +CLANG=clang +LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl +LLVM_CXXFLAGS=$(shell llvm-config --cppflags) +LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/) + +BUILD_DATE=$(shell date +%Y%m%d) +BUILD_VERSION=$(shell git log | head -1) + +CXX=g++ +CPP=cpp +CXXFLAGS=-g3 $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \ + -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" + +LDFLAGS= +ifeq ($(ARCH),Linux) + # try to link everything statically under Linux (including libstdc++) so + # that the binaries we generate will be portable across distributions... + LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4 +endif + +LEX=flex +YACC=bison -d -v -t + +########################################################################### + +CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \ + llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \ + util.cpp +HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \ + opt.h stmt.h sym.h type.h util.h +STDLIB_SRC=stdlib-avx.ll stdlib-sse2.ll stdlib-sse4.ll stdlib-sse4x2.ll +BISON_SRC=parse.yy +FLEX_SRC=lex.ll + +OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(STDLIB_SRC:.ll=.o) stdlib-c.o stdlib_ispc.o \ + $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) + +default: ispc ispc_test + +.PHONY: dirs clean depend doxygen +.PRECIOUS: objs/stdlib-%.cpp + +depend: $(CXX_SRC) $(HEADERS) + @echo Updating dependencies + @gcc -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend + +-include depend + +dirs: + @echo Creating objs/ directory + @/bin/mkdir -p objs + +clean: + /bin/rm -rf objs ispc ispc_test + +doxygen: + /bin/rm -rf docs/doxygen + doxygen doxygen.cfg + +ispc: dirs $(OBJS) + @echo Creating ispc executable + @$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(LLVM_LIBS) + +ispc_test: 
dirs ispc_test.cpp + @echo Creating ispc_test executable + @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(LLVM_LIBS) + +objs/%.o: %.cpp + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + +objs/parse.cc: parse.yy + @echo Running bison on $< + @$(YACC) -o $@ $< + +objs/parse.o: objs/parse.cc $(HEADERS) + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + +objs/lex.cpp: lex.ll + @echo Running flex on $< + @$(LEX) -o $@ $< + +objs/lex.o: objs/lex.cpp $(HEADERS) + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + +$(STDLIB_SRC): stdlib.m4 + +objs/stdlib-%.cpp: stdlib-%.ll + @echo Creating C++ source from stdlib file $< + @m4 stdlib.m4 $< | ./bitcode2cpp.py $< > $@ + +objs/stdlib-%.o: objs/stdlib-%.cpp + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + +objs/stdlib-c.cpp: stdlib-c.c + @echo Creating C++ source from stdlib file $< + @$(CLANG) -I /opt/l1om/usr/include/ -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py $< > $@ + +objs/stdlib-c.o: objs/stdlib-c.cpp + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + +objs/stdlib_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< + @$(CPP) -DISPC=1 -DPI=3.1415936535 $< | ./stdlib2cpp.py > $@ + +objs/stdlib_ispc.o: objs/stdlib_ispc.cpp + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< diff --git a/READMErst.txt b/READMErst.txt new file mode 100644 index 00000000..def29dc8 --- /dev/null +++ b/READMErst.txt @@ -0,0 +1,22 @@ +============================== +Intel(r) SPMD Program Compiler +============================== + +Welcome to the Intel(r) SPMD Program Compiler (ispc)! + +ispc is a new compiler for "single program, multiple data" (SPMD) +programs. Under the SPMD model, the programmer writes a program that mostly +appears to be a regular serial program, though the execution model is +actually that a number of program instances execute in parallel on the +hardware. 
ispc compiles a C-based SPMD programming language to run on the +SIMD units of CPUs; it frequently provides a a 3x or more speedup on CPUs +with 4-wide SSE units, without any of the difficulty of writing intrinsics +code. + +ispc is an open source compiler under the BSD license; see the file +LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and +x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets, +though support for AVX should be available soon. + +For more information and examples, as well as a wiki and the bug database, +see the ispc distribution site, http://ispc.github.com. diff --git a/bitcode2cpp.py b/bitcode2cpp.py new file mode 100755 index 00000000..b61f6f8e --- /dev/null +++ b/bitcode2cpp.py @@ -0,0 +1,34 @@ +#!/usr/bin/python + +import sys +import string +import re +import subprocess + +length=0 + +src=str(sys.argv[1]) + +target = re.sub(".*stdlib-", "", src) +target = re.sub("\.ll$", "", target) +target = re.sub("\.c$", "", target) +target = re.sub("-", "_", target) + +try: + as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE) +except IOError: + print >> sys.stderr, "Couldn't open " + src + sys.exit(1) + +print "unsigned char stdlib_bitcode_" + target + "[] = {" +for line in as_out.stdout.readlines(): + length = length + len(line) + for c in line: + print ord(c) + print ", " +print " 0 };\n\n" +print "int stdlib_bitcode_" + target + "_length = " + str(length) + ";\n" + +as_out.wait() + +sys.exit(as_out.returncode) diff --git a/builtins.cpp b/builtins.cpp new file mode 100644 index 00000000..d2a49c7e --- /dev/null +++ b/builtins.cpp @@ -0,0 +1,617 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file builtins.cpp + @brief Definitions of functions related to setting up the standard library + and other builtins. 
+*/ + +#include "builtins.h" +#include "type.h" +#include "util.h" +#include "sym.h" +#include "expr.h" +#include "llvmutil.h" +#include "module.h" +#include "ctx.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern int yyparse(); +struct yy_buffer_state; +extern yy_buffer_state *yy_scan_string(const char *); + + +/** Given an LLVM type, try to find the equivalent ispc type. Note that + this is an under-constrained problem due to LLVM's type representations + carrying less information than ispc's. (For example, LLVM doesn't + distinguish between signed and unsigned integers in its types.) + + However, because this function is only used for generating ispc + declarations of functions defined in LLVM bitcode in the stdlib-*.ll + files, in practice we can get enough of what we need for the relevant + cases to make things work. + */ +static const Type * +lLLVMTypeToISPCType(const llvm::Type *t) { + if (t == LLVMTypes::VoidType) + return AtomicType::Void; + else if (t == LLVMTypes::BoolType) + return AtomicType::UniformBool; + else if (t == LLVMTypes::Int32Type) + return AtomicType::UniformInt32; + else if (t == LLVMTypes::FloatType) + return AtomicType::UniformFloat; + else if (t == LLVMTypes::DoubleType) + return AtomicType::UniformDouble; + else if (t == LLVMTypes::Int64Type) + return AtomicType::UniformInt64; + else if (t == LLVMTypes::Int32VectorType) + return AtomicType::VaryingInt32; + else if (t == LLVMTypes::FloatVectorType) + return AtomicType::VaryingFloat; + else if (t == LLVMTypes::DoubleVectorType) + return AtomicType::VaryingDouble; + else if (t == LLVMTypes::Int64VectorType) + return AtomicType::VaryingInt64; + else if (t == LLVMTypes::Int32PointerType) + return new ReferenceType(AtomicType::UniformInt32, false); + else if (t == LLVMTypes::FloatPointerType) + return new ReferenceType(AtomicType::UniformFloat, false); + else if (t == LLVMTypes::Int32VectorPointerType) + return new 
ReferenceType(AtomicType::VaryingInt32, false); + else if (t == LLVMTypes::FloatVectorPointerType) + return new ReferenceType(AtomicType::VaryingFloat, false); + else if (llvm::isa(t)) { + const llvm::PointerType *pt = llvm::dyn_cast(t); + + // Is it a pointer to an unsized array of objects? If so, then + // create the equivalent ispc type. Note that it has to be a + // reference to an array, since ispc passes arrays to functions by + // reference. + // + // FIXME: generalize this to do more than uniform int32s (that's + // all that's necessary for the stdlib currently.) + const llvm::ArrayType *at = + llvm::dyn_cast(pt->getElementType()); + if (at && at->getNumElements() == 0 && + at->getElementType() == LLVMTypes::Int32Type) + return new ReferenceType(new ArrayType(AtomicType::UniformInt32, 0), + false); + } + + return NULL; +} + + +/** Given an LLVM function declaration, synthesize the equivalent ispc + symbol for the function (if possible). Returns true on success, false + on failure. + */ +static bool +lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) { + SourcePos noPos; + noPos.name = "__stdlib"; + + const llvm::FunctionType *ftype = func->getFunctionType(); + std::string name = func->getName(); + + const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType()); + if (!returnType) + // return type not representable in ispc -> not callable from ispc + return false; + + // Iterate over the arguments and try to find their equivalent ispc + // types. 
+ std::vector argTypes; + for (unsigned int i = 0; i < ftype->getNumParams(); ++i) { + const llvm::Type *llvmArgType = ftype->getParamType(i); + const Type *type = lLLVMTypeToISPCType(llvmArgType); + if (type == NULL) + return false; + argTypes.push_back(type); + } + + FunctionType *funcType = new FunctionType(returnType, argTypes, noPos); + Symbol *sym = new Symbol(name, noPos, funcType); + sym->function = func; + symbolTable->AddFunction(sym); + return true; +} + + +/** Given an LLVM module, create ispc symbols for the functions in the + module. + */ +static void +lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) { +#if 0 + // FIXME: handle globals? + assert(module->global_empty()); +#endif + + llvm::Module::iterator iter; + for (iter = module->begin(); iter != module->end(); ++iter) { + llvm::Function *func = iter; + lCreateISPCSymbol(func, symbolTable); + } +} + +/** Declare the function symbol 'bool __is_compile_time_constant_mask(mask type)'. + This function will never be defined; it's just a placeholder + that will be handled during the optimization process. See the + discussion of the implementation of CompileTimeConstantResolvePass for + more details. + */ +static void +lDeclareCompileTimeConstant(llvm::Module *module) { + SourcePos noPos; + noPos.name = "__stdlib"; + + std::vector argTypes; + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::BoolType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__is_compile_time_constant_mask", module); + func->setOnlyReadsMemory(true); + func->setDoesNotThrow(true); +} + + +/** Declare the 'pseudo-gather' functions. 
When the ispc front-end needs + to perform a gather, it generates a call to one of these functions, + which have signatures: + + varying int32 __pseudo_gather(varying int32 *, mask) + varying int64 __pseudo_gather(varying int64 *, mask) + + These functions are never actually implemented; the + GatherScatterFlattenOpt optimization pass finds them and then converts + them to make calls to the following functions, which represent gathers + from a common base pointer with offsets. This approach allows the + front-end to be relatively simple in how it emits address calculation + for gathers. + + varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base, + int32 offsets, mask) + varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base, + int64 offsets, mask) + + Then, the GSImprovementsPass optimizations finds these and either + converts them to native gather functions or converts them to vector + loads, if equivalent. + */ +static void +lDeclarePseudoGathers(llvm::Module *module) { + SourcePos noPos; + noPos.name = "__stdlib"; + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerVectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_gather_32", module); + func->setOnlyReadsMemory(true); + func->setDoesNotThrow(true); + + fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false); + func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_gather_64", module); + func->setOnlyReadsMemory(true); + func->setDoesNotThrow(true); + } + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::Int32VectorType, 
argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_gather_base_offsets_32", module); + func->setOnlyReadsMemory(true); + func->setDoesNotThrow(true); + + fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false); + func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_gather_base_offsets_64", module); + func->setOnlyReadsMemory(true); + func->setDoesNotThrow(true); + } +} + + +/** Similarly to the 'pseudo-gathers' defined by lDeclarePseudoGathers(), + we also declare (but never define) pseudo-scatter instructions with + signatures: + + void __pseudo_scatter_32(varying int32 *, varying int32 values, mask) + void __pseudo_scatter_64(varying int64 *, varying int64 values, mask) + + The GatherScatterFlattenOpt optimization pass also finds these and + transforms them to scatters like: + + void __pseudo_scatter_base_offsets_32(uniform int32 *base, + varying int32 offsets, varying int32 values, mask) + void __pseudo_scatter_base_offsets_64(uniform int64 *base, + varying int62 offsets, varying int64 values, mask) + + And the GSImprovementsPass in turn converts these to actual native + scatters or masked stores. 
+*/ +static void +lDeclarePseudoScatters(llvm::Module *module) { + SourcePos noPos; + noPos.name = "__stdlib"; + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerVectorType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_scatter_32", module); + func->setDoesNotThrow(true); + } + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerVectorType); + argTypes.push_back(LLVMTypes::Int64VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_scatter_64", module); + func->setDoesNotThrow(true); + } + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_scatter_base_offsets_32", module); + func->setDoesNotThrow(true); + } + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::Int64VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_scatter_base_offsets_64", module); + func->setDoesNotThrow(true); + } +} + + +/** This function 
declares placeholder masked store functions for the + front-end to use. + + void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask) + void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask) + + These in turn are converted to native masked stores or to regular + stores (if the mask is all on) by the MaskedStoreOptPass optimization + pass. + */ +static void +lDeclarePseudoMaskedStore(llvm::Module *module) { + SourcePos noPos; + noPos.name = "__stdlib"; + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::Int32VectorPointerType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_masked_store_32", module); + func->setDoesNotThrow(true); + func->addFnAttr(llvm::Attribute::AlwaysInline); + func->setDoesNotCapture(1, true); + } + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::Int64VectorPointerType); + argTypes.push_back(LLVMTypes::Int64VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_masked_store_64", module); + func->setDoesNotThrow(true); + func->addFnAttr(llvm::Attribute::AlwaysInline); + func->setDoesNotCapture(1, true); + } +} + + +/** This utility function takes serialized binary LLVM bitcode and adds its + definitions to the given module. Functions in the bitcode that can be + mapped to ispc functions are also added to the symbol table. + + @param bitcode Binary LLVM bitcode (e.g. 
the contents of a *.bc file) + @param length Length of the bitcode buffer + @param module Module to link the bitcode into + @param symbolTable Symbol table to add definitions to + */ +static void +lAddBitcode(const unsigned char *bitcode, int length, + llvm::Module *module, SymbolTable *symbolTable) { + std::string bcErr; + llvm::StringRef sb = llvm::StringRef((char *)bitcode, length); + llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb); + llvm::Module *bcModule = llvm::ParseBitcodeFile(bcBuf, *g->ctx, &bcErr); + if (!bcModule) + Error(SourcePos(), "Error parsing stdlib bitcode: %s", bcErr.c_str()); + else { + std::string(linkError); + if (llvm::Linker::LinkModules(module, bcModule, &linkError)) + Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str()); + lAddModuleSymbols(module, symbolTable); + } +} + + +/** Utility routine that defines a constant int32 with given value, adding + the symbol to both the ispc symbol table and the given LLVM module. + */ +static void +lDefineConstantInt(const char *name, int val, llvm::Module *module, + SymbolTable *symbolTable) { + Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32); + pw->isStatic = true; + pw->constValue = new ConstExpr(pw->type, val, SourcePos()); + const llvm::Type *ltype = LLVMTypes::Int32Type; + llvm::Constant *linit = LLVMInt32(val); + pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true, + llvm::GlobalValue::InternalLinkage, + linit, pw->name.c_str()); + symbolTable->AddVariable(pw); +} + + +static void +lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) { + Symbol *pidx = new Symbol("programIndex", SourcePos(), + AtomicType::VaryingConstInt32); + pidx->isStatic = true; + + int pi[ISPC_MAX_NVEC]; + for (int i = 0; i < g->target.vectorWidth; ++i) + pi[i] = i; + pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos()); + + const llvm::Type *ltype = LLVMTypes::Int32VectorType; + llvm::Constant *linit = LLVMInt32Vector(pi); + 
pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true, + llvm::GlobalValue::InternalLinkage, linit, + pidx->name.c_str()); + symbolTable->AddVariable(pidx); +} + + +void +DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module, + bool includeStdlibISPC) { + // Add the definitions from the compiled stdlib-c.c file + extern unsigned char stdlib_bitcode_c[]; + extern int stdlib_bitcode_c_length; + lAddBitcode(stdlib_bitcode_c, stdlib_bitcode_c_length, module, symbolTable); + + // Next, add the target's custom implementations of the various needed + // builtin functions (e.g. __masked_store_32(), etc). + switch (g->target.isa) { + case Target::SSE2: + extern unsigned char stdlib_bitcode_sse2[]; + extern int stdlib_bitcode_sse2_length; + lAddBitcode(stdlib_bitcode_sse2, stdlib_bitcode_sse2_length, module, + symbolTable); + break; + case Target::SSE4: + extern unsigned char stdlib_bitcode_sse4[]; + extern int stdlib_bitcode_sse4_length; + extern unsigned char stdlib_bitcode_sse4x2[]; + extern int stdlib_bitcode_sse4x2_length; + switch (g->target.vectorWidth) { + case 4: + lAddBitcode(stdlib_bitcode_sse4, stdlib_bitcode_sse4_length, + module, symbolTable); + break; + case 8: + lAddBitcode(stdlib_bitcode_sse4x2, stdlib_bitcode_sse4x2_length, + module, symbolTable); + break; + default: + FATAL("logic error in DefineStdlib"); + } + break; + case Target::AVX: + extern unsigned char stdlib_bitcode_avx[]; + extern int stdlib_bitcode_avx_length; + lAddBitcode(stdlib_bitcode_avx, stdlib_bitcode_avx_length, module, + symbolTable); + break; + default: + FATAL("logic error"); + } + + // Add a declaration of void *ISPCMalloc(int64_t). The user is + // responsible for linking in a definition of this if it's needed by + // the compiled program. 
+ { std::vector argTypes; + argTypes.push_back(llvm::Type::getInt64Ty(*ctx)); + llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, + argTypes, false); + llvm::Function *func = + llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, + "ISPCMalloc", module); + func->setDoesNotThrow(true); + } + + // Add a declaration of void ISPCFree(void *). The user is + // responsible for linking in a definition of this if it's needed by + // the compiled program. + { std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerType); + llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, + argTypes, false); + llvm::Function *func = + llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, + "ISPCFree", module); + func->setDoesNotThrow(true); + } + + // Add a declaration of void ISPCLaunch(void *funcPtr, void *data). + // The user is responsible for linking in a definition of this if it's + // needed by the compiled program. + { std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerType); + argTypes.push_back(LLVMTypes::VoidPointerType); + llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, + argTypes, false); + llvm::Function *func = + llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, + "ISPCLaunch", module); + func->setDoesNotThrow(true); + } + + // Add a declaration of void ISPCSync(). The user is responsible for + // linking in a definition of this if it's needed by the compiled + // program. + { + std::vector argTypes; + llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, + argTypes, false); + llvm::Function *func = + llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, + "ISPCSync", module); + func->setDoesNotThrow(true); + } + + // Add a declaration of void ISPCInstrument(void *, void *, int, int). + // The user is responsible for linking in a definition of this if it's + // needed by the compiled program. 
+ { + std::vector argTypes; + argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0)); + argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0)); + argTypes.push_back(LLVMTypes::Int32Type); + argTypes.push_back(LLVMTypes::Int32Type); + llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, + argTypes, false); + llvm::Function *func = + llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, + "ISPCInstrument", module); + func->setDoesNotThrow(true); + } + + // Declare various placeholder functions that the optimizer will later + // find and replace with something more useful. + lDeclareCompileTimeConstant(module); + lDeclarePseudoGathers(module); + lDeclarePseudoScatters(module); + lDeclarePseudoMaskedStore(module); + + // define the 'programCount' builtin variable + lDefineConstantInt("programCount", g->target.vectorWidth, module, symbolTable); + + // define the 'programIndex' builtin + lDefineProgramIndex(module, symbolTable); + + // Define __math_lib stuff. This is used by stdlib.ispc, for example, to + // figure out which math routines to end up calling... + lDefineConstantInt("__math_lib", (int)g->mathLib, module, symbolTable); + lDefineConstantInt("__math_lib_ispc", (int)Globals::Math_ISPC, module, + symbolTable); + lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, + module, symbolTable); + lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module, + symbolTable); + lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, + symbolTable); + + if (includeStdlibISPC) { + // If the user wants the standard library to be included, parse the + // serialized version of the stdlib.ispc file to get its definitions + // added. 
+ extern const char *stdlib_code; + yy_scan_string(stdlib_code); + yyparse(); + } +} diff --git a/builtins.h b/builtins.h new file mode 100644 index 00000000..485cc369 --- /dev/null +++ b/builtins.h @@ -0,0 +1,58 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/** @file builtins.h + @brief Declarations of functions related to builtins and the + standard library +*/ + +#ifndef ISPC_STDLIB_H +#define ISPC_STDLIB_H 1 + +#include "ispc.h" + +/** Adds declarations and definitions of ispc standard library functions + and types to the given module. + + @param symbolTable SymbolTable in which to add symbol definitions for + stdlib stuff + @param ctx llvm::LLVMContext to use for getting types and the + like for standard library definitions + @param module Module in which to add the declarations/definitions + @param includeStdlib Indicates whether the definitions from the stdlib.ispc + file should be added to the module. + */ +void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module, + bool includeStdlib); + +#endif // ISPC_STDLIB_H diff --git a/ctx.cpp b/ctx.cpp new file mode 100644 index 00000000..eb2d4d81 --- /dev/null +++ b/ctx.cpp @@ -0,0 +1,1903 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file ctx.cpp + @brief Implementation of the FunctionEmitContext class +*/ + +#include "ctx.h" +#include "util.h" +#include "llvmutil.h" +#include "type.h" +#include "stmt.h" +#include "expr.h" +#include "module.h" +#include "sym.h" +#include +#include +#include +#include +#include +#include + +/** This is a small utility structure that records information related to one + level of nested control flow. It's mostly used in correctly restoring + the mask and other state as we exit control flow nesting levels. +*/ +struct CFInfo { + /** Returns a new instance of the structure that represents entering an + 'if' statement */ + static CFInfo *GetIf(bool isUniform, llvm::Value *savedMask); + + /** Returns a new instance of the structure that represents entering a + loop. 
*/ + static CFInfo *GetLoop(bool isUniform, llvm::BasicBlock *breakTarget, + llvm::BasicBlock *continueTarget, + llvm::Value *savedBreakLanesPtr, + llvm::Value *savedContinueLanesPtr, + llvm::Value *savedMask, llvm::Value *savedLoopMask); + + bool IsIf() { return type == If; } + bool IsLoop() { return type == Loop; } + bool IsVaryingType() { return !isUniform; } + bool IsUniform() { return isUniform; } + + enum CFType { If, Loop }; + CFType type; + bool isUniform; + llvm::BasicBlock *savedBreakTarget, *savedContinueTarget; + llvm::Value *savedBreakLanesPtr, *savedContinueLanesPtr; + llvm::Value *savedMask, *savedLoopMask; + +private: + CFInfo(CFType t, bool uniformIf, llvm::Value *sm) { + assert(t == If); + type = t; + isUniform = uniformIf; + savedBreakTarget = savedContinueTarget = NULL; + savedBreakLanesPtr = savedContinueLanesPtr = NULL; + savedMask = savedLoopMask = sm; + } + CFInfo(CFType t, bool iu, llvm::BasicBlock *bt, llvm::BasicBlock *ct, + llvm::Value *sb, llvm::Value *sc, llvm::Value *sm, + llvm::Value *lm) { + assert(t == Loop); + type = t; + isUniform = iu; + savedBreakTarget = bt; + savedContinueTarget = ct; + savedBreakLanesPtr = sb; + savedContinueLanesPtr = sc; + savedMask = sm; + savedLoopMask = lm; + } +}; + + +CFInfo * +CFInfo::GetIf(bool isUniform, llvm::Value *savedMask) { + return new CFInfo(If, isUniform, savedMask); +} + + +CFInfo * +CFInfo::GetLoop(bool isUniform, llvm::BasicBlock *breakTarget, + llvm::BasicBlock *continueTarget, + llvm::Value *savedBreakLanesPtr, + llvm::Value *savedContinueLanesPtr, + llvm::Value *savedMask, llvm::Value *savedLoopMask) { + return new CFInfo(Loop, isUniform, breakTarget, continueTarget, + savedBreakLanesPtr, savedContinueLanesPtr, + savedMask, savedLoopMask); +} + +/////////////////////////////////////////////////////////////////////////// + +FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *function, + Symbol *funSym, SourcePos firstStmtPos) { + /* Create a new basic block to 
store all of the allocas */ + allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", function, 0); + bblock = llvm::BasicBlock::Create(*g->ctx, "entry", function, 0); + /* But jump from it immediately into the real entry block */ + llvm::BranchInst::Create(bblock, allocaBlock); + + maskPtr = AllocaInst(LLVMTypes::MaskType, "mask_memory"); + StoreInst(LLVMMaskAllOn, maskPtr); + + funcStartPos = funSym->pos; + returnType = rt; + entryMask = NULL; + loopMask = NULL; + breakLanesPtr = continueLanesPtr = NULL; + breakTarget = continueTarget = NULL; + + returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory"); + StoreInst(LLVMMaskAllOff, returnedLanesPtr); + + if (!returnType || returnType == AtomicType::Void) + returnValuePtr = NULL; + else { + const llvm::Type *ftype = returnType->LLVMType(g->ctx); + returnValuePtr = AllocaInst(ftype, "return_value_memory"); + // FIXME: don't do this store??? + StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr); + } + +#ifndef LLVM_2_8 + if (m->diBuilder) { + /* If debugging is enabled, tell the debug information emission + code about this new function */ + diFile = funcStartPos.GetDIFile(); + llvm::DIType retType = rt->GetDIType(diFile); + int flags = llvm::DIDescriptor::FlagPrototyped; // ?? + diFunction = m->diBuilder->createFunction(diFile, /* scope */ + function->getName(), // mangled + funSym->name, + diFile, + funcStartPos.first_line, + retType, + funSym->isStatic, + true, /* is definition */ + flags, + g->opt.level > 0, + function); + /* And start a scope representing the initial function scope */ + StartScope(); + } +#endif // LLVM_2_8 + + launchedTasks = false; + + // connect the function's mask memory to the __mask symbol + Symbol *maskSymbol = m->symbolTable->LookupVariable("__mask"); + assert(maskSymbol != NULL); + maskSymbol->storagePtr = maskPtr; + +#ifndef LLVM_2_8 + // add debugging info for __mask, programIndex, ...
+ if (m->diBuilder) { + maskSymbol->pos = funcStartPos; + EmitVariableDebugInfo(maskSymbol); + + llvm::DIFile file = funcStartPos.GetDIFile(); + Symbol *programIndexSymbol = m->symbolTable->LookupVariable("programIndex"); + assert(programIndexSymbol && programIndexSymbol->storagePtr); + m->diBuilder->createGlobalVariable(programIndexSymbol->name, + file, + funcStartPos.first_line, + programIndexSymbol->type->GetDIType(file), + true /* static */, + programIndexSymbol->storagePtr); + + Symbol *programCountSymbol = m->symbolTable->LookupVariable("programCount"); + assert(programCountSymbol); + m->diBuilder->createGlobalVariable(programCountSymbol->name, + file, + funcStartPos.first_line, + programCountSymbol->type->GetDIType(file), + true /* static */, + programCountSymbol->storagePtr); + } +#endif +} + + +FunctionEmitContext::~FunctionEmitContext() { + assert(controlFlowInfo.size() == 0); +#ifndef LLVM_2_8 + assert(debugScopes.size() == (m->diBuilder ? 1 : 0)); +#endif +} + + +llvm::BasicBlock * +FunctionEmitContext::GetCurrentBasicBlock() { + return bblock; +} + + +void +FunctionEmitContext::SetCurrentBasicBlock(llvm::BasicBlock *bb) { + bblock = bb; +} + + +llvm::Value * +FunctionEmitContext::GetMask() { + return LoadInst(maskPtr, NULL, "load_mask"); +} + + +void +FunctionEmitContext::SetEntryMask(llvm::Value *value) { + entryMask = value; + SetMask(value); +} + + +void +FunctionEmitContext::SetLoopMask(llvm::Value *value) { + loopMask = value; +} + + +void +FunctionEmitContext::SetMask(llvm::Value *value) { + StoreInst(value, maskPtr); +} + + +void +FunctionEmitContext::MaskAnd(llvm::Value *oldMask, llvm::Value *test) { + llvm::Value *mask = BinaryOperator(llvm::Instruction::And, oldMask, + test, "oldMask&test"); + SetMask(mask); +} + + +void +FunctionEmitContext::MaskAndNot(llvm::Value *oldMask, llvm::Value *test) { + llvm::Value *notTest = BinaryOperator(llvm::Instruction::Xor, test, LLVMMaskAllOn, + "~test"); + llvm::Value *mask = 
BinaryOperator(llvm::Instruction::And, oldMask, notTest, + "oldMask&~test"); + SetMask(mask); +} + + +void +FunctionEmitContext::BranchIfMaskAny(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) { + assert(bblock != NULL); + llvm::Value *any = Any(GetMask()); + BranchInst(btrue, bfalse, any); + // It's illegal to add any additional instructions to the basic block + // now that it's terminated, so set bblock to NULL to be safe + bblock = NULL; +} + + +void +FunctionEmitContext::BranchIfMaskAll(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) { + assert(bblock != NULL); + llvm::Value *all = All(GetMask()); + BranchInst(btrue, bfalse, all); + // It's illegal to add any additional instructions to the basic block + // now that it's terminated, so set bblock to NULL to be safe + bblock = NULL; +} + + +void +FunctionEmitContext::BranchIfMaskNone(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) { + assert(bblock != NULL); + // switch sense of true/false bblocks + BranchIfMaskAny(bfalse, btrue); + // It's illegal to add any additional instructions to the basic block + // now that it's terminated, so set bblock to NULL to be safe + bblock = NULL; +} + + +void +FunctionEmitContext::StartUniformIf(llvm::Value *oldMask) { + controlFlowInfo.push_back(CFInfo::GetIf(true, oldMask)); +} + + +void +FunctionEmitContext::StartVaryingIf(llvm::Value *oldMask) { + controlFlowInfo.push_back(CFInfo::GetIf(false, oldMask)); +} + + +void +FunctionEmitContext::EndIf() { + // Make sure we match up with a Start{Uniform,Varying}If(). + assert(controlFlowInfo.size() > 0 && controlFlowInfo.back()->IsIf()); + CFInfo *ci = controlFlowInfo.back(); + controlFlowInfo.pop_back(); + + // 'uniform' ifs don't change the mask so we only need to restore the + // mask going into the if for 'varying' if statements + if (!ci->IsUniform() && bblock != NULL) { + // We can't just restore the mask as it was going into the 'if' + // statement. 
First we have to take into account any program + // instances that have executed 'return' statements; the restored + // mask must be off for those lanes. + restoreMaskGivenReturns(ci->savedMask); + + // If the 'if' statement is inside a loop with a 'varying' + // condition, we also need to account for any break or continue + // statements that executed inside the 'if' statement; we also must + // leave the lane masks for the program instances that ran those + // off after we restore the mask after the 'if'. The code below + // ends up being optimized out in the case that there were no break + // or continue statements (and breakLanesPtr and continueLanesPtr + // have their initial 'all off' values), so we don't need to check + // for that here. + if (breakLanesPtr != NULL) { + assert(continueLanesPtr != NULL); + + // newMask = (oldMask & ~(breakLanes | continueLanes)) + llvm::Value *oldMask = GetMask(); + llvm::Value *breakLanes = LoadInst(breakLanesPtr, NULL, + "break_lanes"); + llvm::Value *continueLanes = LoadInst(continueLanesPtr, NULL, + "continue_lanes"); + llvm::Value *breakOrContinueLanes = + BinaryOperator(llvm::Instruction::Or, breakLanes, continueLanes, + "break|continue_lanes"); + llvm::Value *notBreakOrContinue = NotOperator(breakOrContinueLanes, + "!(break|continue)_lanes"); + llvm::Value *newMask = + BinaryOperator(llvm::Instruction::And, oldMask, notBreakOrContinue, + "new_mask"); + SetMask(newMask); + } + } +} + + +void +FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct, + bool uniformCF, llvm::Value *oldMask) { + // Store the current values of various loop-related state so that we + // can restore it when we exit this loop.
+ controlFlowInfo.push_back(CFInfo::GetLoop(uniformCF, breakTarget, + continueTarget, breakLanesPtr, + continueLanesPtr, oldMask, loopMask)); + if (uniformCF) + // If the loop has a uniform condition, we don't need to track + // which lanes 'break' or 'continue'; all of the running ones go + // together, so we just jump + breakLanesPtr = continueLanesPtr = NULL; + else { + // For loops with varying conditions, allocate space to store masks + // that record which lanes have done these + continueLanesPtr = AllocaInst(LLVMTypes::MaskType, "continue_lanes_memory"); + StoreInst(LLVMMaskAllOff, continueLanesPtr); + breakLanesPtr = AllocaInst(LLVMTypes::MaskType, "break_lanes_memory"); + StoreInst(LLVMMaskAllOff, breakLanesPtr); + } + + breakTarget = bt; + continueTarget = ct; + loopMask = NULL; // this better be set by the loop! +} + + +void +FunctionEmitContext::EndLoop() { + assert(controlFlowInfo.size() && !controlFlowInfo.back()->IsIf()); + CFInfo *ci = controlFlowInfo.back(); + controlFlowInfo.pop_back(); + + // Restore the break/continue state information to what it was before + // we went into this loop. + breakTarget = ci->savedBreakTarget; + continueTarget = ci->savedContinueTarget; + breakLanesPtr = ci->savedBreakLanesPtr; + continueLanesPtr = ci->savedContinueLanesPtr; + loopMask = ci->savedLoopMask; + + if (!ci->IsUniform()) + // If the loop had a 'uniform' test, then it didn't make any + // changes to the mask so there's nothing to restore. If it had a + // varying test, we need to restore the mask to what it was going + // into the loop, but still leaving off any lanes that executed a + // 'return' statement. + restoreMaskGivenReturns(ci->savedMask); +} + + +void +FunctionEmitContext::restoreMaskGivenReturns(llvm::Value *oldMask) { + if (!bblock) + return; + + // Restore the mask to the given old mask, but leave off any lanes that + // executed a return statement. 
+ // newMask = (oldMask & ~returnedLanes) + llvm::Value *returnedLanes = LoadInst(returnedLanesPtr, NULL, "returned_lanes"); + llvm::Value *notReturned = NotOperator(returnedLanes, "~returned_lanes"); + llvm::Value *newMask = BinaryOperator(llvm::Instruction::And, + oldMask, notReturned, "new_mask"); + SetMask(newMask); +} + + +void +FunctionEmitContext::Break(bool doCoherenceCheck) { + if (breakTarget == NULL) { + Error(currentPos, "\"break\" statement is illegal outside of for/while/do loops."); + return; + } + + // If all of the enclosing 'if' tests in the loop have uniform control + // flow or if we can tell that the mask is all on, then we can just + // jump to the break location. + if (ifsInLoopAllUniform() || GetMask() == LLVMMaskAllOn) { + BranchInst(breakTarget); + if (ifsInLoopAllUniform() && doCoherenceCheck) + Warning(currentPos, "Coherent break statement not necessary in fully uniform " + "control flow."); + // Set bblock to NULL since the jump has terminated the basic block + bblock = NULL; + } + else { + // Otherwise we need to update the mask of the lanes that have + // executed a 'break' statement: + // breakLanes = breakLanes | mask + assert(breakLanesPtr != NULL); + llvm::Value *mask = GetMask(); + llvm::Value *breakMask = LoadInst(breakLanesPtr, NULL, "break_mask"); + llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, + mask, breakMask, "mask|break_mask"); + StoreInst(newMask, breakLanesPtr); + + // Set the current mask to be all off, just in case there are any + // statements in the same scope after the 'break'. Most of time + // this will be optimized away since we'll likely end the scope of + // an 'if' statement and restore the mask then. + SetMask(LLVMMaskAllOff); + + if (doCoherenceCheck) + // If the user has indicated that this is a 'coherent' break + // statement, then check to see if the mask is all off. 
If so, + // we have to conservatively jump to the continueTarget, not + // the breakTarget, since part of the reason the mask is all + // off may be due to 'continue' statements that executed in the + // current loop iteration. + // FIXME: if the loop only has break statements and no + // continues, we can jump to breakTarget in that case. + jumpIfAllLoopLanesAreDone(continueTarget); + } +} + + +void +FunctionEmitContext::Continue(bool doCoherenceCheck) { + if (!continueTarget) { + Error(currentPos, "\"continue\" statement illegal outside of for/while/do loops."); + return; + } + + if (ifsInLoopAllUniform() || GetMask() == LLVMMaskAllOn) { + // Similarly to 'break' statements, we can immediately jump to the + // continue target if we're only in 'uniform' control flow within + // loop or if we can tell that the mask is all on. + AddInstrumentationPoint("continue: uniform CF, jumped"); + if (ifsInLoopAllUniform() && doCoherenceCheck) + Warning(currentPos, "Coherent continue statement not necessary in fully uniform " + "control flow."); + BranchInst(continueTarget); + bblock = NULL; + } + else { + // Otherwise update the stored value of which lanes have 'continue'd. + // continueLanes = continueLanes | mask + assert(continueLanesPtr); + llvm::Value *mask = GetMask(); + llvm::Value *continueMask = + LoadInst(continueLanesPtr, NULL, "continue_mask"); + llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, + mask, continueMask, "mask|continueMask"); + StoreInst(newMask, continueLanesPtr); + + // And set the current mask to be all off in case there are any + // statements in the same scope after the 'continue' + SetMask(LLVMMaskAllOff); + + if (doCoherenceCheck) + // If this is a 'coherent continue' statement, then emit the + // code to see if all of the lanes are now off due to + // breaks/continues and jump to the continue target if so. 
+ jumpIfAllLoopLanesAreDone(continueTarget); + } +} + + +/** This function checks to see if all of the 'if' statements (if any) + between the current scope and the first enclosing loop have 'uniform' + tests. + */ +bool +FunctionEmitContext::ifsInLoopAllUniform() const { + assert(controlFlowInfo.size() > 0); + // Go backwards through controlFlowInfo, since we add new nested scopes + // to the back. Stop once we come to the first enclosing loop. + int i = controlFlowInfo.size() - 1; + while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Loop) { + if (controlFlowInfo[i]->isUniform == false) + // Found a scope due to an 'if' statement with a varying test + return false; + --i; + } + assert(i >= 0); // else we didn't find a loop! + return true; +} + + +void +FunctionEmitContext::jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target) { + // Check to see if (returned lanes | continued lanes | break lanes) is + // equal to the value of mask at the start of the loop iteration. If + // so, everyone is done and we can jump to the given target + llvm::Value *returned = LoadInst(returnedLanesPtr, NULL, "returned_lanes"); + llvm::Value *continued = LoadInst(continueLanesPtr, NULL, "continue_lanes"); + llvm::Value *breaked = LoadInst(breakLanesPtr, NULL, "break_lanes"); + llvm::Value *returnedOrContinued = BinaryOperator(llvm::Instruction::Or, + returned, continued, + "returned|continued"); + llvm::Value *returnedOrContinuedOrBreaked = + BinaryOperator(llvm::Instruction::Or, returnedOrContinued, + breaked, "returned|continued"); + + // Do we match the mask at loop entry? + llvm::Value *allRCB = MasksAllEqual(returnedOrContinuedOrBreaked, loopMask); + llvm::BasicBlock *bAll = CreateBasicBlock("all_continued_or_breaked"); + llvm::BasicBlock *bNotAll = CreateBasicBlock("not_all_continued_or_breaked"); + BranchInst(bAll, bNotAll, allRCB); + + // If so, have an extra basic block along the way to add + // instrumentation, if the user asked for it. 
+ bblock = bAll; + AddInstrumentationPoint("break/continue: all dynamically went"); + BranchInst(target); + + // And set the current basic block to a new one for future instructions + // for the path where we weren't able to jump + bblock = bNotAll; + AddInstrumentationPoint("break/continue: not all went"); +} + + +void +FunctionEmitContext::RestoreContinuedLanes() { + if (continueLanesPtr == NULL) + return; + + // mask = mask & continueFlags + llvm::Value *mask = GetMask(); + llvm::Value *continueMask = LoadInst(continueLanesPtr, NULL, "continue_mask"); + llvm::Value *orMask = BinaryOperator(llvm::Instruction::Or, + mask, continueMask, "mask|continue_mask"); + SetMask(orMask); + + // continueLanes = 0 + StoreInst(LLVMMaskAllOff, continueLanesPtr); +} + + +int +FunctionEmitContext::VaryingCFDepth() const { + int sum = 0; + for (unsigned int i = 0; i < controlFlowInfo.size(); ++i) + if (controlFlowInfo[i]->IsVaryingType()) + ++sum; + return sum; +} + + +void +FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) { + if (returnType == AtomicType::Void) { + if (expr != NULL) + Error(expr->pos, "Can't return non-void type \"%s\" from void function.", + expr->GetType()->GetString().c_str()); + } + else { + if (expr == NULL) { + Error(funcStartPos, + "Must provide return value for return statement for non-void function."); + return; + } + + // Use a masked store to store the value of the expression in the + // return value memory; this preserves the return values from other + // lanes that may have executed return statements previously. 
+ Expr *r = expr->TypeConv(returnType, "return statement"); + if (r != NULL) { + llvm::Value *retVal = r->GetValue(this); + StoreInst(retVal, returnValuePtr, GetMask(), returnType); + } + } + + if (VaryingCFDepth() == 0) { + // If there is only uniform control flow between us and the + // function entry, then it's guaranteed that all lanes are running, + // so we can just emit a true return instruction + AddInstrumentationPoint("return: uniform control flow"); + ReturnInst(); + } + else { + // Otherwise we update the returnedLanes value by ORing it with + // the current lane mask. + llvm::Value *oldReturnedLanes = LoadInst(returnedLanesPtr, NULL, + "old_returned_lanes"); + llvm::Value *newReturnedLanes = BinaryOperator(llvm::Instruction::Or, + oldReturnedLanes, + GetMask(), "old_mask|returned_lanes"); + + // For 'coherent' return statements, emit code to check if all + // lanes have returned + if (doCoherenceCheck) { + // if newReturnedLanes == entryMask, get out of here! + llvm::Value *cmp = MasksAllEqual(entryMask, newReturnedLanes); + llvm::BasicBlock *bDoReturn = CreateBasicBlock("do_return"); + llvm::BasicBlock *bNoReturn = CreateBasicBlock("no_return"); + BranchInst(bDoReturn, bNoReturn, cmp); + + bblock = bDoReturn; + AddInstrumentationPoint("return: all lanes have returned"); + ReturnInst(); + + bblock = bNoReturn; + } + // Otherwise update returnedLanesPtr and turn off all of the lanes + // in the current mask so that any subsequent statements in the + // same scope after the return have no effect + StoreInst(newReturnedLanes, returnedLanesPtr); + AddInstrumentationPoint("return: some but not all lanes have returned"); + SetMask(LLVMMaskAllOff); + } +} + + +llvm::Value * +FunctionEmitContext::Any(llvm::Value *mask) { + llvm::Value *mmval = LaneMask(mask); + return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, mmval, + LLVMInt32(0), "any_mm_cmp"); +} + + +llvm::Value * +FunctionEmitContext::All(llvm::Value *mask) { + llvm::Value *mmval =
LaneMask(mask); + return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval, + LLVMInt32((1<target.vectorWidth)-1), "all_mm_cmp"); +} + + +llvm::Value * +FunctionEmitContext::LaneMask(llvm::Value *v) { + // Call the target-dependent movmsk function to turn the vector mask + // into an i32 value + std::vector *mm = m->symbolTable->LookupFunction("__movmsk"); + assert(mm && mm->size() == 1); + llvm::Function *fmm = (*mm)[0]->function; + return CallInst(fmm, v, "val_movmsk"); +} + + +llvm::Value * +FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { + // Compare the two masks to get a vector of i1s + llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + v1, v2, "v1==v2"); + // Turn that into a bool vector type (often i32s) + cmp = I1VecToBoolVec(cmp); + // And see if it's all on + return All(cmp); +} + + +llvm::Value * +FunctionEmitContext::GetStringPtr(const std::string &str) { + llvm::Constant *lstr = llvm::ConstantArray::get(*g->ctx, str); + llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::InternalLinkage; + llvm::Value *lstrPtr = new llvm::GlobalVariable(*m->module, lstr->getType(), + true /*isConst*/, + linkage, lstr, "__str"); + return new llvm::BitCastInst(lstrPtr, LLVMTypes::VoidPointerType, + "str_void_ptr", bblock); +} + + +llvm::BasicBlock * +FunctionEmitContext::CreateBasicBlock(const char *name) { + llvm::Function *function = bblock->getParent(); + return llvm::BasicBlock::Create(*g->ctx, name, function); +} + + +llvm::Value * +FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { + const llvm::ArrayType *at = llvm::dyn_cast(b->getType()); + if (at) { + // If we're given an array of vectors of i1s, then do the + // conversion for each of the elements + const llvm::Type *boolArrayType = + llvm::ArrayType::get(LLVMTypes::BoolVectorType, at->getNumElements()); + llvm::Value *ret = llvm::UndefValue::get(boolArrayType); + + for (unsigned int i = 0; i < at->getNumElements(); ++i) { + llvm::Value 
*elt = ExtractInst(b, i); + llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, + "val_to_boolvec32"); + ret = InsertInst(ret, sext, i); + } + return ret; + } + else + return SExtInst(b, LLVMTypes::BoolVectorType, "val_to_boolvec32"); +} + + +llvm::Value * +FunctionEmitContext::EmitMalloc(const llvm::Type *ty) { + // Emit code to compute the size of the given type using a GEP with a + // NULL base pointer, indexing one element of the given type, and + // casting the resulting 'pointer' to an int giving its size. + const llvm::Type *ptrType = llvm::PointerType::get(ty, 0); + llvm::Value *nullPtr = llvm::Constant::getNullValue(ptrType); + llvm::Value *index[1] = { LLVMInt32(1) }; + llvm::Value *poffset = llvm::GetElementPtrInst::Create(nullPtr, &index[0], &index[1], + "offset_ptr", bblock); + AddDebugPos(poffset); + llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int"); + + // And given the size, call the malloc function + llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc"); + assert(fmalloc != NULL); + llvm::Value *mem = CallInst(fmalloc, sizeOf, "raw_argmem"); + // Cast the void * back to the result pointer type + return BitCastInst(mem, ptrType, "mem_bitcast"); +} + + +void +FunctionEmitContext::EmitFree(llvm::Value *ptr) { + llvm::Value *freeArg = BitCastInst(ptr, LLVMTypes::VoidPointerType, + "argmemfree"); + llvm::Function *ffree = m->module->getFunction("ISPCFree"); + assert(ffree != NULL); + CallInst(ffree, freeArg); +} + + +static llvm::Value * +lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) { + llvm::Constant *sConstant = llvm::ConstantArray::get(*g->ctx, s); + llvm::Value *sPtr = new llvm::GlobalVariable(*m->module, sConstant->getType(), + true /* const */, + llvm::GlobalValue::InternalLinkage, + sConstant, s); + llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(0) }; + return llvm::GetElementPtrInst::Create(sPtr, &indices[0], &indices[2], + "sptr", bblock); +} + + +void 
+FunctionEmitContext::AddInstrumentationPoint(const char *note) { + assert(note != NULL); + if (!g->emitInstrumentation) + return; + + std::vector args; + // arg 1: filename as string + args.push_back(lGetStringAsValue(bblock, currentPos.name)); + // arg 2: provided note + args.push_back(lGetStringAsValue(bblock, note)); + // arg 3: line number + args.push_back(LLVMInt32(currentPos.first_line)); + // arg 4: current mask, movmsk'ed down to an int32 + args.push_back(LaneMask(GetMask())); + + llvm::Function *finst = m->module->getFunction("ISPCInstrument"); + CallInst(finst, args, ""); +} + + +void +FunctionEmitContext::SetDebugPos(SourcePos pos) { + currentPos = pos; +} + + +SourcePos +FunctionEmitContext::GetDebugPos() const { + return currentPos; +} + + +void +FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos, + llvm::DIScope *scope) { +#ifndef LLVM_2_8 + llvm::Instruction *inst = llvm::dyn_cast(value); + if (inst != NULL && m->diBuilder) { + SourcePos p = pos ? *pos : currentPos; + if (p.first_line != 0) + // If first_line == 0, then we're in the middle of setting up + // the standard library or the like; don't add debug positions + // for those functions + inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column, + scope ? 
*scope : GetDIScope())); + } +#endif +} + + +void +FunctionEmitContext::StartScope() { +#ifndef LLVM_2_8 + if (m->diBuilder != NULL) { + llvm::DIScope parentScope; + if (debugScopes.size() > 0) + parentScope = debugScopes.back(); + else + parentScope = diFunction; + + llvm::DILexicalBlock lexicalBlock = + m->diBuilder->createLexicalBlock(parentScope, diFile, + currentPos.first_line, + currentPos.first_column); + debugScopes.push_back(lexicalBlock); + } +#endif +} + + +void +FunctionEmitContext::EndScope() { +#ifndef LLVM_2_8 + if (m->diBuilder != NULL) { + assert(debugScopes.size() > 0); + debugScopes.pop_back(); + } +#endif +} + + +llvm::DIScope +FunctionEmitContext::GetDIScope() const { + assert(debugScopes.size() > 0); + return debugScopes.back(); +} + + +void +FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) { +#ifndef LLVM_2_8 + if (m->diBuilder == NULL) + return; + + llvm::DIScope scope = GetDIScope(); + llvm::DIVariable var = + m->diBuilder->createLocalVariable(llvm::dwarf::DW_TAG_auto_variable, + scope, + sym->name, + sym->pos.GetDIFile(), + sym->pos.first_line, + sym->type->GetDIType(scope), + true /* preserve through opts */); + llvm::Instruction *declareInst = + m->diBuilder->insertDeclare(sym->storagePtr, var, bblock); + AddDebugPos(declareInst, &sym->pos, &scope); +#endif +} + + +void +FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) { +#ifndef LLVM_2_8 + if (m->diBuilder == NULL) + return; + + llvm::DIScope scope = diFunction; + llvm::DIVariable var = + m->diBuilder->createLocalVariable(llvm::dwarf::DW_TAG_arg_variable, + scope, + sym->name, + sym->pos.GetDIFile(), + sym->pos.first_line, + sym->type->GetDIType(scope), + true /* preserve through opts */); + llvm::Instruction *declareInst = + m->diBuilder->insertDeclare(sym->storagePtr, var, bblock); + AddDebugPos(declareInst, &sym->pos, &scope); +#endif +} + + +/** If the given type is an array of vector types, then it's the + representation of an ispc VectorType with varying 
elements. If it is + one of these, return the array size (i.e. the VectorType's size). + Otherwise return zero. + */ +static int +lArrayVectorWidth(const llvm::Type *t) { + const llvm::ArrayType *arrayType = llvm::dyn_cast(t); + if (arrayType == NULL) + return 0; + + // We shouldn't be seeing arrays of anything but vectors being passed + // to things like FunctionEmitContext::BinaryOperator() as operands + const llvm::VectorType *vectorElementType = + llvm::dyn_cast(arrayType->getElementType()); + assert(vectorElementType != NULL && + (int)vectorElementType->getNumElements() == g->target.vectorWidth); + return (int)arrayType->getNumElements(); +} + + +llvm::Value * +FunctionEmitContext::BinaryOperator(llvm::Instruction::BinaryOps inst, + llvm::Value *v0, llvm::Value *v1, + const char *name) { + if (v0 == NULL || v1 == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + assert(v0->getType() == v1->getType()); + const llvm::Type *type = v0->getType(); + int arraySize = lArrayVectorWidth(type); + if (arraySize == 0) { + llvm::Instruction *bop = + llvm::BinaryOperator::Create(inst, v0, v1, name ? name : "", bblock); + AddDebugPos(bop); + return bop; + } + else { + // If this is an ispc VectorType, apply the binary operator to each + // of the elements of the array (which in turn should be either + // scalar types or llvm::VectorTypes.) + llvm::Value *ret = llvm::UndefValue::get(type); + for (int i = 0; i < arraySize; ++i) { + llvm::Value *a = ExtractInst(v0, i); + llvm::Value *b = ExtractInst(v1, i); + llvm::Value *op = BinaryOperator(inst, a, b); + ret = InsertInst(ret, op, i); + } + return ret; + } +} + + +llvm::Value * +FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) { + if (v == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // Similarly to BinaryOperator, do the operation on all the elements of + // the array if we're given an array type; otherwise just do the + // regular llvm operation. 
+ const llvm::Type *type = v->getType(); + int arraySize = lArrayVectorWidth(type); + if (arraySize == 0) { + llvm::Instruction *binst = + llvm::BinaryOperator::CreateNot(v, name ? name : "not", bblock); + AddDebugPos(binst); + return binst; + } + else { + llvm::Value *ret = llvm::UndefValue::get(type); + for (int i = 0; i < arraySize; ++i) { + llvm::Value *a = ExtractInst(v, i); + llvm::Value *op = + llvm::BinaryOperator::CreateNot(a, name ? name : "not", bblock); + AddDebugPos(op); + ret = InsertInst(ret, op, i); + } + return ret; + } +} + + +// Given the llvm Type that represents an ispc VectorType, return an +// equally-shaped type with boolean elements. (This is the type that will +// be returned from CmpInst with ispc VectorTypes). +static const llvm::Type * +lGetMatchingBoolVectorType(const llvm::Type *type) { + const llvm::ArrayType *arrayType = + llvm::dyn_cast(type); + // should only be called for vector typed stuff... + assert(arrayType != NULL); + + const llvm::VectorType *vectorElementType = + llvm::dyn_cast(arrayType->getElementType()); + assert(vectorElementType != NULL && + (int)vectorElementType->getNumElements() == g->target.vectorWidth); + + const llvm::Type *base = llvm::VectorType::get(LLVMTypes::BoolType, + g->target.vectorWidth); + return llvm::ArrayType::get(base, arrayType->getNumElements()); +} + + +llvm::Value * +FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst, + llvm::CmpInst::Predicate pred, + llvm::Value *v0, llvm::Value *v1, + const char *name) { + if (v0 == NULL || v1 == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + assert(v0->getType() == v1->getType()); + const llvm::Type *type = v0->getType(); + int arraySize = lArrayVectorWidth(type); + if (arraySize == 0) { + llvm::Instruction *ci = + llvm::CmpInst::Create(inst, pred, v0, v1, name ? 
name : "cmp", + bblock); + AddDebugPos(ci); + return ci; + } + else { + const llvm::Type *boolType = lGetMatchingBoolVectorType(type); + llvm::Value *ret = llvm::UndefValue::get(boolType); + for (int i = 0; i < arraySize; ++i) { + llvm::Value *a = ExtractInst(v0, i); + llvm::Value *b = ExtractInst(v1, i); + llvm::Value *op = CmpInst(inst, pred, a, b, name); + ret = InsertInst(ret, op, i); + } + return ret; + } +} + + +llvm::Value * +FunctionEmitContext::BitCastInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + const llvm::Type *valType = value->getType(); + const llvm::ArrayType *at = llvm::dyn_cast(valType); + if (at && llvm::isa(at->getElementType())) { + // If we're bitcasting an array of pointers, we have a varying + // lvalue; apply the corresponding bitcast to each of the + // individual pointers and return the result array. + assert((int)at->getNumElements() == g->target.vectorWidth); + + llvm::Value *ret = + llvm::UndefValue::get(llvm::ArrayType::get(type, g->target.vectorWidth)); + for (int i = 0; i < g->target.vectorWidth; ++i) { + llvm::Value *elt = ExtractInst(value, i); + llvm::Value *bc = BitCastInst(elt, type, name); + ret = InsertInst(ret, bc, i); + } + return ret; + } + else { + llvm::Instruction *inst = + new llvm::BitCastInst(value, type, name ? name : "bitcast", bblock); + AddDebugPos(inst); + return inst; + } +} + + +llvm::Instruction * +FunctionEmitContext::PtrToIntInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + new llvm::PtrToIntInst(value, type, name ? 
name : "ptr2int", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::IntToPtrInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + new llvm::IntToPtrInst(value, type, name ? name : "int2ptr", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::TruncInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + new llvm::TruncInst(value, type, name ? name : "trunc", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value, + const llvm::Type *type, const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + llvm::CastInst::Create(op, value, type, name ? name : "cast", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::FPCastInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + llvm::CastInst::CreateFPCast(value, type, name ? 
name : "fpcast", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::SExtInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + new llvm::SExtInst(value, type, name ? name : "sext", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::ZExtInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + new llvm::ZExtInst(value, type, name ? name : "zext", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Value * +FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0, + llvm::Value *index1, const char *name) { + if (basePtr == NULL || index0 == NULL || index1 == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // FIXME: do we need need to handle the case of the first index being + // varying? It's currently needed... 
+ assert(!llvm::isa(index0->getType())); + + const llvm::Type *basePtrType = basePtr->getType(); + const llvm::ArrayType *baseArrayType = + llvm::dyn_cast(basePtrType); + bool baseIsVaryingTypePointer = (baseArrayType != NULL) && + llvm::isa(baseArrayType->getElementType()); + bool indexIsVaryingType = llvm::isa(index1->getType()); + + if (!indexIsVaryingType && !baseIsVaryingTypePointer) { + // The easy case: both the base pointer and the indices are + // uniform, so just emit the regular LLVM GEP instruction + llvm::Value *indices[2] = { index0, index1 }; + llvm::Instruction *inst = + llvm::GetElementPtrInst::Create(basePtr, &indices[0], &indices[2], + name ? name : "gep", bblock); + AddDebugPos(inst); + return inst; + } + else { + // We have a varying pointer and/or indices; emit the appropriate + // GEP for each of the program instances + llvm::Value *lret = NULL; + for (int i = 0; i < g->target.vectorWidth; ++i) { + // Get the index, either using the same one if it's uniform or + // the one for this lane if it's varying + llvm::Value *indexElt; + if (indexIsVaryingType) + indexElt = ExtractInst(index1, i, "get_array_index"); + else + indexElt = index1; + + // Similarly figure out the appropriate base pointer + llvm::Value *aptr; + if (baseIsVaryingTypePointer) + aptr = ExtractInst(basePtr, i, "get_array_index"); + else + aptr = basePtr; + + // Do the GEP for this lane + llvm::Value *eltPtr = GetElementPtrInst(aptr, index0, indexElt, name); + + if (lret == NULL) { + // This is kind of a hack: use the type from the GEP to + // figure out the return type and the first time through, + // create an undef value of that type here + const llvm::PointerType *elementPtrType = + llvm::dyn_cast(eltPtr->getType()); + const llvm::Type *elementType = elementPtrType->getElementType(); + lret = llvm::UndefValue::get(LLVMPointerVectorType(elementType)); + } + + // And insert the result of the GEP into the return value + lret = InsertInst(lret, eltPtr, i, "elt_ptr_store"); + } + 
return lret; + } +} + + +llvm::Value * +FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, int v0, int v1, + const char *name) { + return GetElementPtrInst(basePtr, LLVMInt32(v0), LLVMInt32(v1), name); +} + + +llvm::Value * +FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type, + const char *name) { + if (lvalue == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + if (llvm::isa(lvalue->getType())) { + // If the lvalue is a straight up regular pointer, then just issue + // a regular load + llvm::Instruction *inst = new llvm::LoadInst(lvalue, name ? name : "load", bblock); + AddDebugPos(inst); + return inst; + } + else { + // Otherwise we should have a varying lvalue and it's time for a + // gather. The "type" parameter only has to be non-NULL for the + // gather path here (we can't reliably figure out all of the type + // information we need from the LLVM::Type, so have to carry the + // ispc type in through this path.. + assert(type != NULL); + assert(llvm::isa(lvalue->getType())); + return gather(lvalue, type, name); + } +} + + +llvm::Value * +FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type, + const char *name) { + // We should have a varying lvalue if we get here... + assert(llvm::dyn_cast(lvalue->getType())); + + const llvm::Type *retType = type->LLVMType(g->ctx); + + const StructType *st = dynamic_cast(type); + if (st) { + // If we're gathering structures, do an element-wise gather + // recursively. 
+ llvm::Value *retValue = llvm::UndefValue::get(retType); + for (int i = 0; i < st->NumElements(); ++i) { + llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i); + // This in turn will be another gather + llvm::Value *eltValues = LoadInst(eltPtrs, st->GetMemberType(i), + name); + retValue = InsertInst(retValue, eltValues, i, "set_value"); + } + return retValue; + } + + const VectorType *vt = dynamic_cast(type); + if (vt) { + // Similarly, if it's a vector type, do a gather for each of the + // vector elements + llvm::Value *retValue = llvm::UndefValue::get(retType); + // FIXME: yuck. Change lvalues to be pointers to arrays so that + // the GEP stuff in the loop below ends up computing pointers based + // on elements in the vectors rather than incorrectly advancing to + // the next vector... + const llvm::Type *eltType = + vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx); + lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0)); + + for (int i = 0; i < vt->GetElementCount(); ++i) { + llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i); + llvm::Value *eltValues = LoadInst(eltPtrs, vt->GetBaseType(), name); + retValue = InsertInst(retValue, eltValues, i, "set_value"); + } + return retValue; + } + + const ArrayType *at = dynamic_cast(type); + if (at) { + // Arrays are also handled recursively and element-wise + llvm::Value *retValue = llvm::UndefValue::get(retType); + for (int i = 0; i < at->GetElementCount(); ++i) { + llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i); + llvm::Value *eltValues = LoadInst(eltPtrs, at->GetElementType(), name); + retValue = InsertInst(retValue, eltValues, i, "set_value"); + } + return retValue; + } + + // Otherwise we should just have a basic scalar type and we can go and + // do the actual gather + AddInstrumentationPoint("gather"); + + llvm::Value *mask = GetMask(); + llvm::Function *gather = NULL; + // Figure out which gather function to call based on the size of + // the elements; 
will need to generalize this for 8 and 16-bit + // types. + if (retType == LLVMTypes::DoubleVectorType || + retType == LLVMTypes::Int64VectorType) + gather = m->module->getFunction("__pseudo_gather_64"); + else { + assert(retType == LLVMTypes::FloatVectorType || + retType == LLVMTypes::Int32VectorType); + gather = m->module->getFunction("__pseudo_gather_32"); + } + assert(gather); + + llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType); + llvm::Instruction *call = CallInst(gather, voidlvalue, mask, name); + // Add metadata about the source file location so that the + // optimization passes can print useful performance warnings if we + // can't optimize out this gather + addGSMetadata(call, currentPos); + + llvm::Value *val = BitCastInst(call, retType, "gather_bitcast"); + + return val; +} + + +/** Add metadata to the given instruction to encode the current source file + position. This data is used in the lGetSourcePosFromMetadata() + function in opt.cpp. +*/ +void +FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) { + llvm::Value *str = llvm::MDString::get(*g->ctx, pos.name); +#ifdef LLVM_2_8 + llvm::MDNode *md = llvm::MDNode::get(*g->ctx, &str, 1); +#else + llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str); +#endif + inst->setMetadata("filename", md); + + llvm::Value *line = LLVMInt32(pos.first_line); +#ifdef LLVM_2_8 + md = llvm::MDNode::get(*g->ctx, &first_line, 1); +#else + md = llvm::MDNode::get(*g->ctx, line); +#endif + inst->setMetadata("line", md); + + llvm::Value *column = LLVMInt32(pos.first_column); +#ifdef LLVM_2_8 + md = llvm::MDNode::get(*g->ctx, &first_column, 1); +#else + md = llvm::MDNode::get(*g->ctx, column); +#endif + inst->setMetadata("column", md); +} + + +llvm::Value * +FunctionEmitContext::AllocaInst(const llvm::Type *llvmType, const char *name, + int align, bool atEntryBlock) { + llvm::AllocaInst *inst = NULL; + if (atEntryBlock) { + // We usually insert it right before the jump instruction at 
the + // end of allocaBlock + llvm::Instruction *retInst = allocaBlock->getTerminator(); + assert(retInst); + inst = new llvm::AllocaInst(llvmType, name ? name : "", retInst); + } + else + // Unless the caller overrode the default and wants it in the + // current basic block + inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock); + + if (align != 0) + inst->setAlignment(align); + // Don't add debugging info to alloca instructions + return inst; +} + + +/** Code to store the given varying value to the given location, only + storing the elements that correspond to active program instances as + given by the provided storeMask value. Note that the lvalue is only a + single pointer, not a varying lvalue of one pointer per program + instance (that case is handled by scatters). + */ +void +FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue, + const Type *rvalueType, + llvm::Value *storeMask) { + if (rvalue == NULL || lvalue == NULL) { + assert(m->errorCount > 0); + return; + } + + assert(llvm::isa(lvalue->getType())); + + const StructType *structType = dynamic_cast(rvalueType); + if (structType != NULL) { + // Assigning a structure + for (int i = 0; i < structType->NumElements(); ++i) { + llvm::Value *eltValue = ExtractInst(rvalue, i, "rvalue_member"); + llvm::Value *eltLValue = GetElementPtrInst(lvalue, 0, i, + "struct_lvalue_ptr"); + StoreInst(eltValue, eltLValue, storeMask, + structType->GetMemberType(i)); + } + return; + } + + const SequentialType *sequentialType = + dynamic_cast(rvalueType); + if (sequentialType != NULL) { + // Assigning arrays and vectors. 
Handle each element individually + // with what turns into a recursive call to makedStore() + for (int i = 0; i < sequentialType->GetElementCount(); ++i) { + llvm::Value *eltLValue = GetElementPtrInst(lvalue, 0, i, "lval_i_ptr"); + llvm::Value *eltValue = ExtractInst(rvalue, i, "array_i_val"); + StoreInst(eltValue, eltLValue, storeMask, + sequentialType->GetElementType()); + } + return; + } + + // We must have a regular atomic type at this point + assert(dynamic_cast(rvalueType) != NULL); + rvalueType = rvalueType->GetAsNonConstType(); + + llvm::Function *maskedStoreFunc = NULL; + // Figure out if we need a 32-bit or 64-bit masked store. This + // will need to be generalized when/if 8 and 16-bit data types are + // added. + if (rvalueType == AtomicType::VaryingDouble || + rvalueType == AtomicType::VaryingInt64 || + rvalueType == AtomicType::VaryingUInt64) { + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64"); + lvalue = BitCastInst(lvalue, LLVMTypes::Int64VectorPointerType, + "lvalue_to_int64vecptr"); + rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, + "rvalue_to_int64"); + } + else { + assert(rvalueType == AtomicType::VaryingFloat || + rvalueType == AtomicType::VaryingBool || + rvalueType == AtomicType::VaryingInt32 || + rvalueType == AtomicType::VaryingUInt32); + + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32"); + lvalue = BitCastInst(lvalue, LLVMTypes::Int32VectorPointerType, + "lvalue_to_int32vecptr"); + if (rvalueType == AtomicType::VaryingFloat) + rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, + "rvalue_to_int32"); + } + + std::vector args; + args.push_back(lvalue); + args.push_back(rvalue); + args.push_back(storeMask); + CallInst(maskedStoreFunc, args); +} + + + +/** Scatter the given varying value to the locations given by the varying + lvalue (which should be an array of pointers with size equal to the + target's vector width. 
We want to store each rvalue element at the + corresponding pointer's location, *if* the mask for the corresponding + program instance are on. If they're off, don't do anything. +*/ +void +FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue, + llvm::Value *storeMask, const Type *rvalueType) { + assert(rvalueType->IsVaryingType()); + assert(llvm::isa(lvalue->getType())); + + const StructType *structType = dynamic_cast(rvalueType); + if (structType) { + // Scatter the struct elements individually + for (int i = 0; i < structType->NumElements(); ++i) { + llvm::Value *lv = GetElementPtrInst(lvalue, 0, i); + llvm::Value *rv = ExtractInst(rvalue, i); + scatter(rv, lv, storeMask, structType->GetMemberType(i)); + } + return; + } + + const VectorType *vt = dynamic_cast(rvalueType); + if (vt) { + // FIXME: yuck. Change lvalues to be pointers to arrays so that + // the GEP stuff in the loop below ends up computing pointers based + // on elements in the vectors rather than incorrectly advancing to + // the next vector... + const llvm::Type *eltType = vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx); + lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0)); + + for (int i = 0; i < vt->GetElementCount(); ++i) { + llvm::Value *lv = GetElementPtrInst(lvalue, 0, i); + llvm::Value *rv = ExtractInst(rvalue, i); + scatter(rv, lv, storeMask, vt->GetElementType()); + } + return; + } + + // I think this should be impossible + assert(dynamic_cast(rvalueType) == NULL); + + // And everything should be atomic from here on out... 
+ assert(dynamic_cast(rvalueType) != NULL); + + llvm::Function *func = NULL; + const llvm::Type *type = rvalue->getType(); + if (type == LLVMTypes::DoubleVectorType || + type == LLVMTypes::Int64VectorType) { + func = m->module->getFunction("__pseudo_scatter_64"); + rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, "rvalue2int"); + } + else { + // FIXME: if this hits, presumably it's due to needing int8 and/or + // int16 versions of scatter... + assert(type == LLVMTypes::FloatVectorType || + type == LLVMTypes::Int32VectorType); + func = m->module->getFunction("__pseudo_scatter_32"); + rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, "rvalue2int"); + } + assert(func != NULL); + + AddInstrumentationPoint("scatter"); + + llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType); + std::vector args; + args.push_back(voidlvalue); + args.push_back(rvalue); + args.push_back(storeMask); + llvm::Instruction *inst = CallInst(func, args); + addGSMetadata(inst, currentPos); +} + + +void +FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, + const char *name) { + if (rvalue == NULL || lvalue == NULL) { + // may happen due to error elsewhere + assert(m->errorCount > 0); + return; + } + + llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, name, bblock); + AddDebugPos(inst); +} + + +void +FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, + llvm::Value *storeMask, const Type *rvalueType, + const char *name) { + if (rvalue == NULL || lvalue == NULL) { + // may happen due to error elsewhere + assert(m->errorCount > 0); + return; + } + + // Figure out what kind of store we're doing here + if (rvalueType->IsUniformType()) { + // The easy case; a regular store + llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, name, bblock); + AddDebugPos(si); + } + else if (llvm::isa(lvalue->getType())) + // We have a varying lvalue (an array of pointers), so it's time to + // scatter + scatter(rvalue, 
lvalue, storeMask, rvalueType); + else if (storeMask == LLVMMaskAllOn) { + // Otherwise it is a masked store unless we can determine that the + // mask is all on... + llvm::Instruction *si = + new llvm::StoreInst(rvalue, lvalue, name, bblock); + AddDebugPos(si); + } + else + maskedStore(rvalue, lvalue, rvalueType, storeMask); +} + + +void +FunctionEmitContext::BranchInst(llvm::BasicBlock *dest) { + llvm::Instruction *b = llvm::BranchInst::Create(dest, bblock); + AddDebugPos(b); +} + + +void +FunctionEmitContext::BranchInst(llvm::BasicBlock *trueBlock, + llvm::BasicBlock *falseBlock, + llvm::Value *test) { + if (test == NULL) { + assert(m->errorCount > 0); + return; + } + + llvm::Instruction *b = + llvm::BranchInst::Create(trueBlock, falseBlock, test, bblock); + AddDebugPos(b); +} + + +llvm::Value * +FunctionEmitContext::ExtractInst(llvm::Value *v, int elt, const char *name) { + if (v == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Instruction *ei = NULL; + if (llvm::isa(v->getType())) + ei = llvm::ExtractElementInst::Create(v, LLVMInt32(elt), + name ? name : "extract", bblock); + else + ei = llvm::ExtractValueInst::Create(v, elt, name ? name : "extract", + bblock); + AddDebugPos(ei); + return ei; +} + + +llvm::Value * +FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, + const char *name) { + if (v == NULL || eltVal == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Instruction *ii = NULL; + if (llvm::isa(v->getType())) + ii = llvm::InsertElementInst::Create(v, eltVal, LLVMInt32(elt), + name ? name : "insert", bblock); + else + ii = llvm::InsertValueInst::Create(v, eltVal, elt, + name ? name : "insert", bblock); + AddDebugPos(ii); + return ii; +} + + +llvm::PHINode * +FunctionEmitContext::PhiNode(const llvm::Type *type, int count, + const char *name) { + llvm::PHINode *pn = llvm::PHINode::Create(type, +#if !defined(LLVM_2_8) && !defined(LLVM_2_9) + count, +#endif // !LLVM_2_8 && !LLVM_2_9 + name ? 
name : "phi", bblock); + AddDebugPos(pn); + return pn; +} + + +llvm::Instruction * +FunctionEmitContext::SelectInst(llvm::Value *test, llvm::Value *val0, + llvm::Value *val1, const char *name) { + if (test == NULL || val0 == NULL || val1 == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Instruction *inst = + llvm::SelectInst::Create(test, val0, val1, name ? name : "select", + bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::CallInst(llvm::Function *func, + const std::vector &args, + const char *name) { + if (func == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Instruction *ci = + llvm::CallInst::Create(func, args.begin(), args.end(), + name ? name : "", bblock); + AddDebugPos(ci); + return ci; +} + + +llvm::Instruction * +FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg, + const char *name) { + if (func == NULL || arg == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Value *args[] = { arg }; + llvm::Instruction *ci = + llvm::CallInst::Create(func, &args[0], &args[1], name ? name : "", + bblock); + AddDebugPos(ci); + return ci; +} + + +llvm::Instruction * +FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0, + llvm::Value *arg1, const char *name) { + if (func == NULL || arg0 == NULL || arg1 == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Value *args[] = { arg0, arg1 }; + llvm::Instruction *ci = + llvm::CallInst::Create(func, &args[0], &args[2], name ? 
name : "", + bblock); + AddDebugPos(ci); + return ci; +} + + +llvm::Instruction * +FunctionEmitContext::ReturnInst() { + if (launchedTasks) { + // Automatically add a sync call at the end of any function that + // launched tasks + SourcePos noPos; + noPos.name = "__auto_sync"; + ExprStmt *es = new ExprStmt(new SyncExpr(noPos), noPos); + es->EmitCode(this); + delete es; + } + + llvm::Instruction *rinst = NULL; + if (returnValuePtr != NULL) { + // We have value(s) to return; load them from their storage + // location + llvm::Value *retVal = LoadInst(returnValuePtr, returnType, + "return_value"); + rinst = llvm::ReturnInst::Create(*g->ctx, retVal, bblock); + } + else { + assert(returnType == AtomicType::Void); + rinst = llvm::ReturnInst::Create(*g->ctx, bblock); + } + + AddDebugPos(rinst); + bblock = NULL; + return rinst; +} + + +llvm::Instruction * +FunctionEmitContext::LaunchInst(llvm::Function *callee, + std::vector &argVals) { + if (callee == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + launchedTasks = true; + + const llvm::Type *argType = callee->arg_begin()->getType(); + assert(llvm::PointerType::classof(argType)); + const llvm::PointerType *pt = static_cast(argType); + assert(llvm::StructType::classof(pt->getElementType())); + const llvm::StructType *argStructType = + static_cast(pt->getElementType()); + assert(argStructType->getNumElements() == argVals.size() + 1); + + // Use alloca for space for the task args. KEY DETAIL: pass false + // to the call of FunctionEmitContext::AllocaInst so that the alloca + // doesn't happen just once at the top of the function, but happens + // each time the enclosing basic block executes. 
+ int align = 4 * RoundUpPow2(g->target.nativeVectorWidth); + llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false); + llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType); + + // Copy the values of the parameters into the appropriate place in + // the argument block + for (unsigned int i = 0; i < argVals.size(); ++i) { + llvm::Value *ptr = GetElementPtrInst(argmem, 0, i, "funarg"); + // don't need to do masked store here, I think + StoreInst(argVals[i], ptr); + } + + // copy in the mask + llvm::Value *mask = GetMask(); + llvm::Value *ptr = GetElementPtrInst(argmem, 0, argVals.size(), + "funarg_mask"); + StoreInst(mask, ptr); + + // And emit the call to the user-supplied task launch function, passing + // a pointer to the task function being called and a pointer to the + // argument block we just filled in + llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); + llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); + assert(flaunch != NULL); + return CallInst(flaunch, fptr, voidmem, ""); +} diff --git a/ctx.h b/ctx.h new file mode 100644 index 00000000..437c5e3f --- /dev/null +++ b/ctx.h @@ -0,0 +1,507 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file ctx.h + @brief Declaration of the FunctionEmitContext class +*/ + +#ifndef ISPC_CTX_H +#define ISPC_CTX_H 1 + +#include "ispc.h" +#include +#include +#ifndef LLVM_2_8 +#include +#endif +#include + +struct CFInfo; + +/** FunctionEmitContext is one of the key classes in ispc; it is used to + help with emitting the intermediate representation of a function during + compilation. It carries information the current program context during + IR emission (e.g. the basic block into which instructions should be + added; or, the current source file and line number, so debugging + symbols can be correctly generated). This class also provides a number + of helper routines that are useful for code that emits IR. + */ +class FunctionEmitContext { +public: + /** Create a new FunctionEmitContext. 
+ @param returnType The return type of the function + @param function LLVM function in the current module that corresponds + to the function + @param funSym Symbol that corresponds to the function + @param firstStmtPos Source file position of the first statement in the + function + */ + FunctionEmitContext(const Type *returnType, llvm::Function *function, Symbol *funSym, + SourcePos firstStmtPos); + ~FunctionEmitContext(); + + /** @name Current basic block management + @{ + */ + /** Returns the current basic block pointer */ + llvm::BasicBlock *GetCurrentBasicBlock(); + + /** Set the given llvm::BasicBlock to be the basic block to emit + forthcoming instructions into. */ + void SetCurrentBasicBlock(llvm::BasicBlock *bblock); + + /** @name Mask management + @{ + */ + /** Returns the current mask value */ + llvm::Value *GetMask(); + + /** Provides the value of the mask at function entry */ + void SetEntryMask(llvm::Value *val); + + /** Sets the mask to a new value */ + void SetMask(llvm::Value *val); + + /** Sets the mask to (oldMask & val) */ + void MaskAnd(llvm::Value *oldMask, llvm::Value *val); + + /** Sets the mask to (oldMask & ~val) */ + void MaskAndNot(llvm::Value *oldMask, llvm::Value *test); + + /** Emits a branch instruction to the basic block btrue if any of the + lanes of current mask are on and bfalse if none are on. */ + void BranchIfMaskAny(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse); + + /** Emits a branch instruction to the basic block btrue if all of the + lanes of current mask are on and bfalse if none are on. */ + void BranchIfMaskAll(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse); + + /** Emits a branch instruction to the basic block btrue if none of the + lanes of current mask are on and bfalse if none are on. 
 */ + void BranchIfMaskNone(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse); + /** @} */ + + /** @name Control flow management + @{ + */ + /** Notifies the FunctionEmitContext that we're starting emission of an + 'if' statement with a uniform test. The value of the mask going + into the 'if' statement is provided in the oldMask parameter. */ + void StartUniformIf(llvm::Value *oldMask); + + /** Notifies the FunctionEmitContext that we're starting emission of an + 'if' statement with a varying test. The value of the mask going + into the 'if' statement is provided in the oldMask parameter. */ + void StartVaryingIf(llvm::Value *oldMask); + + /** Notifies the FunctionEmitContext that we're done emitting the IR + for an 'if' statement. */ + void EndIf(); + + /** Notifies the FunctionEmitContext that we're starting to emit IR + for a loop. Basic blocks are provided for where 'break' and + 'continue' statements should jump to (if all running lanes want to + break or continue), uniformControlFlow indicates whether the loop + condition is 'uniform', and oldMask provides the current mask going + into the loop. */ + void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget, + bool uniformControlFlow, llvm::Value *oldMask); + + /** Informs FunctionEmitContext of the value of the mask at the start + of a loop body. */ + void SetLoopMask(llvm::Value *mask); + + /** Informs FunctionEmitContext that code generation for a loop is + finished. */ + void EndLoop(); + + /** Emit code for a 'break' statement in a loop. If doCoherenceCheck + is true, then if we're in a 'varying' loop, code will be emitted to + see if all of the lanes want to break, in which case a jump to the + break target will be taken. (For 'uniform' loops, the jump is + always done). */ + void Break(bool doCoherenceCheck); + + /** Emit code for a 'continue' statement in a loop. 
If + doCoherenceCheck is true, then if we're in a 'varying' loop, code + will be emitted to see if all of the lanes want to continue, in + which case a jump to the continue target will be taken. (For + 'uniform' loops, the jump is always done). */ + void Continue(bool doCoherenceCheck); + + /** This method is called by code emitting IR for a loop at the end of + the loop body; it restores the lanes of the mask that executed a + 'continue' statement when going through the loop body in the + previous iteration. */ + void RestoreContinuedLanes(); + + /** Returns the current number of nested levels of 'varying' control + flow */ + int VaryingCFDepth() const; + + /** Called to generate code for 'return' statement; value is the + expression in the return statement (if non-NULL), and + doCoherenceCheck indicates whether instructions should be generated + to see if all of the currently-running lanes have returned (if + we're under varying control flow). */ + void CurrentLanesReturned(Expr *value, bool doCoherenceCheck); + /** @} */ + + /** @name Small helper/utility routines + @{ + */ + /** Given a boolean mask value of type LLVMTypes::MaskType, return an + i1 value that indicates if any of the mask lanes are on. */ + llvm::Value *Any(llvm::Value *mask); + + /** Given a boolean mask value of type LLVMTypes::MaskType, return an + i1 value that indicates if all of the mask lanes are on. */ + llvm::Value *All(llvm::Value *mask); + + /** Given a boolean mask value of type LLVMTypes::MaskType, return an + i32 value wherein the i'th bit is on if and only if the i'th lane + of the mask is on. */ + llvm::Value *LaneMask(llvm::Value *mask); + + /** Given two masks of type LLVMTypes::MaskType, return an i1 value + that indicates whether the two masks are equal. */ + llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2); + + /** Given a string, create an anonymous global variable to hold its + value and return the pointer to the string. 
*/ + llvm::Value *GetStringPtr(const std::string &str); + + /** Create a new basic block with given name */ + llvm::BasicBlock *CreateBasicBlock(const char *name); + + /** Given a vector with element type i1, return a vector of type + LLVMTypes::BoolVectorType. This method handles the conversion for + the targets where the bool vector element type is, for example, + i32. */ + llvm::Value *I1VecToBoolVec(llvm::Value *b); + + /** Emit code to call the user-supplied ISPCMalloc function to + allocate space for an object of thee given type. Returns the + pointer value returned by the ISPCMalloc call. */ + llvm::Value *EmitMalloc(const llvm::Type *ty); + + /** Emit code to call the user-supplied ISPCFree function, passing it + the given pointer to storage previously allocated by an + EmitMalloc() call. */ + void EmitFree(llvm::Value *ptr); + + /** If the user has asked to compile the program with instrumentation, + this inserts a callback to the user-supplied instrumentation + function at the current point in the code. */ + void AddInstrumentationPoint(const char *note); + /** @} */ + + /** @name Debugging support + @{ + */ + /** Set the current source file position; subsequent emitted + instructions will have this position associated with them if + debugging information is being generated. */ + void SetDebugPos(SourcePos pos); + + SourcePos GetDebugPos() const; + + /** Adds debugging metadata to the given instruction. If pos == NULL, + use FunctionEmitContext::currentPos as the source file position for + the instruction. Similarly, if a DIScope is provided, it's used + and otherwise the scope is found from a GetDIScope() call. This + takes a llvm::Value for the instruction rather than an + llvm::Instruction for convenience; in calling code we often have + Instructions stored using Value pointers; the code here returns + silently if it's not actually given an instruction. 
*/ + void AddDebugPos(llvm::Value *instruction, const SourcePos *pos = NULL, + llvm::DIScope *scope = NULL); + + /** Inform the debugging information generation code that a new scope + is starting in the source program. */ + void StartScope(); + + /** Inform the debugging information generation code that the current + scope is ending in the source program. */ + void EndScope(); + + /** Returns the llvm::DIScope corresponding to the current program + scope. */ + llvm::DIScope GetDIScope() const; + + /** Emits debugging information for the variable represented by + sym. */ + void EmitVariableDebugInfo(Symbol *sym); + + /** Emits debugging information for the function parameter represented + by sym. */ + void EmitFunctionParameterDebugInfo(Symbol *sym); + /** @} */ + + /** @name IR instruction emission + @brief These methods generally closely correspond to LLVM IR + instructions. See the LLVM assembly language reference manual + (http://llvm.org/docs/LangRef.html) and the LLVM doxygen documentaion + (http://llvm.org/doxygen) for more information. Here we will only + document significant generalizations to the functionality of the + corresponding basic LLVM instructions. + + Beyond actually emitting the instruction, the implementations of + these methods in FunctionEmitContext also handle adding debugging + metadata if debugging symbols are enabled, adding the instructions + to the current basic block, and handling generalizations like + 'varying' lvalues, arithmetic operations with VectorType operands, + etc. + @{ + */ + /** Emit the binary operator given by the inst parameter. If + llvm::Values corresponding to VectorTypes are given as operands, + this also handles applying the given operation to the vector + elements. */ + llvm::Value *BinaryOperator(llvm::Instruction::BinaryOps inst, + llvm::Value *v0, llvm::Value *v1, + const char *name = NULL); + + /** Emit the "not" operator. Like BinaryOperator(), this also handles + a VectorType-based operand. 
*/ + llvm::Value *NotOperator(llvm::Value *v, const char *name = NULL); + + /** Emit a comparison instruction. If the operands are VectorTypes, + then a value for the corresponding boolean VectorType is + returned. */ + llvm::Value *CmpInst(llvm::Instruction::OtherOps inst, + llvm::CmpInst::Predicate pred, + llvm::Value *v0, llvm::Value *v1, const char *name = NULL); + + llvm::Value *BitCastInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *PtrToIntInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *IntToPtrInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *TruncInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value, + const llvm::Type *type, const char *name = NULL); + llvm::Instruction *FPCastInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *SExtInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *ZExtInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + + /** This GEP method is a generalization of the standard one in LLVM; it + supports both uniform and varying basePtr values (an array of + pointers) as well as uniform and varying index values (arrays of + indices). */ + llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0, + llvm::Value *index1, const char *name = NULL); + + /** This is a convenience method to generate a GEP instruction with + indices with values with known constant values as the ispc program + is being compiled. */ + llvm::Value *GetElementPtrInst(llvm::Value *basePtr, int v0, int v1, + const char *name = NULL); + + /** Load from the memory location(s) given by lvalue. 
The lvalue may + be varying, in which case this corresponds to a gather from the + multiple memory locations given by the array of pointer values + given by the lvalue. If the lvalue is not varying, then the type + parameter may be NULL. */ + llvm::Value *LoadInst(llvm::Value *lvalue, const Type *type, + const char *name = NULL); + + /** Emits an alloca instruction to allocate stack storage for the given + type. If a non-zero alignment is specified, the object is also + allocated at the given alignment. By default, the alloca + instruction is added at the start of the function in the entry + basic block; if it should be added to the current basic block, then + the atEntryBlock parameter should be false. */ + llvm::Value *AllocaInst(const llvm::Type *llvmType, const char *name = NULL, + int align = 0, bool atEntryBlock = true); + + /** Standard store instruction; for this variant, the lvalue must be a + single pointer, not a varying lvalue. */ + void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, + const char *name = NULL); + + /** In this variant of StoreInst(), the lvalue may be varying. If so, + this corresponds to a scatter. Whether the lvalue is uniform of + varying, the given storeMask is used to mask the stores so that + they only execute for the active program instances. */ + void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, + llvm::Value *storeMask, const Type *rvalueType, + const char *name = NULL); + + void BranchInst(llvm::BasicBlock *block); + void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock, + llvm::Value *test); + + /** This convenience method maps to an llvm::ExtractElementInst if the + given value is a llvm::VectorType, and to an llvm::ExtractValueInst + otherwise. */ + llvm::Value *ExtractInst(llvm::Value *v, int elt, const char *name = NULL); + + /** This convenience method maps to an llvm::InsertElementInst if the + given value is a llvm::VectorType, and to an llvm::InsertValueInst + otherwise. 
*/ + llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, + const char *name = NULL); + + llvm::PHINode *PhiNode(const llvm::Type *type, int count, const char *name = NULL); + llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0, + llvm::Value *val1, const char *name = NULL); + + llvm::Instruction *CallInst(llvm::Function *func, + const std::vector &args, + const char *name = NULL); + /** This is a convenience method that issues a call instruction to a + function that takes just a single argument. */ + llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg, + const char *name = NULL); + + /** This is a convenience method that issues a call instruction to a + function that takes two arguments. */ + llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg0, + llvm::Value *arg1, const char *name = NULL); + + /** Launch an asynchronous task to run the given function, passing it + he given argument values. */ + llvm::Instruction *LaunchInst(llvm::Function *callee, + std::vector &argVals); + + llvm::Instruction *ReturnInst(); + /** @} */ + +private: + /** The basic block into which we add any alloca instructions that need + to go at the very start of the function. */ + llvm::BasicBlock *allocaBlock; + + /** The current basic block into which we're emitting new + instructions */ + llvm::BasicBlock *bblock; + + /** Pointer to stack-allocated memory that stores the current value of + the program mask. */ + llvm::Value *maskPtr; + + /** Current source file position; if debugging information is being + generated, this position is used to set file/line information for + instructions. */ + SourcePos currentPos; + + /** Source file position where the function definition started. Used + for error messages and debugging symbols. */ + SourcePos funcStartPos; + + /** Type of result that the current function returns. */ + const Type *returnType; + + /** Value of the program mask when the function starts execution. 
 */ + llvm::Value *entryMask; + + /** If currently in a loop body, the value of the mask at the start of + the loop. */ + llvm::Value *loopMask; + + /** If currently in a loop body, this is a pointer to memory to store a + mask value that represents which of the lanes have executed a + 'break' statement. If we're not in a loop body, this should be + NULL. */ + llvm::Value *breakLanesPtr; + + /** Similar to breakLanesPtr, if we're inside a loop, this is a pointer + to memory to record which of the program instances have executed a + 'continue' statement. */ + llvm::Value *continueLanesPtr; + + /** If we're inside a loop, this gives the basic block immediately + after the current loop, which we will jump to if all of the lanes + have executed a break statement or are otherwise done with the + loop. */ + llvm::BasicBlock *breakTarget; + + /** If we're inside a loop, this gives the block to jump to if all of + the running lanes have executed a 'continue' statement. */ + llvm::BasicBlock *continueTarget; + + /** A pointer to memory that records which of the program instances + have executed a 'return' statement (and are thus really truly done + running any more instructions in this function). */ + llvm::Value *returnedLanesPtr; + + /** A pointer to memory to store the return value for the function. + Since different program instances may execute 'return' statements + at different times, we need to accumulate the return values as they + come in until we return for real. */ + llvm::Value *returnValuePtr; + + /** The CFInfo structure records information about a nesting level of + control flow. This vector lets us see what control flow is going + around outside the current position in the function being + emitted. */ + std::vector controlFlowInfo; + + /** DIFile object corresponding to the source file where the current + function was defined (used for debugging info). */ + llvm::DIFile diFile; + + /** DISubprogram corresponding to this function (used for debugging + info). 
*/ + llvm::DISubprogram diFunction; + + /** These correspond to the current set of nested scopes in the + function. */ + std::vector debugScopes; + + /** True if a 'launch' statement has been encountered in the function. */ + bool launchedTasks; + + llvm::Value *pointerVectorToVoidPointers(llvm::Value *value); + static void addGSMetadata(llvm::Instruction *inst, SourcePos pos); + bool ifsInLoopAllUniform() const; + void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target); + llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr); + + void restoreMaskGivenReturns(llvm::Value *oldMask); + + void scatter(llvm::Value *rvalue, llvm::Value *lvalue, + llvm::Value *maskPtr, const Type *rvalueType); + llvm::Value *gather(llvm::Value *lvalue, const Type *type, + const char *name); + void maskedStore(llvm::Value *rvalue, llvm::Value *lvalue, + const Type *rvalueType, llvm::Value *maskPtr); +}; + +#endif // ISPC_CTX_H diff --git a/decl.cpp b/decl.cpp new file mode 100644 index 00000000..849347f4 --- /dev/null +++ b/decl.cpp @@ -0,0 +1,348 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file decl.cpp + @brief Implementations of classes related to turning declarations into + symbols and types. +*/ + +#include "decl.h" +#include "util.h" +#include "sym.h" +#include "type.h" +#include "expr.h" +#include + +/////////////////////////////////////////////////////////////////////////// +// DeclSpecs + +DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) { + baseType = t; + storageClass = sc; + typeQualifier = tq; + soaWidth = 0; + vectorSize = 0; +} + + +void +DeclSpecs::Print() const { + if (storageClass == SC_EXTERN) printf("extern "); + if (storageClass == SC_EXTERN_C) printf("extern \"C\" "); + if (storageClass == SC_EXPORT) printf("export "); + if (storageClass == SC_STATIC) printf("static "); + if (storageClass == SC_TYPEDEF) printf("typedef "); + + if (soaWidth > 0) printf("soa<%d> ", soaWidth); + + if (typeQualifier & TYPEQUAL_INLINE) printf("inline "); + if (typeQualifier & TYPEQUAL_CONST) printf("const "); + if (typeQualifier & TYPEQUAL_UNIFORM) printf("uniform "); + if (typeQualifier & TYPEQUAL_VARYING) printf("varying "); + if (typeQualifier & TYPEQUAL_TASK) printf("task "); + if (typeQualifier & TYPEQUAL_REFERENCE) printf("reference "); + if (typeQualifier 
& TYPEQUAL_UNSIGNED) printf("unsigned "); + + printf("%s", baseType->GetString().c_str()); + + if (vectorSize > 0) printf("<%d>", vectorSize); +} + + +/////////////////////////////////////////////////////////////////////////// +// Declarator + +Declarator::Declarator(Symbol *s, SourcePos p) + : pos(p) { + sym = s; + functionArgs = NULL; + isFunction = false; + initExpr = NULL; +} + + +void +Declarator::AddArrayDimension(int size) { + assert(size > 0 || size == -1); // -1 -> unsized + arraySize.push_back(size); +} + + +void +Declarator::InitFromDeclSpecs(DeclSpecs *ds) { + sym->type = GetType(ds); + + if (ds->storageClass == SC_STATIC) + sym->isStatic = true; +} + + +void +Declarator::Print() const { + printf("%s", sym->name.c_str()); + if (initExpr != NULL) { + printf(" = ("); + initExpr->Print(); + printf(")"); + } + pos.Print(); +} + + +static const Type * +lGetType(const Declarator *decl, DeclSpecs *ds, + std::vector::const_iterator arrayIter) { + if (arrayIter == decl->arraySize.end()) { + // If we don't have an array (or have processed all of the array + // dimensions in previous recursive calls), we can go ahead and + // figure out the final non-array type we have here. + const Type *type = ds->baseType; + if (type == NULL) { + Error(decl->pos, "Type not provided in variable declaration for variable \"%s\".", + decl->sym->name.c_str()); + return NULL; + } + + // Account for 'unsigned' and 'const' qualifiers in the type + if ((ds->typeQualifier & TYPEQUAL_UNSIGNED) != 0) { + const Type *unsignedType = type->GetAsUnsignedType(); + if (unsignedType != NULL) + type = unsignedType; + else + Error(decl->pos, "\"unsigned\" qualifier is illegal with \"%s\" type.", + type->GetString().c_str()); + } + if ((ds->typeQualifier & TYPEQUAL_CONST) != 0) + type = type->GetAsConstType(); + + if (ds->vectorSize > 0) { + const AtomicType *atomicType = dynamic_cast(type); + if (atomicType == NULL) { + Error(decl->pos, "Only atomic types (int, float, ...) 
are legal for vector " + "types."); + return NULL; + } + type = new VectorType(atomicType, ds->vectorSize); + } + + // if uniform/varying is specified explicitly, then go with that + if ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0) + return type->GetAsUniformType(); + else if ((ds->typeQualifier & TYPEQUAL_VARYING) != 0) + return type->GetAsVaryingType(); + else { + // otherwise, structs are uniform by default and everything + // else is varying by default + if (dynamic_cast(type) != NULL) + return type->GetAsUniformType(); + else + return type->GetAsVaryingType(); + } + } + else { + // Peel off one dimension of the array + int arraySize = *arrayIter; + ++arrayIter; + + // Get the type, not including the arraySize dimension peeled off + // above. + const Type *childType = lGetType(decl, ds, arrayIter); + + int soaWidth = ds->soaWidth; + if (soaWidth == 0) + // If there's no "soa" stuff going on, just return a regular + // array with the appropriate size + return new ArrayType(childType, arraySize == -1 ? 0 : arraySize); + else { + // Make sure we actually have an array of structs .. + const StructType *childStructType = + dynamic_cast(childType); + if (childStructType == NULL) { + Error(decl->pos, "Illegal to provide soa<%d> qualifier with non-struct " + "type \"%s\".", soaWidth, childType->GetString().c_str()); + return new ArrayType(childType, arraySize == -1 ? 0 : arraySize); + } + else if ((soaWidth & (soaWidth - 1)) != 0) { + Error(decl->pos, "soa<%d> width illegal. Value must be power of two.", + soaWidth); + return NULL; + } + else if (arraySize != -1 && (arraySize % soaWidth) != 0) { + Error(decl->pos, "soa<%d> width must evenly divide array size %d.", + soaWidth, arraySize); + return NULL; + } + return new SOAArrayType(childStructType, arraySize == -1 ? 
0 : arraySize, + soaWidth); + } + } +} + + +const Type * +Declarator::GetType(DeclSpecs *ds) const { + bool hasUniformQual = ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0); + bool hasVaryingQual = ((ds->typeQualifier & TYPEQUAL_VARYING) != 0); + bool isTask = ((ds->typeQualifier & TYPEQUAL_TASK) != 0); + bool isReference = ((ds->typeQualifier & TYPEQUAL_REFERENCE) != 0); + + if (hasUniformQual && hasVaryingQual) { + Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers."); + return NULL; + } + + if (isFunction) { + std::vector args; + std::vector argNames; + if (functionArgs) { + // Loop over the function arguments and get names and types for + // each one in the args and argNames arrays + for (unsigned int i = 0; i < functionArgs->size(); ++i) { + Declaration *d = (*functionArgs)[i]; + Symbol *sym; + if (d->declarators.size() == 0) { + // function declaration like foo(float), w/o a name for + // the parameter + char buf[32]; + sprintf(buf, "__anon_parameter_%d", i); + sym = new Symbol(buf, pos); + Declarator *declarator = new Declarator(sym, sym->pos); + sym->type = declarator->GetType(ds); + d->declarators.push_back(declarator); + } + else { + assert(d->declarators.size() == 1); + sym = d->declarators[0]->sym; + } + + // Arrays are passed by reference, so convert array + // parameters to be references here. + if (dynamic_cast(sym->type) != NULL) + sym->type = new ReferenceType(sym->type, sym->type->IsConstType()); + + args.push_back(sym->type); + argNames.push_back(sym->name); + } + } + + if (ds->baseType == NULL) { + Warning(pos, "No return type provided in declaration of function \"%s\". 
" + "Treating as \"void\".", sym->name.c_str()); + ds->baseType = AtomicType::Void; + } + + if (isReference) { + Error(pos, "Function return types can't be reference types."); + return NULL; + } + + const Type *returnType = lGetType(this, ds, arraySize.begin()); + if (returnType == NULL) + return NULL; + + bool isExported = (ds->storageClass == SC_EXPORT); + bool isExternC = (ds->storageClass == SC_EXTERN_C); + return new FunctionType(returnType, args, pos, &argNames, isTask, + isExported, isExternC); + } + else { + if (isTask) + Error(pos, "\"task\" qualifier illegal in variable declaration \"%s\".", + sym->name.c_str()); + + const Type *type = lGetType(this, ds, arraySize.begin()); + + if (type != NULL && isReference) { + bool hasConstQual = ((ds->typeQualifier & TYPEQUAL_CONST) != 0); + type = new ReferenceType(type, hasConstQual); + } + + return type; + } +} + +/////////////////////////////////////////////////////////////////////////// +// Declaration + +void +Declaration::AddSymbols(SymbolTable *st) const { + assert(declSpecs->storageClass != SC_TYPEDEF); + + for (unsigned int i = 0; i < declarators.size(); ++i) + if (declarators[i]) + st->AddVariable(declarators[i]->sym); +} + + +void +Declaration::Print() const { + printf("Declaration: specs ["); + declSpecs->Print(); + printf("], declarators ["); + for (unsigned int i = 0 ; i < declarators.size(); ++i) { + declarators[i]->Print(); + printf("%s", (i == declarators.size() - 1) ? 
"]" : ", "); + } +} + +/////////////////////////////////////////////////////////////////////////// + +void +GetStructTypesAndNames(const std::vector &sd, + std::vector *elementTypes, + std::vector *elementNames) { + for (unsigned int i = 0; i < sd.size(); ++i) { + const Type *type = sd[i]->type; + // FIXME: making this fake little DeclSpecs here is really + // disgusting + DeclSpecs ds(type); + if (type->IsUniformType()) + ds.typeQualifier |= TYPEQUAL_UNIFORM; + else + ds.typeQualifier |= TYPEQUAL_VARYING; + + for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) { + Declarator *d = (*sd[i]->declarators)[j]; + d->InitFromDeclSpecs(&ds); + + // if it's an unsized array, make it a reference to an unsized + // array, so the caller can pass a pointer... + const ArrayType *at = dynamic_cast(d->sym->type); + if (at && at->GetElementCount() == 0) + d->sym->type = new ReferenceType(d->sym->type, type->IsConstType()); + + elementTypes->push_back(d->sym->type); + elementNames->push_back(d->sym->name); + } + } +} diff --git a/decl.h b/decl.h new file mode 100644 index 00000000..84f6147e --- /dev/null +++ b/decl.h @@ -0,0 +1,203 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
 + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file decl.h + @brief Declarations related to type declarations; the parser basically + creates instances of these classes, which are then turned into actual + Types. + + Three classes work together to represent declarations. As an example, + consider a declaration like: + + static uniform int foo, bar[10]; + + An instance of the Declaration class represents this entire declaration + of two variables, 'foo' and 'bar'. It holds a single instance of the + DeclSpecs class, which represents the common specifiers for all of the + variables--here, that the declaration has the 'static' and 'uniform' + qualifiers, and that its basic type is 'int'. Then for each variable + declaration, the Declaration class holds an instance of a Declarator, + which in turn records the per-variable information like the symbol + name, array size (if any), initializer expression, etc. +*/ + +#ifndef ISPC_DECL_H +#define ISPC_DECL_H + +#include "ispc.h" + +enum StorageClass { + SC_NONE, + SC_EXTERN, + SC_EXPORT, + SC_STATIC, + SC_TYPEDEF, + SC_EXTERN_C +}; + + +/* Multiple qualifiers can be provided with types in declarations; + therefore, they are set up so that they can be ANDed together into an + int. 
*/ +#define TYPEQUAL_NONE 0 +#define TYPEQUAL_CONST (1<<0) +#define TYPEQUAL_UNIFORM (1<<1) +#define TYPEQUAL_VARYING (1<<2) +#define TYPEQUAL_TASK (1<<3) +#define TYPEQUAL_REFERENCE (1<<4) +#define TYPEQUAL_UNSIGNED (1<<5) +#define TYPEQUAL_INLINE (1<<6) + +/** @brief Representation of the declaration specifiers in a declaration. + + In other words, this represents all of the stuff that applies to all of + the (possibly multiple) variables in a declaration. + */ +class DeclSpecs { +public: + DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE, int tq = TYPEQUAL_NONE); + + void Print() const; + + StorageClass storageClass; + + /** Zero or more of the TYPEQUAL_* values, ANDed together. */ + int typeQualifier; + + /** The basic type provided in the declaration; this should be an + AtomicType, a StructType, or a VectorType; other types (like + ArrayTypes) will end up being created if a particular declaration + has an array size, etc. + */ + const Type *baseType; + + /** If this is a declaration with a vector type, this gives the vector + width. For non-vector types, this is zero. + */ + int vectorSize; + + /** If this is a declaration with an "soa" qualifier, this gives the + SOA width specified. Otherwise this is zero. + */ + int soaWidth; +}; + + +/** @brief Representation of the declaration of a single variable. + + In conjunction with an instance of the DeclSpecs, this gives us + everything we need for a full variable declaration. + */ +class Declarator { +public: + Declarator(Symbol *s, SourcePos p); + + /** As the parser peels off array dimension declarations after the + symbol name, it calls this method to provide them to the + Declarator. + */ + void AddArrayDimension(int size); + + /** Once a DeclSpecs instance is available, this method completes the + initialization of the Symbol, setting its Type accordingly. 
+ */ + void InitFromDeclSpecs(DeclSpecs *ds); + + /** Get the actual type of the combination of Declarator and the given + DeclSpecs */ + const Type *GetType(DeclSpecs *ds) const; + + void Print() const; + + const SourcePos pos; + Symbol *sym; + /** If this declarator includes an array specification, the sizes of + the array dimensions are represented here. + */ + std::vector arraySize; + /** Initialization expression for the variable. May be NULL. */ + Expr *initExpr; + bool isFunction; + std::vector *functionArgs; +}; + + +/** @brief Representation of a full declaration of one or more variables, + including the shared DeclSpecs as well as the per-variable Declarators. + */ +class Declaration { +public: + Declaration(DeclSpecs *ds, std::vector *dlist = NULL) { + declSpecs = ds; + if (dlist != NULL) + declarators = *dlist; + for (unsigned int i = 0; i < declarators.size(); ++i) + if (declarators[i] != NULL) + declarators[i]->InitFromDeclSpecs(declSpecs); + } + Declaration(DeclSpecs *ds, Declarator *d) { + declSpecs = ds; + if (d) { + d->InitFromDeclSpecs(ds); + declarators.push_back(d); + } + } + + /** Adds the symbols for the variables in the declaration to the symbol + table. */ + void AddSymbols(SymbolTable *st) const; + void Print() const; + + DeclSpecs *declSpecs; + std::vector declarators; +}; + + +/** The parser creates instances of StructDeclaration for the members of + structs as it's parsing their declarations. */ +struct StructDeclaration { + StructDeclaration(const Type *t, std::vector *d) + : type(t), declarators(d) { } + + const Type *type; + std::vector *declarators; +}; + + +/** Given a set of StructDeclaration instances, this returns the types of + the elements of the corresponding struct and their names. 
*/ +extern void GetStructTypesAndNames(const std::vector &sd, + std::vector *elementTypes, + std::vector *elementNames); + +#endif // ISPC_DECL_H diff --git a/docs/build.sh b/docs/build.sh new file mode 100755 index 00000000..6de1e93d --- /dev/null +++ b/docs/build.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +rst2html ispc.txt > ispc.html + +#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex +#pdflatex ispc.tex +#/bin/rm -f ispc.aux ispc.log ispc.out ispc.tex diff --git a/docs/ispc.txt b/docs/ispc.txt new file mode 100644 index 00000000..76f13595 --- /dev/null +++ b/docs/ispc.txt @@ -0,0 +1,2640 @@ +========================================= +Intel® SPMD Program Compiler User's Guide +========================================= + +``ispc`` is a compiler for writing SPMD (single program multiple data) +programs to run on the CPU. The SPMD programming approach is widely known +to graphics and GPGPU programmers; it is used for GPU shaders and CUDA\* and +OpenCL\* kernels, for example. The main idea behind SPMD is that one writes +programs as if they were operating on a single data element (a pixel for a +pixel shader, for example), but then the underlying hardware and runtime +system executes multiple invocations of the program in parallel with +different inputs (the values for different pixels, for example). + +The main goals behind ``ispc`` are to: + +* Build a small C-like language that can deliver good performance to + performance-oriented programmers who want to run SPMD programs on + CPUs. +* Provide a thin abstraction layer between the programmer and the + hardware--in particular, to follow the lesson from C for serial programs + of having an execution and data model where the programmer can cleanly + reason about the mapping of their source program to compiled assembly + language and the underlying hardware. 
+* Harness the computational power of the Single Program, Multiple Data (SIMD) vector + units without the extremely low-programmer-productivity activity of directly + writing intrinsics. +* Explore opportunities from close-coupling between C/C++ application code + and SPMD ``ispc`` code running on the same processor--lightweight function + calls between the two languages, sharing data directly via pointers without + copying or reformatting, etc. + +``ispc`` has already successfully delivered significant speedups for a +number of non-trivial workloads that aren't handled well by other +compilation approaches (e.g. loop auto-vectorization.) + +Contents: + +* `Recent Changes to ISPC`_ + +* `Getting Started with ISPC`_ + + + `Installing ISPC`_ + + `Compiling and Running a Simple ISPC Program`_ + +* `Using The ISPC Compiler`_ + + + `Command-line Options`_ + +* `The ISPC Language`_ + + + `Lexical Structure`_ + + `Basic Types and Type Qualifiers`_ + + `Short Vector Types`_ + + `Struct and Array Types`_ + + `Declarations and Initializers`_ + + `Function Declarations`_ + + `Expressions`_ + + `Control Flow`_ + + `Functions`_ + + `C Constructs not in ISPC`_ + +* `Parallel Execution Model in ISPC`_ + + + `The SPMD-on-SIMD Execution Model`_ + + `Uniform and Varying Qualifiers`_ + + `Mapping Data to Program Instances`_ + + `"Coherent" Control Flow Statements`_ + + `Program Instance Convergence`_ + + `Data Races`_ + + `Uniform Variables and Varying Control Flow`_ + + `Task Parallelism in ISPC`_ + +* `The ISPC Standard Library`_ + + + `Math Functions`_ + + `Output Functions`_ + + `Cross-Lane Operations`_ + + `Low-Level Bits`_ + +* `Interoperability with the Application`_ + + + `Interoperability Overview`_ + + `Data Layout`_ + + `Data Alignment and Aliasing`_ + +* `Using ISPC Effectively`_ + + + `Restructuring Existing Programs to Use ISPC`_ + + `Understanding How to Interoperate With the Application's Data`_ + + `Communicating Between SPMD Program Instances`_ + + `Gather and 
Scatter`_ + + `Low-level Vector Tricks`_ + + `Debugging`_ + + `The "Fast math" Option`_ + + `"Inline" Aggressively`_ + + `Small Performance Tricks`_ + + `Instrumenting Your ISPC Programs`_ + +* `Disclaimer and Legal Information`_ + +* `Optimization Notice`_ + +Recent Changes to ISPC +====================== + +This section summarizes recent changes and bugfixes. + +* 17 May: Fixed a number of bugs related to error handling in Windows*. In + particular, if you use the ``/E`` command line flag to ``cl.exe`` (rather + than ``/EP``) when using it as a preprocessor, then ``ispc`` will + correctly report the source file position with warnings and errors. + +* 15 May: Improved error messages and warnings in many cases. For example, + the column number is reported along with the line number and + the source line with the error is printed as part of the message. + +* 8 May: ``ispc``'s typechecker has been substantially improved in how it + handles ``const``-qualified types. Some programs that previously + compiled may now fail with errors related to ``const``. For example, + ``ispc`` issues an error message if you try to assign a member of a const + structure. + +* 2 May: "uniform" short-vector types are now stored across the lanes of + the SIMD registers. This enables you to also write classic 'explicit + vector' computation in ``ispc`` as well. This change does change how + these types are laid out in memory; see `Data Layout`_ for more details. + +Getting Started with ISPC +========================= + +Installing ISPC +--------------- + +The `ispc downloads web page`_ has prebuilt executables for Windows\*, +Linux\* and Mac OS\* available for download. Alternatively, you can +download the source code from that page and build it yourself; see the +`ispc wiki`_ for instructions about building ``ispc`` from source. + +.. _ispc downloads web page: downloads.html +.. 
_ispc wiki: http://github.com/ispc/ispc/wiki + +Once you have an executable for your system, copy it into a directory +that's in your ``PATH``. Congratulations--you've now installed ``ispc``. + +Compiling and Running a Simple ISPC Program +------------------------------------------- + +The directory ``examples/simple`` in the ``ispc`` distribution includes a +simple example of how to use ``ispc`` with a short C++ program. See the +file ``simple.ispc`` in that directory (also reproduced here.) + +:: + + export void simple(uniform float vin[], uniform float vout[], + uniform int count) { + for (uniform int i = 0; i < count; i += programCount) { + int index = i + programIndex; + float v = vin[index]; + if (v < 3.) + v = v * v; + else + v = sqrt(v); + vout[index] = v; + } + } + +This program loops over an array of values in ``vin`` and computes an +output value for each one. For each value in ``vin``, if its value is less +than three, the output is the value squared, otherwise it's the square root +of the value. + +The first thing to notice in this program is the presence of the ``export`` +keyword in the function definition; this indicates that the function should +be made available to be called from application code. The ``uniform`` +qualifiers on the parameters to ``simple`` as well as for the variable +``i`` indicate that the corresponding variables are non-vector +quantities--they are discussed in detail in the `Uniform and Varying +Qualifiers`_ section. + +Each iteration of the for loop works on a number of input values in +parallel. The built-in ``programCount`` variable indicates how many +program instances are running in parallel; it is equal to the SIMD width of +the machine. (For example, the value is four on Intel® SSE, eight on +Intel® AVX, etc.) Thus, we can see that each execution of the loop will +work on that many output values in parallel. 
There is an implicit +assumption that ``programCount`` divides the ``count`` parameter without +remainder; the more general case can be handled with a small amount of +additional code. + +To load the ``programCount``-worth of values, the program computes an index +using the sum of ``i``, which gives the first value to work on in this +iteration, and ``programIndex``, which gives a unique integer identifier +for each running program instance, counting from zero. Thus, the load from +``vin`` loads the values at offset ``i+0``, ``i+1``, ``i+2``, ..., from the +``vin`` array into the vector variable ``v``. This general idiom should be +familiar to CUDA\* or OpenCL\* programmers, where thread ids serve a +similar role to ``programIndex`` in ``ispc``. See the section `Mapping +Data to Program Instances`_ for more detail. + +The program can then proceed, doing computation and control flow based on +the values loaded. The result from the running program instances is +written to the ``vout`` array before the next loop iteration runs. + +For a simple program like this one, the performance difference versus a +regular scalar C/C++ implementation is minimal. For more +complex programs that do more substantial amounts of computation, doing +that computation in parallel across the machine's SIMD lanes can have a +substantial performance benefit. + +On Linux\* and Mac OS\*, the makefile in that directory compiles this program. +For Windows\*, open the ``examples/examples.sln`` file in Microsoft Visual +C++ 2010\* to build this (and the other) examples. In either case, +build it now! We'll walk through the details of the compilation steps in +the following section, `Using The ISPC Compiler`_. In addition to +compiling the ``ispc`` program, in this case the ``ispc`` compiler also +generates a small header file, ``simple.h``. This header file includes the +declaration for the C-callable function that the above ``ispc`` program is +compiled to. 
The relevant parts of this file are: + +:: + + #ifdef __cplusplus + extern "C" { + #endif // __cplusplus + extern void simple(float vin[], float vout[], int32_t count); + #ifdef __cplusplus + } + #endif // __cplusplus + +It's not mandatory to ``#include`` the generated header file in your C/C++ +code (you can alternatively use a manually-written ``extern`` declaration +of the ``ispc`` functions you use), but it's a helpful check to ensure that +the function signatures are as expected on both sides. + +Here is the main program, ``simple.cpp``, which calls the ``ispc`` function +above. + +:: + + #include + #include "simple.h" + + int main() { + float vin[16], vout[16]; + for (int i = 0; i < 16; ++i) + vin[i] = i; + + simple(vin, vout, 16); + + for (int i = 0; i < 16; ++i) + printf("%d: simple(%f) = %f\n", i, vin[i], vout[i]); + } + +Note that the call to the ``ispc`` function in the middle of ``main()`` is +a regular function call. (And it has the same overhead as a C/C++ function +call, for that matter.) + +When the executable ``simple`` runs, it generates the expected output: + +:: + + 0: simple(0.000000) = 0.000000 + 1: simple(1.000000) = 1.000000 + 2: simple(2.000000) = 4.000000 + 3: simple(3.000000) = 1.732051 + ... + +There is also a small example of using ``ispc`` to compute the Mandelbrot +set; see the `Mandelbrot set example`_ page on the ``ispc`` website for a +walkthrough of it. + +.. _Mandelbrot set example: http://ispc.github.com/example.html + +Using The ISPC Compiler +======================= + +To go from a ``ispc`` source file to an object file that can be linked +with application code, enter the following command + +:: + + ispc foo.ispc -o foo.o + +On Linux\* and Mac OS\*, ``ispc`` automatically runs the C preprocessor on +your input program; under Windows\*, this must be done manually. 
With +Microsoft Visual C++ 2010\*, the following custom build step for +``ispc`` source files takes care of this job: + +:: + + cl /E /TP %(Filename).ispc | ispc - -o %(Filename).obj -h %(Filename).h + +The ``cl`` call runs the C preprocessor on the ``ispc`` file; the result is +piped to ``ispc`` to generate an object file and a header. As an example, +see the file ``simple.vcxproj`` in the ``examples/simple`` directory of the +``ispc`` distribution. + +Command-line Options +-------------------- + +The ``ispc`` executable can be run with ``--help`` to print a list of +accepted command-line arguments. By default, the compiler compiles the +provided program (and issues warnings and errors), but doesn't +generate any output. + +If the ``-o`` flag is given, it will generate an output file (a native +object file by default). To generate a text assembly file, pass +``--emit-asm``: + +:: + + ispc foo.ispc -o foo.s --emit-asm + +To generate LLVM bitcode, use the ``--emit-llvm`` flag. + +By default, an optimized x86-64 object file tuned for Intel® Core +CPUs is built. You can use the ``--arch`` command line flag to +specify a 32-bit x86 target: + +:: + + ispc foo.ispc -o foo.obj --arch=x86 + +Optimizations can be turned off with ``-O0``: + +:: + + ispc foo.ispc -o foo.obj -O0 + +On Mac\* and Linux\*, there is early support for generating debugging +symbols; this is enabled with the ``-g`` command-line flag. + +The ``-h`` flag can also be used to direct ``ispc`` to generate a C/C++ +header file that includes C/C++ declarations of the C-callable ``ispc`` +functions and the types passed to it. + +On Linux\* and Mac OS\*, ``-D`` can be used to specify definitions to be +passed along to the C pre-processor, which runs over the program input +before it's compiled. On Windows®, pre-processor definitions should be +provided to the ``cl`` call. + +By default, the compiler generates x86-64 Intel® SSE4 code. 
To generate +32-bit code, you can use the ``--arch=x86`` command-line flag. To +select Intel® SSE2, use ``--target=sse2``. + +``ispc`` supports an alternative method for generating Intel® SSE4 code, +where the program is "doubled up" and eight instances of it run in +parallel, rather than just four. For workloads that don't require large +numbers of registers, this method can lead to significantly more efficient +execution thanks to greater instruction level parallelism. This option is +selected with ``--target=sse4x2``. + +The compiler issues a number of performance warnings for code constructs +that compile to relatively inefficient code. These warnings can be +silenced with the ``--wno-perf`` flag (or by using ``--woff``, which turns +off all warnings.) + + +The ISPC Language +================= + +``ispc``'s syntax is based on C and is designed to be as similar to C +as much as possible. Between syntactic differences and the fundamentally +parallel execution model (versus C's serial model), C code is not directly +portable to ``ispc``, although starting with working C code and porting it +to ``ispc`` can be an efficient way to write ``ispc`` programs. + +Lexical Structure +----------------- + +Tokens in ``ispc`` are delimited by white-space and comments. The +white-space characters are the usual set of spaces, tabs, and carriage +returns/line feeds. Comments can be delineated with ``//``, which starts a +comment that continues to the end of the line, or the start of a comment +can be delineated with ``/*`` and the end with ``*/``. Like in C/C++, +comments can't be nested. + +Identifiers in ``ispc`` are sequences of characters that start with an +underscore or an upper-case or lower-case letter, and then followed by +zero or more letters, numbers, or underscores. + +Integer numeric constants can be specified in base 10 or in hexadecimal. +Base 10 constants are given by a sequence of one or more digits from 0 to +9. 
Hexadecimal constants are denoted by a leading ``0x`` and then one or +more digits from 0-9, a-f, or A-F. + +Floating-point constants can be specified in one of three ways. First, +they may be a sequence of zero or more digits from 0 to 9, followed by a +period, followed by zero or more digits from 0 to 9. (There must be at +least one digit before or after the period). + +The second option is scientific notation, where a base value is specified +as the first form of a floating-point constant but is then followed by an +"e" or "E", then a plus sign or a minus sign, and then an exponent. + +Finally, floating-point constants may be specified as hexadecimal +constants; this form can ensure a perfectly bit-accurate representation of +a particular floating-point number. These are specified with a "0x" +prefix, followed by a zero or a one, a period, and then the remainder of +the mantissa in hexadecimal form, with digits from 0-9, a-f, or A-F. The +start of the exponent is denoted by a "p", which is then followed by an +optional plus or minus sign and then digits from 0 to 9. For example: + +:: + + float two = 0x1p+1; // 2.0 + float pi = 0x1.921fb54442d18p+1; // 3.1415926535... + float neg = -0x1.ffep+11; // -4095. + +Floating-point constants can optionally have a "f" or "F" suffix (``ispc`` +currently treats all floating-point constants as having 32-bit precision, +making this suffix unnecessary.) + +String constants in ``ispc`` are denoted by an opening double quote ``"`` +followed by any character other than a newline, up to a closing double +quote. Within the string, a number of special escape sequences can be used +to specify special characters. These sequences all start with an initial +``\`` and are listed below: + +.. 
list-table:: Escape sequences in strings + + * - ``\\`` + - backslash: ``\`` + * - ``\"`` + - double quotation mark: ``"`` + * - ``\'`` + - single quotation mark: ``'`` + * - ``\a`` + - bell (alert) + * - ``\b`` + - backspace character + * - ``\f`` + - formfeed character + * - ``\n`` + - newline + * - ``\r`` + - carriage return + * - ``\t`` + - horizontal tab + * - ``\v`` + - vertical tab + * - ``\`` followed by one or more digits from 0-7 + - ASCII character in octal notation + * - ``\x``, followed by one or more digits from 0-9, a-f, A-F + - ASCII character in hexadecimal notation + +``ispc`` doesn't support a string data type; string constants can be passed +as the first argument to the ``print()`` statement, however. ``ispc`` also +doesn't support character constants. + +The following identifiers are reserved as language keywords: ``bool``, +``break``, ``case``, ``cbreak``, ``ccontinue``, ``cdo``, ``cfor``, +``char``, ``cif``, ``cwhile``, ``const``, ``continue``, ``creturn``, +``default``, ``do``, ``double``, ``else``, ``enum``, ``export``, +``extern``, ``false``, ``float``, ``for``, ``goto``, ``if``, ``inline``, ``int``, +``int32``, ``int64``, ``launch``, ``print``, ``reference``, ``return``, +``signed``, ``sizeof``, ``soa``, ``static``, ``struct``, ``switch``, +``sync``, ``task``, ``true``, ``typedef``, ``uniform``, ``union``, +``unsigned``, ``varying``, ``void``, ``volatile``, ``while``. + +``ispc`` defines the following operators and punctuation: + +.. 
list-table:: Operators + + * - Symbols + - Use + * - ``=`` + - Assignment + * - ``+``, ``-``, \*, ``/``, ``%`` + - Arithmetic operators + * - ``&``, ``|``, ``^``, ``!``, ``~``, ``&&``, ``||``, ``<<``, ``>>`` + - Logical and bitwise operators + * - ``++``, ``--`` + - Pre/post increment/decrement + * - ``<``, ``<=``, ``>``, ``>=``, ``==``, ``!=`` + - Relational operators + * - ``*=``, ``/=``, ``+=``, ``-=``, ``<<=``, ``>>=``, ``&=``, ``|=`` + - Compound assignment operators + * - ``?``, ``:`` + - Selection operators + * - ``;`` + - Statement separator + * - ``,`` + - Expression separator + * - ``.`` + - Member access + +A number of tokens are used for grouping in ``ispc``: + +.. list-table:: Grouping Tokens + + * - ``(``, ``)`` + - Parenthesization of expressions, function calls, delimiting specifiers + for control flow constructs. + * - ``[``, ``]`` + - Array and short-vector indexing + * - ``{``, ``}`` + - Compound statements + + +Basic Types and Type Qualifiers +------------------------------- + +``ispc`` is a statically-typed language. It supports a variety of basic +types. + +* ``void``: "empty" type representing no value. +* ``bool``: boolean value; may be assigned ``true``, ``false``, or the + value of a boolean expression. +* ``int``: 32-bit signed integer; may also be specified as ``int32``. +* ``unsigned int``: 32-bit unsigned integer; may also be specified as + ``unsigned int32``. +* ``float``: 32-bit floating point value +* ``int64``: 64-bit signed integer. +* ``unsigned int64``: 64-bit unsigned integer. +* ``double``: 64-bit double-precision floating point value. + +Implicit type conversion between values of different types is done +automatically by the ``ispc`` compiler. Thus, a value of ``float`` type +can be assigned to a variable of ``int`` type directly. 
In binary +arithmetic expressions with mixed types, types are promoted to the "more +general" of the two types, with the following precedence: + +:: + + double > uint64 > int64 > float > uint32 > int32 > bool + +In other words, adding an ``int64`` to a ``double`` causes the ``int64`` to +be converted to a ``double``, the addition to be performed, and a +``double`` value to be returned. If a different conversion behavior is +desired, then explicit type-casts can be used, where the destination type +is provided in parenthesis around the expression: + +:: + + double foo = 1. / 3.; + int bar = (float)bar + (float)bar; // 32-bit float addition + +Note: if a ``bool`` is converted to an integer numeric type (``int``, +``int64``, etc.), then the conversion is done with sign extension, not zero +extension. Thus, the resulting value has all bits set if the ``bool`` is +``true``; for example, ``0xffffffff`` for ``int32``. This differs from C +and C++, where a ``true`` bool is converted to the integer value one. + +Variables can be declared with the ``const`` qualifier, which prohibits +their modification. + +:: + + const float PI = 3.1415926535; + +As in C, the ``extern`` qualifier can be used to declare a function or +global variable defined in another source file, and the ``static`` +qualifier can be used to define a variable or function that is only visible +in the current scope. The values of ``static`` variables declared in +functions are preserved across function calls. + +The ``typedef`` keyword can be used to name types: + +:: + + typedef Float3 float[3]; + +``typedef`` doesn't create a new type: it just provides an alternative name +for an existing type. Thus, in the above example, it is legal to pass a +value with ``float[3]`` type to a function that has been declared to take a +``Float3`` parameter. 
+ +``ispc`` provides a ``reference`` qualifier that can be used for passing +values to functions by reference so that functions can return multiple +results or modify existing variables. + +:: + + void increment(reference float f) { + ++f; + } + +``ispc`` doesn't currently support pointer types. + + +Short Vector Types +------------------ + +``ispc`` supports a parameterized type to define short vectors. These +short vectors can only be used with basic types like ``float`` and ``int``; +they can't be applied to arrays or structures. Note: ``ispc`` does *not* +use these short vectors to facilitate program vectorization; they are +purely a syntactic convenience. Using them or writing the corresponding +code without them shouldn't lead to any noticeable performance differences +between the two approaches. + +Syntax similar to C++ templates is used to declare these types: + +:: + + float<3> foo; // vector of three floats + double<6> bar; + +The length of these vectors can be arbitrarily long, though the expected +usage model is relatively short vectors. + +You can use ``typedef`` to create types that don't carry around +the brackets around the vector length: + +:: + + typedef float<3> float3; + +``ispc`` doesn't support templates in general. In particular, +not only must the vector length be a compile-time constant, but it's +also not possible to write functions that are parameterized by vector +length. + +:: + + uniform int i = foo(); + // ERROR: length must be compile-time constant + float vec; + // ERROR: can't write functions parameterized by vector length + float func(float val); + +Arithmetic on these short vector types works as one would expect; the +operation is applied component-wise to the values in the vector. Here is a +short example: + +:: + + float<3> func(float<3> a, float<3> b) { + a += b; // add individual elements of a and b + a *= 2.; // multiply all elements of a by 2 + bool<3> test = a < b; // component-wise comparison + return test ? 
a : b; // return each minimum component + } + +As shown by the above code, scalar types automatically convert to +corresponding vector types when used in vector expressions. In this +example, the constant ``2.`` above is converted to a three-vector of 2s for +the multiply in the second line of the function implementation. + +Type conversion between other short vector types also works as one would +expect, though the two vector types must have the same length: + +:: + + float<3> foo = ...; + int<3> bar = foo; // ok, cast elements to ints + int<4> bat = foo; // ERROR: different vector lengths + float<4> bing = foo; // ERROR: different vector lengths + +There are two mechanisms to access the individual elements of these short +vector data types. The first is with the array indexing operator: + +:: + + float<4> foo; + for (uniform int i = 0; i < 4; ++i) + foo[i] = i; + +``ispc`` also provides a specialized mechanism for naming and accessing +the first few elements of short vectors based on an overloading of +the structure member access operator. The syntax is similar to that used +in HLSL, for example. + +:: + + float<3> position; + position.x = ...; + position.y = ...; + position.z = ...; + +More specifically, the first element of any short vector type can be +accessed with ``.x`` or ``.r``, the second with ``.y`` or ``.g``, the third +with ``.z`` or ``.b``, and the fourth with ``.w`` or ``.a``. Just like +using the array indexing operator with an index that is greater than the +vector size, accessing an element that is beyond the vector's size is +undefined behavior and may cause your program to crash. + +Note: ``ispc`` doesn't support the "swizzling" operations that languages +like HLSL do. Only a single element of the vector can be accessed at a +time with these member operators. 
+ +:: + + float<3> foo = ...; + float<2> bar = foo.xy; // ERROR + foo.xz = ...; // ERROR + func(foo.xyx); // ERROR + +For convenience, short vectors can be initialized with a list of individual +element values: + +:: + + float x = ..., y = ..., z = ...; + float<3> pos = { x, y, z }; + + +Struct and Array Types +---------------------- + +More complex data structures can be built using ``struct`` and arrays. + +:: + + struct Foo { + float time; + int flags[10]; + }; + +The size of arrays must be a compile-time constant, though functions can be +declared to take "unsized arrays" as parameters so that arrays of any size +may be passed: + +:: + + void foo(float array[], int length); + +As in C++, after a ``struct`` is declared, an instance can be created using +the ``struct``'s name: + +:: + + Foo f; + +Alternatively, ``struct`` can be used before the structure name: + +:: + + struct Foo f; + + +Declarations and Initializers +----------------------------- + +Variables are declared and assigned just as in C: + +:: + + float foo = 0, bar[5]; + float bat = func(foo); + +If a variable is declared without an initializer expression, then its value +is undefined until a value is assigned to it. Reading an undefined +variable may lead to unexpected program behavior. + +Any variable that is declared at file scope (i.e. outside a function) is a +global variable. If a global variable is qualified with the ``static`` +keyword, then its only visible within the compilation unit in which it was +defined. As in C/C++, a variable with a ``static`` qualifier inside a +functions maintains its value across function invocations. + +Like C++, variables don't need to be declared at the start of a basic +block: + +:: + + int foo = ...; + if (foo < 2) { ... } + int bar = ...; + +Variables can also be declared in ``for`` statement initializers: + +:: + + for (int i = 0; ...) 
+ +Arrays can be initialized with either a scalar value or with individual +element values in braces: + +:: + + int foo[10] = x; // all ten elements take the value of x + int bar[2][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } }; + +Structures can also be initialized both with scalar values or with element +values in braces: + +:: + + struct Color { float r, g, b; }; + .... + Color c = 1; // all are one + Color d = { 0.5, .75, 1.0 }; // r = 0.5, ... + + +Function Declarations +--------------------- + +Functions can be declared with a number of qualifiers that affect their +visibility and capabilities. As in C/C++, functions have global visibility +by default. If a function is declared with a ``static`` qualifier, then it +is only visible in the file in which it was declared. + +Any function that can be launched with the ``launch`` construct in ``ispc`` +must have a ``task`` qualifier; see `Task Parallelism in ISPC`_ for more +discussion of launching tasks in ``ispc``. + +Functions that are intended to be called from C/C++ application code must +have the ``export`` qualifier. This causes them to have regular C linkage +and to have their declarations included in header files, if the ``ispc`` +compiler is directed to generated a C/C++ header file for the file it +compiled. + +Finally, any function defined with an ``inline`` qualifier will always be +inlined by ``ispc``; ``inline`` is not a hint, but forces inlining. The +compiler will opportunistically inline short functions depending on their +complexity, but any function that should always be inlined should have the +``inline`` qualifier. + + +Expressions +----------- + +All of the operators from C that you'd expect for writing expressions are +present. Rather than enumerating all of them, here is a short summary of +the range of them available in action. + +:: + + unsigned int i = 0x1234feed; + unsigned int j = (i << 3) ^ ~(i - 3); + i += j / 6; + float f = 1.234e+23; + float g = j * f / (2.f * i); + double h = (g < 2) ? 
f : g/5; + +Structure member access and array indexing also work as in C. + +:: + + struct Foo { float f[5]; int i; }; + Foo foo = { { 1,2,3,4,5 }, 2 }; + return foo.f[4] - foo.i; + + +Control Flow +------------ + +``ispc`` supports most of C's control flow constructs, including ``if``, +``for``, ``while``, ``do``. You can use ``break`` and ``continue`` +statements in ``for``, ``while``, and ``do`` loops. + +There are variants of the ``if``, ``do``, ``while``, ``for``, ``break``, +``continue``, and ``return`` statements (``cif``, ``cdo``, ``cwhile``, +``cfor``, ``cbreak``, ``ccontinue``, and ``creturn``, respectively) that +provide the compiler a hint that the control flow is expected to be +coherent at that particular point, thus allowing the compiler to do +additional optimizations for that case. These are described in the +`"Coherent" Control Flow Statements`_ section. + +``ispc`` does not support ``switch`` statements or ``goto``. + +Functions +--------- + +Like C, functions must be declared before they are called, though a forward +declaration can be used before the actual function definition. Functions +can be overloaded by parameter type. Given multiple definitions of a +function, ``ispc`` uses the following methods to try to find a match. If +a single match of a given type is found, it is used; if multiple matches of +a given type are found, an error is issued. + +* All parameter types match exactly. +* All parameter types match exactly, where any ``reference``-qualified + parameters are considered equivalent to their underlying type. +* Parameters match with only promotions from ``uniform`` to ``varying`` + type. +* Parameters match using standard type conversion (``int`` to ``float``, + ``float`` to ``int``.) + +Also like C, arrays are passed to functions by reference. + + +C Constructs not in ISPC +------------------------- + +The following C features are not available in ``ispc``. 
+
+* ``enum`` s
+* Pointers and function pointers
+* ``char`` and ``short`` types
+* ``switch`` statements
+* bitfield members in structures
+* ``union``
+* ``goto``
+
+
+Parallel Execution Model in ISPC
+================================
+
+Though ``ispc`` has C-based syntax, it is inherently a language for
+parallel computation. Understanding the details of ``ispc``'s parallel
+execution model is critical for writing efficient and correct programs in
+``ispc``.
+
+``ispc`` supports both task parallelism to parallelize across multiple
+cores and SPMD parallelism to parallelize across the SIMD vector lanes on a
+single core. This section focuses on SPMD parallelism. See the section
+`Task Parallelism in ISPC`_ for discussion of task parallelism in ``ispc``.
+
+The SPMD-on-SIMD Execution Model
+--------------------------------
+
+In the SPMD model as implemented in ``ispc``, you write programs that
+compute a set of outputs based on a set of inputs. You must write these
+programs so that it is safe to run multiple instances of them in
+parallel--i.e. given a program and a set of inputs, the programs shouldn't
+have any assumptions about the order in which they will be run over the
+inputs, or about whether one program instance will have completed before
+another runs. [#]_
+
+.. [#] This is essentially the same requirement that languages like CUDA\*
+   and OpenCL\* place on the programmer.
+
+Given this guarantee, the ``ispc`` compiler can safely execute multiple
+program instances in parallel, across the SIMD lanes of a single CPU. In
+many cases, this execution approach can achieve higher overall performance
+than if the program instances had been run serially.
+
+Upon entry to an ``ispc`` function, the execution model switches from
+the application's serial model to SPMD. Conceptually, a number of
+``ispc`` program instances will start running in parallel. This
+parallelism doesn't involve launching hardware threads.
Rather, one +program instance is mapped to each of the SIMD lanes of the CPU's vector +unit (Intel® SSE or Intel® AVX). + +If a ``ispc`` program is written to do a the following computation: + +:: + + float x = ..., y = ...; + return x+y; + +and if the ``ispc`` program is running four-wide on a CPU that supports the +Intel® SSE instructions, then four program instances are running in +parallel, each adding a pair of scalar values. However, these four program +instances store their individual scalar values for ``x`` and ``y`` in the +lanes of an Intel® SSE vector register, so the addition operation for all +four program instances can be done in parallel with a single ``addps`` +instruction. + +Program execution is more complicated in the presence of control flow. The +details are handled by the ``ispc`` compiler, but you may find it helpful +to understand what is going on in order to be a more effective ``ispc`` +programmer. In particular, the mapping of SPMD to SIMD lanes can lead to +reductions in this SIMD efficiency as different program instances want to +perform different computations. For example, consider a simple ``if`` +statement: + +:: + + float x = ..., y = ...; + if (x < y) { + ... + } else { + ... + } + +In general, the test ``x + +Note the ``launch`` keyword and the brackets around the function call. +This code launches 100 tasks, each of which presumably does some +computation keyed off of given the value ``i``. In general, one should +launch many more tasks than there are processors in the system to +ensure good load-balancing, but not so many that the overhead of scheduling +and running tasks dominates the computation. + +Program execution continues asynchronously after task launch; thus, the +function shouldn't access values being generated by the tasks without +synchronization. 
A function uses a ``sync`` statement to wait for all
+launched tasks to finish:
+
+::
+
+    for (uniform int i = 0; i < 100; ++i)
+        launch < func(a, i); >
+    sync;
+    // now safe to use computed values in a[]...
+
+Alternatively, any function that launches tasks has an implicit ``sync``
+before it returns, so that functions that call a function that launches
+tasks don't have to worry about outstanding asynchronous computation.
+
+Inside functions with the ``task`` qualifier, two additional built-in
+variables are provided: ``threadIndex`` and ``threadCount``.
+``threadCount`` gives the total number of hardware threads that have been
+launched by the task system. ``threadIndex`` provides an index between
+zero and ``threadCount-1`` that gives a unique index that corresponds to
+the hardware thread that is executing the current task. The
+``threadIndex`` can be used for accessing data that is private to the
+current thread and thus doesn't require synchronization to access under
+parallel execution.
+
+If you use the task launch feature in ``ispc``, you must provide C/C++
+implementations of two functions and link them into your final executable
+file:
+
+::
+
+    void ISPCLaunch(void *funcptr, void *data);
+    void ISPCSync();
+
+These are called by the task launch code generated by the ``ispc``
+compiler; the first is called to launch a task and the second is called to
+wait for launched tasks to complete. (Factoring them out in this way
+allows ``ispc`` to inter-operate with the application's task system, if
+any, rather than having a separate one of its own.)
To run a particular +task, the task system should cast the function pointer to a ``void (*)(void +*, int, int)`` function pointer and then call it with the provided ``void +*`` data and then an index for the current hardware thread and the total +number of hardware threads the task system has launched--in other words: + +:: + + typedef void (*TaskFuncType)(void *, int, int); + TaskFuncType tft = (TaskFuncType)(funcptr); + tft(data, threadIndex, threadCount); + +A number of sample task system implementations are provided with ``ispc``; +see the files ``tasks_concrt.cpp``, ``tasks_gcd.cpp`` and +``tasks_pthreads.cpp`` in the ``examples/mandelbrot_tasks`` directory of +the ``ispc`` distribution. + + +The ISPC Standard Library +========================= + +``ispc`` has a standard library that is automatically available when +compiling ``ispc`` programs. (To disable the standard library, pass the +``--nostdlib`` command-line flag to the compiler.) + +Math Functions +-------------- + +The math functions in the standard library provide a relatively standard +range of mathematical functionality. + +A number of different implementations of the transcendental math functions +are available; the math library to use can be selected with the +``--math-lib=`` command line argument. The following values can be provided +for this argument. + +* ``default``: ``ispc``'s default built-in math functions. These have + reasonably high precision. (e.g. ``sin`` has a maximum absolute error of + approximately 1.45e-6 over the range -10pi to 10pi.) +* ``fast``: more efficient but lower accuracy versions of the default ``ispc`` + implementations. +* ``svml``: use Intel "Short Vector Math Library". Use + ``icc`` to link your final executable so that the appropriate libraries + are linked. +* ``system``: use the system's math library. On many systems, these + functions are more accurate than both of ``ispc``'s implementations. 
+ Using these functions may be quite + inefficient; the system math functions only compute one result at a time + (i.e. they aren't vectorized), so ``ispc`` has to call them once per + active program instance. (This is not the case for the other three + options.) + +In addition to an absolute value call, ``abs()``, ``signbits()`` extracts +the sign bit of the given value, returning ``0x80000000`` if the sign bit +is on (i.e. the value is negative) and zero if it is off. + +:: + + float abs(float a) + uniform float abs(uniform float a) + unsigned int signbits(float x) + +Standard rounding functions are provided. (On machines that support Intel® +SSE or Intel® AVX, these functions all map to variants of the ``roundss`` and +``roundps`` instructions, respectively.) + +:: + + float round(float x) + uniform float round(uniform float x) + float floor(float x) + uniform float floor(uniform float x) + float ceil(float x) + uniform float ceil(uniform float x) + +``rcp()`` computes an approximation to ``1/v``. The amount of error is +different on different architectures. + +:: + + float rcp(float v) + uniform float rcp(uniform float v) + +The square root of a given value can be computed with ``sqrt()``, which +maps to hardware square root intrinsics when available. An approximate +reciprocal square root, ``1/sqrt(v)`` is computed by ``rsqrt()``. Like +``rcp()``, the error from this call is different on different +architectures. + +:: + + float sqrt(float v) + uniform float sqrt(uniform float v) + float rsqrt(float v) + uniform float rsqrt(uniform float v) + +A standard set of minimum and maximum functions is available. These +functions also map to corresponding intrinsic functions. 
+ +:: + + float min(float a, float b) + uniform float min(uniform float a, uniform float b) + float max(float a, float b) + uniform float max(uniform float a, uniform float b) + unsigned int min(unsigned int a, unsigned int b) + uniform unsigned int min(uniform unsigned int a, + uniform unsigned int b) + unsigned int max(unsigned int a, unsigned int b) + uniform unsigned int max(uniform unsigned int a, + uniform unsigned int b) + +The ``clamp()`` functions clamp the provided value to the given range. +(Their implementations are based on ``min()`` and ``max()`` and are thus +quite efficient.) + +:: + + float clamp(float v, float low, float high) + uniform float clamp(uniform float v, uniform float low, + uniform float high) + unsigned int clamp(unsigned int v, unsigned int low, + unsigned int high) + uniform unsigned int clamp(uniform unsigned int v, + uniform unsigned int low, + uniform unsigned int high) + +``ispc`` provides a standard variety of calls for trigonometric functions: + +:: + + float sin(float x) + uniform float sin(uniform float x) + float cos(float x) + uniform float cos(uniform float x) + float tan(float x) + uniform float tan(uniform float x) + +Arctangent functions are also available: + +:: + + float atan(float x) + float atan2(float x, float y) + uniform float atan(uniform float x) + uniform float atan2(uniform float x, uniform float y) + +If both sine and cosine are needed, then the ``sincos()`` call computes +both more efficiently than two calls to the respective individual +functions: + +:: + + void sincos(float x, reference float s, reference float c) + void sincos(uniform float x, uniform reference float s, + uniform reference float c) + + +The usual exponential and logarithmic functions are provided. 
+
+::
+
+    float exp(float x)
+    uniform float exp(uniform float x)
+    float log(float x)
+    uniform float log(uniform float x)
+    float pow(float a, float b)
+    uniform float pow(uniform float a, uniform float b)
+
+Some functions that end up doing low-level manipulation of the
+floating-point representation in memory are available. As in the standard
+math library, ``ldexp()`` multiplies the value ``x`` by 2^n, and
+``frexp()`` directly returns the normalized mantissa and stores the
+normalized exponent as a power of two in the ``pw2`` parameter.
+
+::
+
+    float ldexp(float x, int n)
+    uniform float ldexp(uniform float x, uniform int n)
+    float frexp(float x, reference int pw2)
+    uniform float frexp(uniform float x,
+                        reference uniform int pw2)
+
+
+A simple random number generator is provided. State for the RNG
+is maintained in an instance of the ``RNGState`` structure, which is seeded
+with ``seed_rng()``.
+
+::
+
+    struct RNGState;
+    unsigned int random(reference uniform RNGState state)
+    float frandom(reference uniform RNGState state)
+    void seed_rng(reference uniform RNGState state,
+                  uniform int seed)
+
+Output Functions
+----------------
+
+``ispc`` has a simple ``print`` statement for printing values during
+program execution. In the following short ``ispc`` program, there are
+three uses of the ``print`` statement:
+
+::
+
+    export void foo(uniform float f[4], uniform int i) {
+        float x = f[programIndex];
+        print("i = %, x = %\n", i, x);
+        if (x < 2) {
+            ++x;
+            print("added to x = %\n", x);
+        }
+        print("last print of x = %\n", x);
+    }
+
+There are a few things to note. First, the function is called ``print``,
+not ``printf`` (unlike C). Second, the formatting string passed to this
+function only uses a single percent sign to denote where the corresponding
+value should be printed. You don't need to match the types of formatting
+operators with the types being passed.
However, you can't currently use
+the rich data formatting options that ``printf`` provides (e.g. constructs
+like ``%.10f``.).
+
+If this function is called with the array of floats (0,1,2,3) passed in for
+the ``f`` parameter and the value ``10`` for the ``i`` parameter, it
+generates the following output on a four-wide compilation target:
+
+::
+
+    i = 10, x = [0.000000,1.000000,2.000000,3.000000]
+    added to x = [1.000000,2.000000,_________,_________]
+    last print of x = [1.000000,2.000000,2.000000,3.000000]
+
+The values of a "varying" variable for all executing program instances are
+printed when a "varying" variable is printed. The result from the second
+print statement, which was called under control flow in the function
+``foo()`` above, and given the input array (0,1,2,3), only includes the
+first two program instances, since only they entered the ``if`` block.
+Therefore, the values for the inactive program instances aren't printed.
+(In other cases, they may have garbage values or be otherwise undefined.)
+
+
+Cross-Lane Operations
+---------------------
+
+Usually, ``ispc`` code expresses independent computation on separate data
+elements. There are, however, a number of cases where it's useful for the
+program instances to be able to cooperate in computing results. The
+cross-lane operations described in this section provide primitives for
+communication between the running program instances.
+
+A few routines evaluate conditions across the running program
+instances. For example, ``any()`` returns ``true`` if the given value
+``v`` is ``true`` for any of the SPMD program instances currently running,
+and ``all()`` returns ``true`` if it is true for all of them.
+
+::
+
+    uniform bool any(bool v)
+    uniform bool all(bool v)
+
+The various variants of ``popcnt()`` return the population count--the
+number of bits set in the given value.
+ +:: + + uniform int popcnt(uniform int v) + int popcnt(int v) + uniform int popcnt(bool v) + +The ``lanemask()`` function returns an integer that encodes which of the +current SPMD program instances are currently executing. The i'th bit is +set if the i'th SIMD lane is currently active. + +:: + + uniform int lanemask() + +You can compute reductions across the program instances. For example, the +values in each of the SIMD lanes ``x`` are added together by +``reduce_add()``. If this function is called under control flow, it only +adds the values for the currently active program instances. + +:: + + uniform float reduce_add(float x) + uniform int reduce_add(int x) + uniform unsigned int reduce_add(unsigned int x) + +You can also use functions to compute the minimum and maximum value of the +given value across all of the currently-executing vector lanes. + +:: + + uniform float reduce_min(float a, float b) + uniform int reduce_min(int a, int b) + uniform unsigned int reduce_min(unsigned int a, unsigned int b) + uniform float reduce_max(float a, float b) + uniform int reduce_max(int a, int b) + uniform unsigned int reduce_max(unsigned int a, unsigned int b) + + +Finally, there are routines for writing out and reading in values from +linear memory locations for the active program instances. +``packed_load_active()`` loads consecutive values from the given array, +starting at ``a[offset]``, loading one value for each currently-executing +program instance and storing it into that program instance's ``val`` +variable. It returns the total number of values loaded. Similarly, +``packed_store_active()`` stores the ``val`` values for each program +instances that executed the ``packed_store_active()`` call, storing the +results into the given array starting at the given offset. It returns the +total number of values stored. 
+ +:: + + uniform unsigned int packed_load_active(uniform int a[], + uniform int offset, + reference int val) + uniform unsigned int packed_store_active(uniform int a[], + uniform int offset, + int val) + + +As an example of how these functions can be used, the following code shows +the use of ``packed_store_active()``. The program instances that are +executing each compute some value ``x``; we'd like to record the program +index values of the program instances for which ``x`` is less than zero, if +any. In following the code, the ``programIndex`` value for each program +instance is written into the ``ids`` array only if ``x < 0`` for that +program instance. The total number of values written into ``ids`` is +returned from ``packed_store_active()``. + +:: + + uniform int ids[100]; + uniform int offset = 0; + float x = ...; + if (x < 0) + offset += packed_store_active(ids, offset, programIndex); + + +Finally, there are primitive operations that extract and set values in the +SIMD lanes. You can implement all of the operations described +above in this section from these routines, though in general, not as +efficiently. These routines are useful for implementing other reductions +and cross-lane communication that isn't included in the above, though. +Given a ``varying`` value, ``extract()`` returns the i'th element of it as +a single ``uniform`` value. Similarly, ``insert`` returns a new value +where the ``i`` th element of ``x`` has been replaced with the value ``v`` +. + +:: + + uniform float extract(float x, uniform int i) + uniform int extract(int x, uniform int i) + float insert(float x, uniform int i, uniform float v) + int insert(int x, uniform int i, uniform int v) + + +Low-Level Bits +-------------- + +``ispc`` provides a number of bit/memory-level utility routines in its +standard library as well. 
It has routines that load from and store
+to 8-bit and 16-bit integer values stored in memory, converting to and from
+32-bit integers for use in computation in ``ispc`` code. (These functions
+and this conversion step are necessary because ``ispc`` doesn't have native
+8-bit or 16-bit types in the language.)
+
+::
+
+    unsigned int load_from_int8(uniform int a[],
+                                uniform int offset)
+    void store_to_int8(uniform int a[], uniform int offset,
+                       unsigned int val)
+    unsigned int load_from_int16(uniform int a[],
+                                 uniform int offset)
+    void store_to_int16(uniform int a[], uniform int offset,
+                        unsigned int val)
+
+There are two things to note in these functions. First, note that these
+functions take ``unsigned int`` arrays as parameters; you need
+to cast the ``int8_t`` and ``int16_t`` pointers from the C/C++ side to
+``unsigned int`` when passing them to ``ispc`` code. Second, although the
+arrays are passed as ``unsigned int``, in the array indexing calculation,
+with the ``offset`` parameter, they are treated as if they were ``int8`` or
+``int16`` types. (i.e. the offset is treated as being in terms of number of
+8 or 16-bit elements.)
+
+The ``intbits()`` and ``floatbits()`` functions can be used to implement
+low-level floating-point bit twiddling. For example, ``intbits()`` returns
+an ``unsigned int`` that is a bit-for-bit copy of the given ``float``
+value. (Note: it is **not** the same as ``(int)a``, but corresponds to
+something like ``*((int *)&a)`` in C.)
+
+::
+
+    float floatbits(unsigned int a);
+    uniform float floatbits(uniform unsigned int a);
+    unsigned int intbits(float a);
+    uniform unsigned int intbits(uniform float a);
+
+
+The ``intbits()`` and ``floatbits()`` functions have no cost at runtime;
+they just let the compiler know how to interpret the bits of the given
+value. They make it possible to efficiently write functions that take
+advantage of the low-level bit representation of floating-point values.
+
+For example, the ``abs()`` function in the standard library is implemented
+as follows:
+
+::
+
+    float abs(float a) {
+        unsigned int i = intbits(a);
+        i &= 0x7fffffff;
+        return floatbits(i);
+    }
+
+That is, it clears the high-order bit to ensure that the given
+floating-point value is positive. This compiles down to a single ``andps``
+instruction when used with an Intel® SSE target, for example.
+
+
+Interoperability with the Application
+=====================================
+
+One of ``ispc``'s key goals is to make it easy to interoperate between the
+C/C++ application code and parallel code written in ``ispc``. This
+section describes the details of how this works and describes a number of
+the pitfalls.
+
+Interoperability Overview
+-------------------------
+
+As described in `Compiling and Running a Simple ISPC Program`_ it's
+relatively straightforward to call ``ispc`` code from C/C++. First, any
+``ispc`` functions to be called should be defined with the ``export``
+keyword:
+
+::
+
+    export void foo(uniform float a[]) {
+        ...
+    }
+
+
+This function corresponds to the following C-callable function:
+
+::
+
+    void foo(float a[]);
+
+
+(Recall from the `Uniform and Varying Qualifiers`_ section
+that ``uniform`` types correspond to a single instance of the
+corresponding type in C/C++.)
+
+In addition to variables passed from the application to ``ispc`` in the
+function call, you can also share global variables between the application
+and ``ispc``. To do so, just declare the global variable as usual (in
+either ``ispc`` or application code), and add an ``extern`` declaration on
+the other side.
+
+For example, given this ``ispc`` code:
+
+::
+
+    // ispc code
+    uniform float foo;
+    extern uniform float bar[10];
+
+And this C++ code:
+
+::
+
+    // C++ code
+    extern float foo;
+    float bar[10];
+
+Both the ``foo`` and ``bar`` global variables can be accessed on each
+side.
+
+``ispc`` code can also call back to C/C++.
On the ``ispc`` side, any
+application functions to be called must be declared with the ``extern "C"``
+qualifier.
+
+::
+
+    extern "C" void foo(uniform float f, uniform float g);
+
+Unlike in C++, ``extern "C"`` doesn't take braces to delineate
+multiple functions to be declared; thus, multiple C functions to be called
+from ``ispc`` must be declared as follows:
+
+::
+
+    extern "C" void foo(uniform float f, uniform float g);
+    extern "C" uniform int bar(uniform int a);
+
+It is illegal to overload functions declared with ``extern "C"`` linkage;
+``ispc`` issues an error in this case.
+
+Function calls back to C/C++ are not made if none of the program instances
+want to make the call. For example, given code like:
+
+::
+
+    uniform float foo = ...;
+    float x = ...;
+    if (x != 0)
+        foo = appFunc(foo);
+
+
+``appFunc()`` will only be called if one or more of the running program
+instances evaluates ``true`` for ``x != 0``. If the application code would
+like to determine which of the running program instances want to make the
+call, a mask representing the active SIMD lanes can be passed to the
+function.
+
+::
+
+    extern "C" float appFunc(uniform float x,
+                             uniform int activeLanes);
+
+If the function is then called as:
+
+::
+
+    ...
+    x = appFunc(x, lanemask());
+
+The ``activeLanes`` parameter will have the value one in the 0th bit if the
+first program instance is running at this point in the code, one in the
+first bit for the second instance, and so forth. (The ``lanemask()``
+function is documented in `Cross-Lane Operations`_.) Application code can
+thus be written as:
+
+::
+
+    float appFunc(float x, int activeLanes) {
+        for (int i = 0; i < programCount; ++i)
+            if ((activeLanes & (1 << i)) != 0) {
+                // do computation for i'th SIMD lane
+            }
+    }
+
+
+Data Layout
+-----------
+
+In general, ``ispc`` tries to ensure that ``struct`` s and other complex
+datatypes are laid out in the same way in memory as they are in C/C++.
+
+Matching alignment is important for easy interoperability between C/C++
+code and ``ispc`` code.
+
+The main complexity in sharing data between ``ispc`` and C/C++ often comes
+from reconciling data structures between ``ispc`` code and application
+code; it can be useful to declare the shared structures in ``ispc`` code
+and then examine the generated header file (which will have the C/C++
+equivalents of them.) For example, given a structure in ``ispc``:
+
+::
+
+    // ispc code
+    struct Node {
+        uniform int count;
+        uniform float pos[3];
+    };
+
+If the ``Node`` structure is used in the parameters to an ``export`` ed
+function, then the header file generated by the ``ispc`` compiler will
+have a declaration like:
+
+::
+
+    // C/C++ code
+    struct Node {
+        int count;
+        float pos[3];
+    };
+
+Because ``varying`` types have different sizes on different processor
+architectures, ``ispc`` prohibits any varying types from being used in
+parameters to functions with the ``export`` qualifier. (``ispc`` also
+prohibits passing structures that themselves have varying types as members,
+etc.) Thus, all datatypes that are shared with the application must have
+the ``uniform`` qualifier applied to them. (See `Understanding How to
+Interoperate With the Application's Data`_ for more discussion of how to
+load vectors of SoA or AoSoA data from the application.)
+
+While ``ispc`` doesn't support pointers, there are two mechanisms to work
+with pointers to arrays from the application.
First, ``ispc`` passes +arrays by reference (like C), if the application has allocated an array by: + +:: + + // C++ code + float *array = new float[count]; + +It can pass ``array`` to a ``ispc`` function defined as: + +:: + + export void foo(uniform float array[], uniform int count) + +(Though the pointer must be aligned to the compilation target's natural +vector width; see the discussion of alignment restrictions in `Data +Alignment and Aliasing`_ and the aligned allocation routines in +``examples/options/options.cpp`` for example.) + +Similarly, ``struct`` s from the application can have embedded pointers. +This is handled with similar ``[]`` syntax: + +:: + + // C code + struct Foo { + float *foo, *bar; + }; + +On the ``ispc`` side, the corresponding ``struct`` declaration is: + +:: + + // ispc + struct Foo { + uniform float foo[], bar[]; + }; + +There are two subtleties related to data layout to be aware of. First, the +C++ specification doesn't define the size or memory layout of ``bool`` s. +Therefore, it's dangerous to share ``bool`` values in memory between +``ispc`` code and C/C++ code. + +Second, ``ispc`` stores ``uniform`` short-vector types in memory with their +first element at the machine's natural vector alignment (i.e. 16 bytes for +a target that is using Intel® SSE, and so forth.) This implies that these +types will have different layout on different compilation targets. As +such, applications should in general avoid accessing ``uniform`` short +vector types from C/C++ application code if possible. + +Data Alignment and Aliasing +--------------------------- + +There are two important constraints that must be adhered to when passing +pointers from the application to ``ispc`` programs. + +The first constraint is alignment: any pointers from the host program that +are passed to ``ispc`` must be aligned to natural vector alignment of +system--for example, 16 byte alignment on a target that supports Intel® +SSE, 32-byte on an Intel® AVX target. 
If this constraint isn't met, the +program may abort at runtime with an unaligned memory access error. + +For example, in a ``ispc`` function with the following declaration: + +:: + + export void foo(uniform float in[], uniform float out[], + int count); + +If the application is passing stack-allocated arrays for ``in`` and +``out``, these C/C++ compiler must be told to align these arrays. + +:: + + // MSVC, SSE target + __declspec(align(16)) float in[16], out[16]; + foo(in, out, 16); + +With the gcc/clang compilers, the syntax for providing alignment is +slightly different: + +:: + + float x[16] __attribute__ ((__align__(16))); + foo(in, out, 16); + +If the data being passed is dynamically allocated, the appropriate system +aligned memory allocation routine should be used to allocate it (for +example, ``_aligned_malloc()`` with Windows\*, ``memalign()`` with +Linux\*; see the ``AllocAligned()`` function in ``examples/rt/rt.cpp`` for +an example.) + +It is also required that it be valid to read memory at the first element of +any array that is passed to ``ispc``. In practice, this should just +happen naturally, but it does mean that it is illegal to pass a ``NULL`` +pointer as a parameter to a ``ispc`` function called from the application. + +The second key constraint is that pointers and references in ``ispc`` +programs must not alias. The ``ispc`` compiler assumes that different +pointers can't end up pointing to the same memory location, either due to +having the same initial value, or through array indexing in the program as +it executed. + +This aliasing constraint also applies to ``reference`` parameters to +functions. Given a function like: + +:: + + void func(reference int a, reference int b) { + a = 0; + if (b == 0) { ... } + } + +Then if the same variable must not be passed to ``func()``. 
This is
+another case of aliasing, and if the caller calls the function as ``func(x,
+x)``, it's not guaranteed that the ``if`` test will evaluate to true, due
+to the compiler's requirement of no aliasing.
+
+(In the future, ``ispc`` will have the ability to work with unaligned
+memory as well as have a mechanism to indicate that pointers may alias.)
+
+Using ISPC Effectively
+======================
+
+Restructuring Existing Programs to Use ISPC
+-------------------------------------------
+
+``ispc`` is designed to enable you to incorporate
+SPMD parallelism into existing code with minimal modification; features
+like the ability to share memory and data structures between C/C++ and
+``ispc`` code and the ability to directly call back and forth between
+``ispc`` and C/C++ are motivated by this. These features also make it
+easy to incrementally transform a program to use ``ispc``; the most
+computationally-intensive localized parts of the computation can be
+transformed into ``ispc`` code while the remainder of the system is left
+as is.
+
+For a given section of code to be transitioned to run in ``ispc``, the
+next question is how to parallelize the computation. Generally, there will
+be obvious loops inside which a large amount of computation is done ("for
+each ray", "for each pixel", etc.) Mapping these to the SPMD computational
+style is often effective.
+
+Carefully choose how to do the exact mapping of computation to SPMD program
+instances. This choice can impact the mix of gather/scatter memory access
+versus coherent memory access, for example. (See more on this in the
+section `Gather and Scatter`_ below.) This decision can also impact the
+coherence of control flow across the running SPMD program instances, which
+can also have a significant effect on performance; in general, creating
+groups of work that will tend to do similar computation across the SPMD
+program instances improves performance.
+ +Understanding How to Interoperate With the Application's Data +------------------------------------------------------------- + +One of ``ispc``'s key goals is to be able to interoperate with the +application's data, in whatever layout it is stored in. You don't need to +worry about reformatting of data or the overhead of a driver model that +abstracts the data layout. This section illustrates some of the +alternatives with a simple example of computing the length of a large +number of vectors. + +Consider for starters a ``Vector`` data-type, defined in C as: + +:: + + struct Vector { float x, y, z; }; + +We might have (still in C) an array of ``Vector`` s defined like this: + +:: + + Vector vectors[1024]; + +This is called an "array of structures" (AoS) layout. To compute the +lengths of these vectors in parallel, you can write ``ispc`` code like +this: + +:: + + export void length(Vector vectors[1024], uniform float len[]) { + for (uniform int i = 0; i < 1024; i += programCount) { + int index = i+programIndex; + float x = vectors[index].x; + float y = vectors[index].y; + float z = vectors[index].z; + float l = sqrt(x*x + y*y + z*z); + len[index] = l; + } + } + +The ``vectors`` array has been indexed using ``programIndex`` in +order to "peel off" ``programCount`` worth of values to compute the length +of each time through the loop. + +The problem with this implementation is that the indexing into the array of +structures, ``vectors[index].x`` is relatively expensive. On a target +machine that supports four-wide Intel® SSE, this turns into four loads of +single ``float`` values from non-contiguous memory locations, which are +then packed into a four-wide register corresponding to ``float x``. Once the +values are loaded into the local ``x``, ``y``, and ``z`` variables, +SIMD-efficient computation can proceed; getting to that point is +relatively inefficient. + +An alternative would be the "structure of arrays" (SoA) layout. 
In C, the +data would be declared as: + +:: + + float x[1024], y[1024], z[1024]; + +The ``ispc`` code might be: + +:: + + export void length(uniform float x[1024], uniform float y[1024], + uniform float z[1024], uniform float len[]) { + for (uniform int i = 0; i < 1024; i += programCount) { + int index = i+programIndex; + float xx = x[index]; + float yy = y[index]; + float zz = z[index]; + float l = sqrt(xx*xx + yy*yy + zz*zz); + len[index] = l; + } + } + +In this example, the loads into ``xx``, ``yy``, and ``zz`` are single +vector loads of ``programCount`` values into the corresponding registers. +This processing is more efficient than the multiple scalar loads that are +required with the AoS layout above. + +A final alternative is "array of structures of arrays" (AoSoA), a hybrid +between these two. A structure is declared that stores a small number of +``x``, ``y``, and ``z`` values in contiguous memory locations: + +:: + + struct Vector16 { + float x[16], y[16], z[16]; + }; + + +The ``ispc`` code has an outer loop over ``Vector16`` elements and +then an inner loop that peels off values from the element members: + +:: + + #define N_VEC (1024/16) + export void length(Vector16 v[N_VEC], uniform float len[]) { + for (uniform int i = 0; i < N_VEC; ++i) { + for (uniform int j = 0; j < 16; j += programCount) { + int index = j + programIndex; + float x = v[i].x[index]; + float y = v[i].y[index]; + float z = v[i].z[index]; + float l = sqrt(x*x + y*y + z*z); + len[index] = l; + } + } + } + +(This code assumes that ``programCount`` divides 16 equally. See below for +discussion of the more general case.) One advantage of the AoSoA layout is +that the memory accesses to load values are to nearby memory locations, +whereas with SoA, each of the three loads above is to locations separated +by a few thousand bytes. Thus, AoSoA can be more cache friendly. For +structures with many members, this difference can lead to a substantial +improvement. 
+ +``ispc`` can also efficiently process data in AoSoA layout where the inner +array length is less than the machine vector width. For example, consider +doing computation with this AoSoA structure definition on a machine with an +8-wide vector unit (for example, an Intel® AVX target): + +:: + + struct Vector4 { + float x[4], y[4], z[4]; + }; + + +The ``ispc`` code to process this loads elements four at a time from +``Vector4`` instances until it has a full ``programCount`` number of +elements to work with and then proceeds with the computation. + +:: + + #define N_VEC (1024/4) + export void length(Vector4 v[N_VEC], uniform float len[]) { + for (uniform int i = 0; i < N_VEC; i += programCount / 4) { + float x, y, z; + for (uniform int j = 0; j < programCount / 4; ++j) { + if (programIndex >= 4 * j && + programIndex < 4 * (j+1)) { + int index = (programIndex & 0x3); + x = v[i+j].x[index]; + y = v[i+j].y[index]; + z = v[i+j].z[index]; + } + } + float l = sqrt(x*x + y*y + z*z); + len[4*i + programIndex] = l; + } + } + + +Communicating Between SPMD Program Instances +-------------------------------------------- + +The ``programIndex`` built-in variable (see `Mapping Data To Program +Instances`_) can be used to communicate between the set of executing +program instances. Consider the following code, which shows all of the +program instances writing into unique locations in an array. + +:: + + float x = ...; + uniform float allX[programCount]; + allX[programIndex] = x; + +In this code, a program instance that reads ``allX[0]`` finds the value of +``x`` that was computed by the first of the running program instances, and +so forth. Program instances can communicate with their neighbor instances +with indexing like ``allX[(programIndex+1)%programCount]``. + + +Gather and Scatter +------------------ + +The CPU is a poor fit for SPMD execution in some ways, the worst of which +is handling of general memory reads and writes from SPMD program instances. 
+For example, in a "simple" array index: + +:: + + int i = ....; + uniform float x[10] = { ... }; + float f = x[i]; + +Since the index ``i`` is a varying value, the various SPMD program +instances will in general be reading different locations in the array +``x``. Because the CPU doesn't have a gather instruction, the ``ispc`` +compiler has to serialize these memory reads, performing a separate memory +load for each running program instance, packing the result into ``f``. +(And the analogous case would happen for a write into ``x[i]``.) + +In many cases, gathers like these are unavoidable; the running program +instances just need to access incoherent memory locations. However, if the +array index ``i`` could actually be declared and used as a ``uniform`` +variable, the resulting array index is substantially more +efficient. This is another case where using ``uniform`` whenever applicable +is of benefit. + +In some cases, the ``ispc`` compiler is able to deduce that the memory +locations accessed are either all the same or are uniform. For example, +given: + +:: + + uniform int x = ...; + int y = x; + return array[y]; + +The compiler is able to determine that all of the program instances are +loading from the same location, even though ``y`` is not a ``uniform`` +variable. In this case, the compiler will transform this load to a regular vector +load, rather than a general gather. + +Sometimes the running program instances will access a +linear sequence of memory locations; this happens most frequently when +array indexing is done based on the built-in ``programIndex`` variable. In +many of these cases, the compiler is also able to detect this case and then +do a vector load. For example, given: + +:: + + uniform int x = ...; + return array[2*x + programIndex]; + +A regular vector load is done from array, starting at offset ``2*x``. + +Low-level Vector Tricks +----------------------- + +Many low-level Intel® SSE coding constructs can be implemented in ``ispc`` +code. 
For example, the following code efficiently reverses the sign of the +given values. + +:: + + float flipsign(float a) { + unsigned int i = intbits(a); + i ^= 0x80000000; + return floatbits(i); + } + +This code compiles down to a single XOR instruction. + +Debugging +--------- + +Support for debugging in ``ispc`` is in progress. On Linux\* and Mac +OS\*, the ``-g`` command-line flag can be supplied to the compiler, +which causes it to generate debugging symbols. Running ``ispc`` programs +in the debugger, setting breakpoints, printing out variables and the like +all generally works, though there is occasional unexpected behavior. + +Another option for debugging (the only current option on Windows\*) is +to use the ``print`` statement for ``printf()`` +style debugging. You can also use the ability to call back to +application code at particular points in the program, passing a set of +variable values to be logged or otherwise analyzed from there. + +The "Fast math" Option +---------------------- + +``ispc`` has a ``--fast-math`` command-line flag that enables a number of +optimizations that may be undesirable in code where numerical precision is +critically important. For many graphics applications, the +approximations may be acceptable. The following two optimizations are +performed when ``--fast-math`` is used. By default, the ``--fast-math`` +flag is off. + +* Expressions like ``x / y``, where ``y`` is a compile-time constant, are + transformed to ``x * (1./y)``, where the inverse value of ``y`` is + precomputed at compile time. + +* Expressions like ``x / y``, where ``y`` is not a compile-time constant, + are transformed to ``x * rcp(y)``, where ``rcp()`` maps to the + approximate reciprocal instruction from the standard library. + + +"Inline" Aggressively +--------------------- + +Inlining functions aggressively is generally beneficial for performance +with ``ispc``. 
Definitely use the ``inline`` qualifier for any short +functions (a few lines long), and experiment with it for longer functions. + +Small Performance Tricks +------------------------ + +Performance is slightly improved by declaring variables at the same block +scope where they are first used. For example, in code like the +following, if the lifetime of ``foo`` is only within the scope of the +``if`` clause, write the code like this: + +:: + + float func() { + .... + if (x < y) { + float foo; + ... use foo ... + } + } + +Try not to write code as: + +:: + + float func() { + float foo; + .... + if (x < y) { + ... use foo ... + } + } + +Doing so can reduce the number of masked store instructions that the +compiler needs to generate. + +Instrumenting Your ISPC Programs +-------------------------------- + +``ispc`` has an optional instrumentation feature that can help you +understand performance issues. If a program is compiled using the +``--instrument`` flag, the compiler emits calls to a function with the +following signature at various points in the program (for +example, at interesting points in the control flow, when scatters or +gathers happen.) + +:: + + extern "C" { + void ISPCInstrument(const char *fn, const char *note, + int line, int mask); + } + +This function is passed the file name of the ``ispc`` file running, a short +note indicating what is happening, the line number in the source file, and +the current mask of active SPMD program lanes. You must provide an +implementation of this function and link it in with your application. + +For example, when the ``ispc`` program runs, this function might be called +as follows: + +:: + + ISPCInstrument("foo.ispc", "function entry", 55, 0xf); + +This call indicates that the currently executing program has just +entered the function defined at line 55 of the file ``foo.ispc``, with a +mask of all lanes currently executing (assuming a four-wide Intel® SSE +target machine). 
+ +For a fuller example of the utility of this functionality, see +``examples/aobench_instrumented`` in the ``ispc`` distribution. This +example includes an implementation of the ``ISPCInstrument`` function that +collects aggregate data about the program's execution behavior. + +When running this example, you will want to direct the ``ao`` executable +to generate a low resolution image, because the instrumentation adds +substantial execution overhead. For example: + +:: + + % ./ao 1 32 32 + +After the ``ao`` program exits, a summary report along the following lines +will be printed. In the first few lines, you can see how many times a few +functions were called, and the average percentage of SIMD lanes that were +active upon function entry. + +:: + + ao.ispc(0067) - function entry: 342424 calls (0 / 0.00% all off!), 95.86% active lanes + ao.ispc(0067) - return: uniform control flow: 342424 calls (0 / 0.00% all off!), 95.86% active lanes + ao.ispc(0071) - function entry: 1122 calls (0 / 0.00% all off!), 97.33% active lanes + ao.ispc(0075) - return: uniform control flow: 1122 calls (0 / 0.00% all off!), 97.33% active lanes + ao.ispc(0079) - function entry: 10072 calls (0 / 0.00% all off!), 45.09% active lanes + ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes + ... + +Disclaimer and Legal Information +================================ + +INFORMATION IN THIS DOCUMENT IS PROVIDED IN CONNECTION WITH INTEL(R) PRODUCTS. +NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL +PROPERTY RIGHTS IS GRANTED BY THIS DOCUMENT. EXCEPT AS PROVIDED IN INTEL'S TERMS +AND CONDITIONS OF SALE FOR SUCH PRODUCTS, INTEL ASSUMES NO LIABILITY WHATSOEVER, +AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY, RELATING TO SALE AND/OR USE +OF INTEL PRODUCTS INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A +PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT +OR OTHER INTELLECTUAL PROPERTY RIGHT. 
+ +UNLESS OTHERWISE AGREED IN WRITING BY INTEL, THE INTEL PRODUCTS ARE NOT DESIGNED +NOR INTENDED FOR ANY APPLICATION IN WHICH THE FAILURE OF THE INTEL PRODUCT COULD +CREATE A SITUATION WHERE PERSONAL INJURY OR DEATH MAY OCCUR. + +Intel may make changes to specifications and product descriptions at any time, +without notice. Designers must not rely on the absence or characteristics of any +features or instructions marked "reserved" or "undefined." Intel reserves these +for future definition and shall have no responsibility whatsoever for conflicts +or incompatibilities arising from future changes to them. The information here +is subject to change without notice. Do not finalize a design with this +information. + +The products described in this document may contain design defects or errors +known as errata which may cause the product to deviate from published +specifications. Current characterized errata are available on request. + +Contact your local Intel sales office or your distributor to obtain the latest +specifications and before placing your product order. + +Copies of documents which have an order number and are referenced in this +document, or other Intel literature, may be obtained by calling 1-800-548-4725, +or by visiting Intel's Web Site. + +Intel processor numbers are not a measure of performance. Processor numbers +differentiate features within each processor family, not across different +processor families. See http://www.intel.com/products/processor_number for +details. + +BunnyPeople, Celeron, Celeron Inside, Centrino, Centrino Atom, +Centrino Atom Inside, Centrino Inside, Centrino logo, Core Inside, FlashFile, +i960, InstantIP, Intel, Intel logo, Intel386, Intel486, IntelDX2, IntelDX4, +IntelSX2, Intel Atom, Intel Atom Inside, Intel Core, Intel Inside, +Intel Inside logo, Intel. Leap ahead., Intel. Leap ahead. 
logo, Intel NetBurst, +Intel NetMerge, Intel NetStructure, Intel SingleDriver, Intel SpeedStep, +Intel StrataFlash, Intel Viiv, Intel vPro, Intel XScale, Itanium, +Itanium Inside, MCS, MMX, Oplus, OverDrive, PDCharm, Pentium, Pentium Inside, +skoool, Sound Mark, The Journey Inside, Viiv Inside, vPro Inside, VTune, Xeon, +and Xeon Inside are trademarks of Intel Corporation in the U.S. and other +countries. + +* Other names and brands may be claimed as the property of others. + +Copyright(C) 2011, Intel Corporation. All rights reserved. + + +Optimization Notice +=================== + +Intel compilers, associated libraries and associated development tools may +include or utilize options that optimize for instruction sets that are +available in both Intel and non-Intel microprocessors (for example SIMD +instruction sets), but do not optimize equally for non-Intel +microprocessors. In addition, certain compiler options for Intel +compilers, including some that are not specific to Intel +micro-architecture, are reserved for Intel microprocessors. For a detailed +description of Intel compiler options, including the instruction sets and +specific microprocessors they implicate, please refer to the "Intel +Compiler User and Reference Guides" under "Compiler Options." Many library +routines that are part of Intel compiler products are more highly optimized +for Intel microprocessors than for other microprocessors. While the +compilers and libraries in Intel compiler products offer optimizations for +both Intel and Intel-compatible microprocessors, depending on the options +you select, your code and other factors, you likely will get extra +performance on Intel microprocessors. + +Intel compilers, associated libraries and associated development tools may +or may not optimize to the same degree for non-Intel microprocessors for +optimizations that are not unique to Intel microprocessors. 
These +optimizations include Intel® Streaming SIMD Extensions 2 (Intel® SSE2), +Intel® Streaming SIMD Extensions 3 (Intel® SSE3), and Supplemental +Streaming SIMD Extensions 3 (Intel SSSE3) instruction sets and other +optimizations. Intel does not guarantee the availability, functionality, +or effectiveness of any optimization on microprocessors not manufactured by +Intel. Microprocessor-dependent optimizations in this product are intended +for use with Intel microprocessors. + +While Intel believes our compilers and libraries are excellent choices to +assist in obtaining the best performance on Intel and non-Intel +microprocessors, Intel recommends that you evaluate other compilers and +libraries to determine which best meet your requirements. We hope to win +your business by striving to offer the best performance of any compiler or +library; please let us know if you find we do not. diff --git a/doxygen.cfg b/doxygen.cfg new file mode 100644 index 00000000..9f79b909 --- /dev/null +++ b/doxygen.cfg @@ -0,0 +1,1685 @@ +# Doxyfile 1.7.2 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" "). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. 
See +# http://www.gnu.org/software/libiconv for the list of possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = "Intel SPMD Program Compiler" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = 1.0 + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = docs/doxygen + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. 
+ +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = NO + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. 
If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. 
+ +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given extension. +# Doxygen has a built-in mapping, but you can override or extend it using this +# tag. The format is ext=language, where ext is a file extension, and language +# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, +# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make +# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C +# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions +# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If you use STL classes (i.e. std::string, std::vector, etc.) 
but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. +# func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate getter +# and setter methods for a property. Setting this option to YES (the default) +# will make doxygen replace the get and set methods by a property in the +# documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. 
+ +SUBGROUPING = YES + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. + +TYPEDEF_HIDES_STRUCT = NO + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penality. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory. Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols + +SYMBOL_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. 
+# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespaces are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. 
+ +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. 
+ +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. 
+ +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or macro consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and macros in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# If the sources in your project are distributed over multiple directories +# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy +# in the documentation. The default is NO. + +SHOW_DIRECTORIES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used. 
+ +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = NO + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# The WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. 
+
+INPUT = builtins.h \
+ ctx.h \
+ decl.h \
+ expr.h \
+ gatherbuf.h \
+ ispc.h \
+ llvmutil.h \
+ module.h \
+ opt.h \
+ stmt.h \
+ sym.h \
+ type.h \
+ util.h \
+ builtins.cpp \
+ ctx.cpp \
+ decl.cpp \
+ expr.cpp \
+ gatherbuf.cpp \
+ ispc.cpp \
+ llvmutil.cpp \
+ main.cpp \
+ module.cpp \
+ opt.cpp \
+ stmt.cpp \
+ sym.cpp \
+ type.cpp \
+ util.cpp \
+ parse.yy \
+ lex.ll \
+ stdlib-c.c
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.vhd *.vhdl
+
+FILE_PATTERNS =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input. 
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH = ./examples
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = YES
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. 
Doxygen will then use the output that the filter program writes +# to standard output. +# If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. +# Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. +# The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER +# is applied to all files. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = NO + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. 
+ +REFERENCED_BY_RELATION = YES + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = YES + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. +# Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = YES + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 4 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. 
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. +# Doxygen will adjust the colors in the stylesheet and background images +# according to this color. 
Hue is specified as an angle on a colorwheel, +# see http://en.wikipedia.org/wiki/Hue for more information. +# For instance the value 0 represents red, 60 is yellow, 120 is green, +# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. +# The allowed range is 0 to 359. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of +# the colors in the HTML output. For a value of 0 the output will use +# grayscales only. A value of 255 will produce the most vivid colors. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to +# the luminance component of the colors in the HTML output. Values below +# 100 gradually make the output lighter, whereas values above 100 make +# the output darker. The value divided by 100 is the actual gamma applied, +# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, +# and 100 does not change the gamma. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting +# this to NO can help when comparing the output of multiple runs. + +HTML_TIMESTAMP = YES + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. For this to work a browser that supports +# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). 
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation. 
+ +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content. + +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated +# that can be used as input for Qt's qhelpgenerator to generate a +# Qt Compressed Help (.qch) of the generated HTML documentation. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. 
For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+#
+# Qt Help Project / Custom Filters.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+#
+# Qt Help Project / Filter Attributes.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name. 
+ +ECLIPSE_DOC_ID = org.doxygen.Project + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [0,1..20]) +# that doxygen will group on one line in the generated HTML documentation. +# Note that a value of 0 will completely suppress the enum values from appearing in the overview section. + +ENUM_VALUES_PER_LINE = 4 + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, +# and Class Hierarchy pages using a tree view instead of an ordered list. + +USE_INLINE_TREES = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open +# links to external symbols imported via tag files in a separate window. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. 
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want the formulas to look prettier in the
+# HTML output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH = http://www.mathjax.org/mathjax
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. 
+ +SEARCHENGINE = NO + +# When the SERVER_BASED_SEARCH tag is enabled the search engine will be +# implemented using a PHP enabled web server instead of at the web client +# using Javascript. Doxygen will generate the search PHP script and index +# file to put on the web server. The advantage of the server +# based approach is that it scales better to large projects and allows +# full text search. The disadvantages are that it is more difficult to setup +# and does not have live searching capabilities. + +SERVER_BASED_SEARCH = NO + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. +# Note that when enabling USE_PDFLATEX this option is only used for +# generating bitmaps for formulas in the HTML output, but not in the +# Makefile that is written to the output directory. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. 
Possible values are: a4, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = letter + +# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = NO + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = NO + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include +# source code with syntax highlighting in the LaTeX output. +# Note that which sources are shown also depends on other settings +# such as SOURCE_BROWSER. 
+ +LATEX_SOURCE_CODE = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. 
+ +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. 
+ +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. 
+# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# in the INCLUDE_PATH (see below) will be search if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = /usr/include + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. 
If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon. Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. 
If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. +# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = YES + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option also works with HAVE_DOT disabled, but it is recommended to +# install and use dot, since it yields more powerful graphs. + +CLASS_DIAGRAMS = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. Doxygen will then run the mscgen tool (see +# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the +# documentation. The MSCGEN_PATH tag allows you to specify the directory where +# the mscgen tool resides. 
If left empty the tool is assumed to be found in the +# default search path. + +MSCGEN_PATH = + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = NO + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = YES + +# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is +# allowed to run in parallel. When set to 0 (the default) doxygen will +# base this on the number of processors available in the system. You can set it +# explicitly to a value larger than 0 to get control over the balance +# between CPU load and processing speed. + +DOT_NUM_THREADS = 0 + +# By default doxygen will write a font called FreeSans.ttf to the output +# directory and reference it in all dot files that doxygen generates. This +# font does not include all possible unicode characters however, so when you need +# these (or just want a differently looking font) you can specify the font name +# using DOT_FONTNAME. You need need to make sure dot is able to find the font, +# which can be done by putting it in a standard location or by setting the +# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory +# containing the font. + +DOT_FONTNAME = FreeSans + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the output directory to look for the +# FreeSans.ttf font (which doxygen will put there itself). If you specify a +# different font using DOT_FONTNAME you can set the path where dot +# can find it using this tag. 
+ +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# the CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT options are set to YES then +# doxygen will generate a call dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. 
So in most cases it will be better to enable call graphs +# for selected functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then +# doxygen will generate a caller dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable caller +# graphs for selected functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are png, jpg, or gif. +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = /usr/local/bin/dot + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the +# \mscfile command). + +MSCFILE_DIRS = + +# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. 
If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. Note that doxygen if the +# number of direct children of the root node in a graph is already larger than +# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lay further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, because dot on Windows does not +# seem to support this out of the box. Warning: Depending on the platform used, +# enabling this option may lead to badly anti-aliased labels on the edges of +# a graph (i.e. they become hard to read). + +DOT_TRANSPARENT = YES + +# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. 
+ +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES diff --git a/examples/README.txt b/examples/README.txt new file mode 100644 index 00000000..7ef078db --- /dev/null +++ b/examples/README.txt @@ -0,0 +1,88 @@ +==================== +ISPC Examples README +==================== + +This directory has a number of sample ispc programs. Before building them +(on any system), install the appropriate ispc compiler binary into a +directory in your path. Then, if you're running Windows, open the +"examples.sln" file and build from there. For building under Linux/OSX, +there are makefiles in each directory that build the examples individually. + +Almost all of them benchmark ispc implementations of the given computation +against regular serial C++ implementations, printing out a comparison of +the runtimes and the speedup delivered by ispc. It may be instructive to +do a side-by-side diff of the C++ and ispc implementations of these +algorithms to learn more about writing ispc code. + +AOBench +======= + +This is an ISPC implementation of the "AO bench" benchmark +(http://syoyo.wordpress.com/2009/01/26/ao-bench-is-evolving/). The command +line arguments are: + +ao (num iterations) (x res) (yres) + +It executes the program for the given number of iterations, rendering an +(xres x yres) image each time and measuring the computation time with both +serial and ispc implementations. + +AOBench_Instrumented +==================== + +This version of AO Bench is compiled with the --instrument ispc compiler +flag. This causes the compiler to emit calls to a (user-supplied) +ISPCInstrument() function at interesting places in the compiled code. An +example implementation of this function that counts the number of times the +callback is made and records some statistics about control flow coherence +is provided in the instrument.cpp file. 
+ +*** Note: on Linux, this example currently hits an assertion in LLVM during +*** compilation + +Mandelbrot +========== + +Mandelbrot set generation. This example is extensively documented at the +http://ispc.github.com/example.html page. + +Mandelbrot_tasks +================ + +Implementation of Mandelbrot set generation that also parallelizes across +cores using tasks. Under Windows, a simple task system built on +Microsoft's Concurrency Runtime is used (see tasks_concrt.cpp). On OSX, a +task system based on Grand Central Dispatch is used (tasks_gcd.cpp), and on +Linux, a pthreads-based task system is used (tasks_pthreads.cpp). When +using tasks with ispc, no task system is mandated; the user is free to plug +in any task system they want, for ease of interoperating with existing task +systems. + +Options +======= + +This program implements both the Black-Scholes and Binomial options pricing +models in both ispc and regular serial C++ code. + +RT +== + +This is a simple ray tracer; it reads in camera parameters and a bounding +volume hierarchy and renders the scene from the given viewpoint. The +command line arguments are: + +rt <scene name> + +Where <scene name> is one of "cornell", "teapot", or "sponza". + +The implementation originally derives from the bounding volume hierarchy +and triangle intersection code from pbrt; see the pbrt source code and/or +"Physically Based Rendering" book for more about the basic algorithmic +details. + +Simple +====== + +This is a simple "hello world" type program that shows a ~10 line +application program calling out to a ~5 line ispc program to do a simple +computation. 
diff --git a/examples/aobench/Makefile b/examples/aobench/Makefile new file mode 100644 index 00000000..8674f7bb --- /dev/null +++ b/examples/aobench/Makefile @@ -0,0 +1,26 @@ + +CXX=g++ +CXXFLAGS=-Iobjs/ -O3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 --fast-math + +default: ao + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ ao + +ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread + +objs/%.o: %.cpp + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/ao.o: objs/ao_ispc.h + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp new file mode 100644 index 00000000..1a2eefe5 --- /dev/null +++ b/examples/aobench/ao.cpp @@ -0,0 +1,182 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include +#ifdef __linux__ +#include +#endif +#include +#include +#include +#include +#include + +#include "ao_ispc.h" +using namespace ispc; + +#include "../timing.h" + +#define NSUBSAMPLES 2 + +extern void ao_serial(int w, int h, int nsubsamples, float image[]); + +static unsigned int test_iterations; +static unsigned int width, height; +static unsigned char *img; +static float *fimg; + + +static unsigned char +clamp(float f) +{ + int i = (int)(f * 255.5); + + if (i < 0) i = 0; + if (i > 255) i = 255; + + return (unsigned char)i; +} + + +static void +savePPM(const char *fname, int w, int h) +{ + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + img[3 * (y * w + x) + 0] = clamp(fimg[3 *(y * w + x) + 0]); + img[3 * (y * w + x) + 1] = clamp(fimg[3 *(y * w + x) + 1]); + img[3 * (y * w + x) + 2] = clamp(fimg[3 *(y * w + x) + 2]); + } + } + + FILE *fp = fopen(fname, "wb"); + if (!fp) { + perror(fname); + exit(1); + } + + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", w, h); + fprintf(fp, "255\n"); + fwrite(img, w * h * 3, 1, fp); + fclose(fp); +} + + +// Allocate memory with 64-byte alignment. 
+ +float * +AllocAligned(int size) { +#if defined(_WIN32) || defined(_WIN64) + return (float *)_aligned_malloc(size, 64); +#elif defined (__APPLE__) + // Allocate excess memory to ensure an aligned pointer can be returned + void *mem = malloc(size + (64-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem += 64 - (reinterpret_cast<uintptr_t>(amem) & (64 - 1)); + ((void**)amem)[-1] = mem; + return (float *)amem; +#else + return (float *)memalign(64, size); +#endif +} + + +int main(int argc, char **argv) +{ + if (argc != 4) { + printf ("%s\n", argv[0]); + printf ("Usage: ao [num test iterations] [width] [height]\n"); + getchar(); + exit(-1); + } + else { + test_iterations = atoi(argv[1]); + width = atoi (argv[2]); + height = atoi (argv[3]); + } + + // Allocate space for output images + img = (unsigned char *)AllocAligned(width * height * 3); + fimg = (float *)AllocAligned(sizeof(float) * width * height * 3); + + // + // Run the ispc path, test_iterations times, and report the minimum + // time for any of them. + // + double minTimeISPC = 1e30; + for (unsigned int i = 0; i < test_iterations; i++) { + memset((void *)fimg, 0, sizeof(float) * width * height * 3); + assert(NSUBSAMPLES == 2); + + reset_and_start_timer(); + ao_ispc(width, height, NSUBSAMPLES, fimg); + double t = get_elapsed_mcycles(); + minTimeISPC = std::min(minTimeISPC, t); + } + + // Report results and save image + printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC, + width, height); + savePPM("ao-ispc.ppm", width, height); + + // + // Run the serial path, again test_iteration times, and report the + // minimum time. 
+ // + double minTimeSerial = 1e30; + for (unsigned int i = 0; i < test_iterations; i++) { + memset((void *)fimg, 0, sizeof(float) * width * height * 3); + reset_and_start_timer(); + ao_serial(width, height, NSUBSAMPLES, fimg); + double t = get_elapsed_mcycles(); + minTimeSerial = std::min(minTimeSerial, t); + } + + // Report more results, save another image... + printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, + width, height); + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC); + savePPM("ao-serial.ppm", width, height); + + return 0; +} diff --git a/examples/aobench/ao.ispc b/examples/aobench/ao.ispc new file mode 100644 index 00000000..192e0666 --- /dev/null +++ b/examples/aobench/ao.ispc @@ -0,0 +1,317 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench +*/ + +#define NAO_SAMPLES 8 +#define M_PI 3.1415926535f + +typedef float<3> vec; + +struct Isect { + float t; + vec p; + vec n; + int hit; +}; + +struct Sphere { + vec center; + float radius; + +}; + +struct Plane { + vec p; + vec n; +}; + +struct Ray { + vec org; + vec dir; +}; + +static inline float dot(vec a, vec b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +static inline vec vcross(vec v0, vec v1) { + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; +} + +static inline void vnormalize(reference vec v) { + float len2 = dot(v, v); + float invlen = rsqrt(len2); + v *= invlen; +} + + +static inline void +ray_plane_intersect(reference Isect isect, reference Ray ray, + reference Plane plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); + + cif (abs(v) < 1.0e-17) + return; + else { + float t = -(dot(ray.org, plane.n) + d) / v; + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } + } +} + + +static inline void +ray_sphere_intersect(reference Isect isect, reference Ray ray, + reference Sphere sphere) { + vec rs = ray.org - sphere.center; + + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; + + cif (D > 0.) 
{ + float t = -B - sqrt(D); + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + t * ray.dir; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } + } +} + + +static inline void +orthoBasis(reference vec basis[3], vec n) { + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + + if ((n.x < 0.6) && (n.x > -0.6)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6) && (n.y > -0.6)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6) && (n.z > -0.6)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } + + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); + + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); +} + + +static inline float +ambient_occlusion(reference Isect isect, reference Plane plane, + reference Sphere spheres[3], reference RNGState rngstate) { + float eps = 0.0001f; + vec p, n; + vec basis[3]; + float occlusion = 0.0; + + p = isect.p + eps * isect.n; + + orthoBasis(basis, isect.n); + + static const uniform int ntheta = NAO_SAMPLES; + static const uniform int nphi = NAO_SAMPLES; + for (uniform int j = 0; j < ntheta; j++) { + for (uniform int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; + + float theta = sqrt(frandom(rngstate)); + float phi = 2.0f * M_PI * frandom(rngstate); + float x = cos(phi) * theta; + float y = sin(phi) * theta; + float z = sqrt(1.0 - theta * theta); + + // local . 
global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; + + occIsect.t = 1.0e+17; + occIsect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); + + if (occIsect.hit) occlusion += 1.0; + } + } + + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; +} + + +/* Compute the image for the scanlines from [y0,y1), for an overall image + of width w and height h. + */ +void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, + uniform int nsubsamples, reference uniform float image[]) { + static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; + static Sphere spheres[3] = { + { { -2.0f, 0.0f, -3.5f }, 0.5f }, + { { -0.5f, 0.0f, -3.0f }, 0.5f }, + { { 1.0f, 0.0f, -2.2f }, 0.5f } }; + RNGState rngstate; + + seed_rng(rngstate, y0); + + // Compute the mapping between the 'programCount'-wide program + // instances running in parallel and samples in the image. + // + // For now, we'll always take four samples per pixel, so start by + // initializing du and dv with offsets into subpixel samples. We'll + // take care of further updating du and dv for the case where we're + // doing more than 4 program instances in parallel shortly. + uniform float uSteps[4] = { 0, 1, 0, 1 }; + uniform float vSteps[4] = { 0, 0, 1, 1 }; + float du = uSteps[programIndex % 4] / nsubsamples; + float dv = vSteps[programIndex % 4] / nsubsamples; + + // Now handle the case where we are able to do more than one pixel's + // worth of work at once. nx records the number of pixels in the x + // direction we do per iteration and ny the number in y. 
+ uniform int nx = 1, ny = 1; + + if (programCount == 8) { + // Do two pixels at once in the x direction + nx = 2; + if (programIndex >= 4) + // And shift the offsets for the second pixel's worth of work + ++du; + } + else if (programCount == 16) { + // Two at once in both x and y + nx = ny = 2; + if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12) + ++du; + if (programIndex >= 8) + ++dv; + } + + // Now loop over all of the pixels, stepping in x and y as calculated + // above. (Assumes that ny divides y and nx divides x...) + for (uniform int y = y0; y < y1; y += ny) { + for (uniform int x = 0; x < w; x += nx) { + // Figur out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org = 0.f; + + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + cif (isect.hit) + ret = ambient_occlusion(isect, plane, spheres, rngstate); + + // This is a little grungy; we have results for + // programCount-worth of values. Because we're doing 2x2 + // subsamples, we need to peel them off in groups of four, + // average the four values for each pixel, and update the + // output image. + // + // Store the varying value to a uniform array of the same size. + // See the discussion about communication among program + // instances in the ispc user's manual for more discussion on + // this idiom. 
+ uniform float retArray[programCount]; + retArray[programIndex] = ret; + + // offset to the first pixel in the image + uniform int offset = 3 * (y * w + x); + for (uniform int p = 0; p < programCount; p += 4, ++offset) { + // Get the four sample values for this pixel + uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] + + retArray[p+3]; + + // Normalize by number of samples taken + sumret /= nsubsamples * nsubsamples; + + // Store result in the image + image[offset+0] = sumret; + image[offset+1] = sumret; + image[offset+2] = sumret; + } + } + } +} + + +export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, + uniform float image[]) { + ao_scanlines(0, h, w, h, nsubsamples, image); +} diff --git a/examples/aobench/ao_serial.cpp b/examples/aobench/ao_serial.cpp new file mode 100644 index 00000000..0b3e2b6d --- /dev/null +++ b/examples/aobench/ao_serial.cpp @@ -0,0 +1,314 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include + +#ifdef _MSC_VER +static long long drand48_x = 0x1234ABCD330E; + +static inline void srand48(int x) { + drand48_x = x ^ (x << 16); +} + +static inline double drand48() { + drand48_x = drand48_x * 0x5DEECE66D + 0xB; + return (drand48_x & 0xFFFFFFFFFFFF) * (1.0 / 281474976710656.0); +} +#endif // _MSC_VER + +#ifdef _MSC_VER +__declspec(align(16)) +#endif +struct vec { + vec() { x=y=z=pad=0.; } + vec(float xx, float yy, float zz) { x = xx; y = yy; z = zz; } + + vec operator*(float f) const { return vec(x*f, y*f, z*f); } + vec operator+(const vec &f2) const { + return vec(x+f2.x, y+f2.y, z+f2.z); + } + vec operator-(const vec &f2) const { + return vec(x-f2.x, y-f2.y, z-f2.z); + } + vec operator*(const vec &f2) const { + return vec(x*f2.x, y*f2.y, z*f2.z); + } + float x, y, z; + float pad; +} +#ifndef _MSC_VER +__attribute__ ((aligned(16))) +#endif +; +inline vec operator*(float f, const vec &v) { return vec(f*v.x, f*v.y, f*v.z); } + + +#define NAO_SAMPLES 8 + +#ifdef M_PI +#undef M_PI +#endif +#define M_PI 3.1415926535f + +struct Isect { + float t; + vec p; + vec n; + int hit; +}; + +struct Sphere { + vec center; + float radius; + +}; + +struct Plane { + vec p; + vec n; +}; + +struct Ray { + vec org; + vec 
dir; +}; + +static inline float dot(const vec &a, const vec &b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +static inline vec vcross(const vec &v0, const vec &v1) { + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; +} + +static inline void vnormalize(vec &v) { + float len2 = dot(v, v); + float invlen = 1.f / sqrtf(len2); + v = v * invlen; +} + + +static inline void +ray_plane_intersect(Isect &isect, Ray &ray, + Plane &plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); + + if (fabsf(v) < 1.0e-17) + return; + else { + float t = -(dot(ray.org, plane.n) + d) / v; + + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } + } +} + + +static inline void +ray_sphere_intersect(Isect &isect, Ray &ray, + Sphere &sphere) { + vec rs = ray.org - sphere.center; + + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; + + if (D > 0.) 
{ + float t = -B - sqrtf(D); + + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + t * ray.dir; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } + } +} + + +static inline void +orthoBasis(vec basis[3], const vec &n) { + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + + if ((n.x < 0.6) && (n.x > -0.6)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6) && (n.y > -0.6)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6) && (n.z > -0.6)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } + + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); + + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); +} + + +static float +ambient_occlusion(Isect &isect, Plane &plane, + Sphere spheres[3]) { + float eps = 0.0001f; + vec p, n; + vec basis[3]; + float occlusion = 0.0; + + p = isect.p + eps * isect.n; + + orthoBasis(basis, isect.n); + + static const int ntheta = NAO_SAMPLES; + static const int nphi = NAO_SAMPLES; + for (int j = 0; j < ntheta; j++) { + for (int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; + + float theta = sqrtf(drand48()); + float phi = 2.0f * M_PI * drand48(); + float x = cosf(phi) * theta; + float y = sinf(phi) * theta; + float z = sqrtf(1.0 - theta * theta); + + // local . 
global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; + + occIsect.t = 1.0e+17; + occIsect.hit = 0; + + for (int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); + + if (occIsect.hit) occlusion += 1.0; + } + } + + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; +} + + +/* Compute the image for the scanlines from [y0,y1), for an overall image + of width w and height h. + */ +static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples, + float image[]) { + static Plane plane = { vec(0.0f, -0.5f, 0.0f), vec(0.f, 1.f, 0.f) }; + static Sphere spheres[3] = { + { vec(-2.0f, 0.0f, -3.5f), 0.5f }, + { vec(-0.5f, 0.0f, -3.0f), 0.5f }, + { vec(1.0f, 0.0f, -2.2f), 0.5f } }; + + srand48(y0); + + for (int y = y0; y < y1; ++y) { + for (int x = 0; x < w; ++x) { + int offset = 3 * (y * w + x); + for (int u = 0; u < nsubsamples; ++u) { + for (int v = 0; v < nsubsamples; ++v) { + float px = (x + (u / (float)nsubsamples) - (w / 2.0f)) / (w / 2.0f); + float py = -(y + (v / (float)nsubsamples) - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org = vec(0.f, 0.f, 0.f); + + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for (int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + if (isect.hit) + ret = ambient_occlusion(isect, plane, spheres); + + // Update image for AO for this ray + image[offset+0] += ret; + image[offset+1] += ret; + image[offset+2] += ret; + } + } + // Normalize image pixels by number of samples taken per pixel + image[offset+0] /= nsubsamples * nsubsamples; + 
image[offset+1] /= nsubsamples * nsubsamples; + image[offset+2] /= nsubsamples * nsubsamples; + } + } +} + + +void ao_serial(int w, int h, int nsubsamples, + float image[]) { + ao_scanlines(0, h, w, h, nsubsamples, image); +} diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj new file mode 100755 index 00000000..3be6bdb3 --- /dev/null +++ b/examples/aobench/aobench.vcxproj @@ -0,0 +1,161 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + + Document + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + + + + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} + Win32Proj + aobench + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/examples/aobench_instrumented/Makefile b/examples/aobench_instrumented/Makefile new file mode 100644 index 00000000..296a5882 --- /dev/null +++ b/examples/aobench_instrumented/Makefile @@ -0,0 +1,26 @@ 
+ +CXX=g++ +CXXFLAGS=-Iobjs/ -g3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 --fast-math --instrument + +default: ao + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ ao + +ao: dirs objs/ao.o objs/instrument.o objs/ao_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread + +objs/%.o: %.cpp + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/ao.o: objs/ao_ispc.h + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/aobench_instrumented/ao.cpp b/examples/aobench_instrumented/ao.cpp new file mode 100644 index 00000000..742a0862 --- /dev/null +++ b/examples/aobench_instrumented/ao.cpp @@ -0,0 +1,148 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include +#ifdef __linux__ +#include +#endif +#include +#include +#include +#include +#include + +#include "ao_ispc.h" +using namespace ispc; + +#include "instrument.h" +#include "../timing.h" + +#define NSUBSAMPLES 2 + +static unsigned int test_iterations; +static unsigned int width, height; +static unsigned char *img; +static float *fimg; + + +static unsigned char +clamp(float f) +{ + int i = (int)(f * 255.5); + + if (i < 0) i = 0; + if (i > 255) i = 255; + + return (unsigned char)i; +} + + +static void +savePPM(const char *fname, int w, int h) +{ + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + img[3 * (y * w + x) + 0] = clamp(fimg[3 *(y * w + x) + 0]); + img[3 * (y * w + x) + 1] = clamp(fimg[3 *(y * w + x) + 1]); + img[3 * (y * w + x) + 2] = clamp(fimg[3 *(y * w + x) + 2]); + } + } + + FILE *fp = fopen(fname, "wb"); + if (!fp) { + perror(fname); + exit(1); + } + + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", w, h); + fprintf(fp, "255\n"); + fwrite(img, w * h * 3, 1, fp); + fclose(fp); +} + + +// Allocate memory with 64-byte alignment. 
+float * +AllocAligned(int size) { +#if defined(_WIN32) || defined(_WIN64) + return (float *)_aligned_malloc(size, 64); +#elif defined (__APPLE__) + // Allocate excess memory to ensure an aligned pointer can be returned + void *mem = malloc(size + (64-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem += 64 - (reinterpret_cast(amem) & (64 - 1)); + ((void**)amem)[-1] = mem; + return (float *)amem; +#else + return (float *)memalign(64, size); +#endif +} + + +int main(int argc, char **argv) +{ + if (argc != 4) { + printf ("%s\n", argv[0]); + printf ("Usage: ao [num test iterations] [width] [height]\n"); + getchar(); + exit(-1); + } + else { + test_iterations = atoi(argv[1]); + width = atoi (argv[2]); + height = atoi (argv[3]); + } + + // Allocate space for output images + img = (unsigned char *)AllocAligned(width * height * 3); + fimg = (float *)AllocAligned(sizeof(float) * width * height * 3); + + ao_ispc(width, height, NSUBSAMPLES, fimg); + + savePPM("ao-ispc.ppm", width, height); + + ISPCPrintInstrument(); + + return 0; +} diff --git a/examples/aobench_instrumented/ao.ispc b/examples/aobench_instrumented/ao.ispc new file mode 100644 index 00000000..192e0666 --- /dev/null +++ b/examples/aobench_instrumented/ao.ispc @@ -0,0 +1,317 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench +*/ + +#define NAO_SAMPLES 8 +#define M_PI 3.1415926535f + +typedef float<3> vec; + +struct Isect { + float t; + vec p; + vec n; + int hit; +}; + +struct Sphere { + vec center; + float radius; + +}; + +struct Plane { + vec p; + vec n; +}; + +struct Ray { + vec org; + vec dir; +}; + +static inline float dot(vec a, vec b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +static inline vec vcross(vec v0, vec v1) { + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; +} + +static inline void vnormalize(reference vec v) { + float len2 = dot(v, v); + float invlen = rsqrt(len2); + v *= invlen; +} + + +static inline void +ray_plane_intersect(reference Isect isect, reference Ray ray, + reference Plane plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); + + cif (abs(v) < 1.0e-17) + return; + else { + float t = -(dot(ray.org, 
plane.n) + d) / v; + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } + } +} + + +static inline void +ray_sphere_intersect(reference Isect isect, reference Ray ray, + reference Sphere sphere) { + vec rs = ray.org - sphere.center; + + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; + + cif (D > 0.) { + float t = -B - sqrt(D); + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + t * ray.dir; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } + } +} + + +static inline void +orthoBasis(reference vec basis[3], vec n) { + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + + if ((n.x < 0.6) && (n.x > -0.6)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6) && (n.y > -0.6)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6) && (n.z > -0.6)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } + + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); + + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); +} + + +static inline float +ambient_occlusion(reference Isect isect, reference Plane plane, + reference Sphere spheres[3], reference RNGState rngstate) { + float eps = 0.0001f; + vec p, n; + vec basis[3]; + float occlusion = 0.0; + + p = isect.p + eps * isect.n; + + orthoBasis(basis, isect.n); + + static const uniform int ntheta = NAO_SAMPLES; + static const uniform int nphi = NAO_SAMPLES; + for (uniform int j = 0; j < ntheta; j++) { + for (uniform int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; + + float theta = sqrt(frandom(rngstate)); + float phi = 2.0f * M_PI * frandom(rngstate); + float x = cos(phi) * theta; + float y = sin(phi) * theta; + float z = sqrt(1.0 - theta * theta); + + // local . 
global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; + + occIsect.t = 1.0e+17; + occIsect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); + + if (occIsect.hit) occlusion += 1.0; + } + } + + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; +} + + +/* Compute the image for the scanlines from [y0,y1), for an overall image + of width w and height h. + */ +void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, + uniform int nsubsamples, reference uniform float image[]) { + static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; + static Sphere spheres[3] = { + { { -2.0f, 0.0f, -3.5f }, 0.5f }, + { { -0.5f, 0.0f, -3.0f }, 0.5f }, + { { 1.0f, 0.0f, -2.2f }, 0.5f } }; + RNGState rngstate; + + seed_rng(rngstate, y0); + + // Compute the mapping between the 'programCount'-wide program + // instances running in parallel and samples in the image. + // + // For now, we'll always take four samples per pixel, so start by + // initializing du and dv with offsets into subpixel samples. We'll + // take care of further updating du and dv for the case where we're + // doing more than 4 program instances in parallel shortly. + uniform float uSteps[4] = { 0, 1, 0, 1 }; + uniform float vSteps[4] = { 0, 0, 1, 1 }; + float du = uSteps[programIndex % 4] / nsubsamples; + float dv = vSteps[programIndex % 4] / nsubsamples; + + // Now handle the case where we are able to do more than one pixel's + // worth of work at once. nx records the number of pixels in the x + // direction we do per iteration and ny the number in y. 
+ uniform int nx = 1, ny = 1; + + if (programCount == 8) { + // Do two pixels at once in the x direction + nx = 2; + if (programIndex >= 4) + // And shift the offsets for the second pixel's worth of work + ++du; + } + else if (programCount == 16) { + // Two at once in both x and y + nx = ny = 2; + if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12) + ++du; + if (programIndex >= 8) + ++dv; + } + + // Now loop over all of the pixels, stepping in x and y as calculated + // above. (Assumes that ny divides y and nx divides x...) + for (uniform int y = y0; y < y1; y += ny) { + for (uniform int x = 0; x < w; x += nx) { + // Figur out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org = 0.f; + + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + cif (isect.hit) + ret = ambient_occlusion(isect, plane, spheres, rngstate); + + // This is a little grungy; we have results for + // programCount-worth of values. Because we're doing 2x2 + // subsamples, we need to peel them off in groups of four, + // average the four values for each pixel, and update the + // output image. + // + // Store the varying value to a uniform array of the same size. + // See the discussion about communication among program + // instances in the ispc user's manual for more discussion on + // this idiom. 
+ uniform float retArray[programCount]; + retArray[programIndex] = ret; + + // offset to the first pixel in the image + uniform int offset = 3 * (y * w + x); + for (uniform int p = 0; p < programCount; p += 4, ++offset) { + // Get the four sample values for this pixel + uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] + + retArray[p+3]; + + // Normalize by number of samples taken + sumret /= nsubsamples * nsubsamples; + + // Store result in the image + image[offset+0] = sumret; + image[offset+1] = sumret; + image[offset+2] = sumret; + } + } + } +} + + +export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, + uniform float image[]) { + ao_scanlines(0, h, w, h, nsubsamples, image); +} diff --git a/examples/aobench_instrumented/aobench_instrumented.vcxproj b/examples/aobench_instrumented/aobench_instrumented.vcxproj new file mode 100755 index 00000000..94c8926a --- /dev/null +++ b/examples/aobench_instrumented/aobench_instrumented.vcxproj @@ -0,0 +1,161 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + + Document + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --instrument + + %(Filename).obj + %(Filename).obj + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --instrument + + %(Filename).obj + %(Filename).obj + + + + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958} + Win32Proj + aobench_instrumented + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + 
WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + + diff --git a/examples/aobench_instrumented/instrument.cpp b/examples/aobench_instrumented/instrument.cpp new file mode 100644 index 00000000..d72210d6 --- /dev/null +++ b/examples/aobench_instrumented/instrument.cpp @@ -0,0 +1,94 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "instrument.h" +#include +#include +#include +#include + +struct CallInfo { + CallInfo() { count = laneCount = allOff = 0; } + int count; + int laneCount; + int allOff; +}; + +static std::map callInfo; + +int countbits(int i) { + int ret = 0; + while (i) { + if (i & 0x1) + ++ret; + i >>= 1; + } + return ret; +} + + +// Callback function that ispc compiler emits calls to when --instrument +// command-line flag is given while compiling. +void +ISPCInstrument(const char *fn, const char *note, int line, int mask) { + char sline[16]; + sprintf(sline, "%04d", line); + std::string s = std::string(fn) + std::string("(") + std::string(sline) + + std::string(") - ") + std::string(note); + + // Find or create a CallInfo instance for this callsite. + CallInfo &ci = callInfo[s]; + + // And update its statistics... + ++ci.count; + if (mask == 0) + ++ci.allOff; + ci.laneCount += countbits(mask); +} + + +void +ISPCPrintInstrument() { + // When program execution is done, go through the stats and print them + // out. (This function is called by ao.cpp). 
+ std::map::iterator citer = callInfo.begin(); + while (citer != callInfo.end()) { + CallInfo &ci = citer->second; + float activePct = 100.f * ci.laneCount / (4.f * ci.count); + float allOffPct = 100.f * ci.allOff / ci.count; + printf("%s: %d calls (%d / %.2f%% all off!), %.2f%% active lanes\n", + citer->first.c_str(), ci.count, ci.allOff, allOffPct, + activePct); + ++citer; + } +} diff --git a/examples/aobench_instrumented/instrument.h b/examples/aobench_instrumented/instrument.h new file mode 100644 index 00000000..a21730b1 --- /dev/null +++ b/examples/aobench_instrumented/instrument.h @@ -0,0 +1,45 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef INSTRUMENT_H +#define INSTRUMENT_H 1 + +#include + +extern "C" { + void ISPCInstrument(const char *fn, const char *note, int line, int mask); +} + +void ISPCPrintInstrument(); + +#endif // INSTRUMENT_H diff --git a/examples/examples.sln b/examples/examples.sln new file mode 100755 index 00000000..5e8de17a --- /dev/null +++ b/examples/examples.sln @@ -0,0 +1,86 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simple", "simple\simple.vcxproj", "{947C5311-8B78-4D05-BEE4-BCF342D4B367}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rt", "rt\rt.vcxproj", "{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench", "aobench\aobench.vcxproj", "{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot", "mandelbrot\mandelbrot.vcxproj", "{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "options", "options\options.vcxproj", "{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot_tasks", "mandelbrot_tasks\mandelbrot_tasks.vcxproj", "{E80DA7D4-AB22-4648-A068-327307156BE6}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench_instrumented", 
"aobench_instrumented\aobench_instrumented.vcxproj", "{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|Win32.ActiveCfg = Debug|Win32 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|Win32.Build.0 = Debug|Win32 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|x64.ActiveCfg = Debug|x64 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|x64.Build.0 = Debug|x64 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|Win32.ActiveCfg = Release|Win32 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|Win32.Build.0 = Release|Win32 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|x64.ActiveCfg = Release|x64 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|x64.Build.0 = Release|x64 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|Win32.ActiveCfg = Debug|Win32 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|Win32.Build.0 = Debug|Win32 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|x64.ActiveCfg = Debug|x64 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|x64.Build.0 = Debug|x64 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|Win32.ActiveCfg = Release|Win32 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|Win32.Build.0 = Release|Win32 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|x64.ActiveCfg = Release|x64 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|x64.Build.0 = Release|x64 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|Win32.ActiveCfg = Debug|Win32 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|Win32.Build.0 = Debug|Win32 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|x64.ActiveCfg = Debug|x64 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|x64.Build.0 = Debug|x64 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|Win32.ActiveCfg = Release|Win32 + 
{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|Win32.Build.0 = Release|Win32 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|x64.ActiveCfg = Release|x64 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|x64.Build.0 = Release|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|Win32.ActiveCfg = Debug|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|Win32.Build.0 = Debug|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|x64.ActiveCfg = Debug|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|x64.Build.0 = Debug|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|Win32.ActiveCfg = Release|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|Win32.Build.0 = Release|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|x64.ActiveCfg = Release|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|x64.Build.0 = Release|x64 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|Win32.ActiveCfg = Debug|Win32 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|Win32.Build.0 = Debug|Win32 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|x64.ActiveCfg = Debug|x64 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|x64.Build.0 = Debug|x64 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|Win32.ActiveCfg = Release|Win32 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|Win32.Build.0 = Release|Win32 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|x64.ActiveCfg = Release|x64 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|x64.Build.0 = Release|x64 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|Win32.ActiveCfg = Debug|Win32 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|Win32.Build.0 = Debug|Win32 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|x64.ActiveCfg = Debug|x64 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|x64.Build.0 = Debug|x64 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Release|Win32.ActiveCfg = Release|Win32 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Release|Win32.Build.0 = Release|Win32 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Release|x64.ActiveCfg = Release|x64 + 
{E80DA7D4-AB22-4648-A068-327307156BE6}.Release|x64.Build.0 = Release|x64 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|Win32.ActiveCfg = Debug|Win32 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|Win32.Build.0 = Debug|Win32 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|x64.ActiveCfg = Debug|x64 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|x64.Build.0 = Debug|x64 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.ActiveCfg = Release|Win32 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.Build.0 = Release|Win32 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.ActiveCfg = Release|x64 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/examples/mandelbrot/Makefile b/examples/mandelbrot/Makefile new file mode 100644 index 00000000..dd369d0b --- /dev/null +++ b/examples/mandelbrot/Makefile @@ -0,0 +1,26 @@ + +CXX=g++ +CXXFLAGS=-Iobjs/ -O3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 --target=sse4x2 + +default: mandelbrot + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ mandelbrot + +mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o -lm + +objs/%.o: %.cpp + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/mandelbrot.o: objs/mandelbrot_ispc.h + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/mandelbrot/mandelbrot.cpp b/examples/mandelbrot/mandelbrot.cpp new file mode 100644 index 00000000..2105a335 --- /dev/null +++ b/examples/mandelbrot/mandelbrot.cpp @@ -0,0 +1,117 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include "../timing.h" +#include "mandelbrot_ispc.h" +using namespace ispc; + +extern void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]); + +/* Write a PPM image file with the image of the Mandelbrot set */ +static void +writePPM(int *buf, int width, int height, const char *fn) { + FILE *fp = fopen(fn, "wb"); + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", width, height); + fprintf(fp, "255\n"); + for (int i = 0; i < width*height; ++i) { + // Map the iteration count to colors by just alternating between + // two greys. + char c = (buf[i] & 0x1) ? 240 : 20; + for (int j = 0; j < 3; ++j) + fputc(c, fp); + } + fclose(fp); +} + + +int main() { + unsigned int width = 768; + unsigned int height = 512; + float x0 = -2; + float x1 = 1; + float y0 = -1; + float y1 = 1; + + int maxIterations = 256; + int *buf = new int[width*height]; + + // + // Compute the image using the ispc implementation; report the minimum + // time of three runs. + // + double minISPC = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minISPC = std::min(minISPC, dt); + } + + printf("[mandelbrot ispc]:\t\t[%.3f] million cycles\n", minISPC); + writePPM(buf, width, height, "mandelbrot-ispc.ppm"); + + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + + // + // And run the serial implementation 3 times, again reporting the + // minimum time. 
+ // + double minSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minSerial = std::min(minSerial, dt); + } + + printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + writePPM(buf, width, height, "mandelbrot-serial.ppm"); + + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); + + return 0; +} diff --git a/examples/mandelbrot/mandelbrot.ispc b/examples/mandelbrot/mandelbrot.ispc new file mode 100644 index 00000000..ecbb4fc1 --- /dev/null +++ b/examples/mandelbrot/mandelbrot.ispc @@ -0,0 +1,76 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static inline int mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + +export void mandelbrot_ispc(uniform float x0, uniform float y0, + uniform float x1, uniform float y1, + uniform int width, uniform int height, + uniform int maxIterations, + reference uniform int output[]) +{ + float dx = (x1 - x0) / width; + float dy = (y1 - y0) / height; + + for (uniform int j = 0; j < height; j++) { + // Note that we'll be doing programCount computations in parallel, + // so increment i by that much. This assumes that width evenly + // divides programCount. + for (uniform int i = 0; i < width; i += programCount) { + // Figure out the position on the complex plane to compute the + // number of iterations at. Note that the x values are + // different across different program instances, since its + // initializer incorporates the value of the programIndex + // variable. 
+ float x = x0 + (programIndex + i) * dx; + float y = y0 + j * dy; + + int index = j * width + i + programIndex; + output[index] = mandel(x, y, maxIterations); + } + } +} diff --git a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj new file mode 100755 index 00000000..db33453b --- /dev/null +++ b/examples/mandelbrot/mandelbrot.vcxproj @@ -0,0 +1,161 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1} + Win32Proj + mandelbrot + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + + + + + Document + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + + + + + + diff --git a/examples/mandelbrot/mandelbrot_serial.cpp b/examples/mandelbrot/mandelbrot_serial.cpp new file mode 100644 index 
00000000..4bea7baf --- /dev/null +++ b/examples/mandelbrot/mandelbrot_serial.cpp @@ -0,0 +1,68 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +static int mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.) 
+ break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + +void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]) +{ + float dx = (x1 - x0) / width; + float dy = (y1 - y0) / height; + + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; ++i) { + float x = x0 + i * dx; + float y = y0 + j * dy; + + int index = (j * width + i); + output[index] = mandel(x, y, maxIterations); + } + } +} + diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile new file mode 100644 index 00000000..182c2698 --- /dev/null +++ b/examples/mandelbrot_tasks/Makefile @@ -0,0 +1,38 @@ + +ARCH = $(shell uname) + +TASK_CXX=tasks_pthreads.cpp +TASK_LIB=-lpthread + +ifeq ($(ARCH), Darwin) + TASK_CXX=tasks_gcd.cpp + TASK_LIB= +endif + +TASK_OBJ=$(addprefix objs/, $(TASK_CXX:.cpp=.o)) + +CXX=g++ +CXXFLAGS=-Iobjs/ -O3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 --target=sse4x2 + +default: mandelbrot + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ mandelbrot + +mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o $(TASK_OBJ) + $(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o $(TASK_OBJ) -lm $(TASK_LIB) + +objs/%.o: %.cpp + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/mandelbrot.o: objs/mandelbrot_ispc.h + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/mandelbrot_tasks/mandelbrot.cpp b/examples/mandelbrot_tasks/mandelbrot.cpp new file mode 100644 index 00000000..50ad4cf8 --- /dev/null +++ b/examples/mandelbrot_tasks/mandelbrot.cpp @@ -0,0 +1,120 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include "../timing.h" +#include "mandelbrot_ispc.h" +using namespace ispc; + +extern void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]); + +/* Write a PPM image file with the image of the Mandelbrot set */ +static void +writePPM(int *buf, int width, int height, const char *fn) { + FILE *fp = fopen(fn, "wb"); + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", width, height); + fprintf(fp, "255\n"); + for (int i = 0; i < width*height; ++i) { + // Map the iteration count to colors by just alternating between + // two greys. + char c = (buf[i] & 0x1) ? 240 : 20; + for (int j = 0; j < 3; ++j) + fputc(c, fp); + } + fclose(fp); +} + + +int main() { + unsigned int width = 1536; + unsigned int height = 1024; + float x0 = -2; + float x1 = 1; + float y0 = -1; + float y1 = 1; + + extern void TasksInit(); + TasksInit(); + + int maxIterations = 512; + int *buf = new int[width*height]; + + // + // Compute the image using the ispc implementation; report the minimum + // time of three runs. + // + double minISPC = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minISPC = std::min(minISPC, dt); + } + + printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); + writePPM(buf, width, height, "mandelbrot-ispc.ppm"); + + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + + // + // And run the serial implementation 3 times, again reporting the + // minimum time. 
+ // + double minSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minSerial = std::min(minSerial, dt); + } + + printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + writePPM(buf, width, height, "mandelbrot-serial.ppm"); + + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); + + return 0; +} diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot.ispc new file mode 100644 index 00000000..df763e0a --- /dev/null +++ b/examples/mandelbrot_tasks/mandelbrot.ispc @@ -0,0 +1,86 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static inline int +mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + + +/* Task to compute the Mandelbrot iterations for a span of scanlines from + [ystart,yend). + */ +task void +mandelbrot_scanlines(uniform int ystart, uniform int yend, + uniform float x0, uniform float dx, + uniform float y0, uniform float dy, + uniform int width, uniform int maxIterations, + reference uniform int output[]) { + for (uniform int j = ystart; j < yend; ++j) { + for (uniform int i = 0; i < width; i += programCount) { + float x = x0 + (programIndex + i) * dx; + float y = y0 + j * dy; + + int index = j * width + i + programIndex; + output[index] = mandel(x, y, maxIterations); + } + } +} + + +export void +mandelbrot_ispc(uniform float x0, uniform float y0, + uniform float x1, uniform float y1, + uniform int width, uniform int height, + uniform int maxIterations, reference uniform int output[]) { + uniform float dx = (x1 - x0) / width; + uniform float dy = (y1 - y0) / height; + + /* Launch task to compute results for spans of 'span' scanlines. 
*/ + uniform int span = 2; + for (uniform int j = 0; j < height; j += span) + launch < mandelbrot_scanlines(j, j+span, x0, dx, y0, dy, width, + maxIterations, output) >; +} diff --git a/examples/mandelbrot_tasks/mandelbrot_serial.cpp b/examples/mandelbrot_tasks/mandelbrot_serial.cpp new file mode 100644 index 00000000..4bea7baf --- /dev/null +++ b/examples/mandelbrot_tasks/mandelbrot_serial.cpp @@ -0,0 +1,68 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + + +static int mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + +void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]) +{ + float dx = (x1 - x0) / width; + float dy = (y1 - y0) / height; + + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; ++i) { + float x = x0 + i * dx; + float y = y0 + j * dy; + + int index = (j * width + i); + output[index] = mandel(x, y, maxIterations); + } + } +} + diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj new file mode 100755 index 00000000..ba3687cb --- /dev/null +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -0,0 +1,162 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {E80DA7D4-AB22-4648-A068-327307156BE6} + Win32Proj + mandelbrot + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + + + + + + Document + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 + 
+ cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + + + + + + \ No newline at end of file diff --git a/examples/mandelbrot_tasks/tasks_concrt.cpp b/examples/mandelbrot_tasks/tasks_concrt.cpp new file mode 100644 index 00000000..a861ca87 --- /dev/null +++ b/examples/mandelbrot_tasks/tasks_concrt.cpp @@ -0,0 +1,115 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Simple task system implementation for ispc based on Microsoft's + Concurrency Runtime. */ + +#include +#include +using namespace Concurrency; +#include +#include + +// ispc expects these functions to have C linkage / not be mangled +extern "C" { + void ISPCLaunch(void *f, void *data); + void ISPCSync(); +} + +typedef void (*TaskFuncType)(void *, int, int); + +struct TaskInfo { + TaskFuncType ispcFunc; + void *ispcData; +}; + +// This is a simple implementation that just aborts if more than MAX_TASKS +// are launched. It could easily be extended to be more general... + +#define MAX_TASKS 4096 +static int taskOffset; +static TaskInfo taskInfo[MAX_TASKS]; +static event *events[MAX_TASKS]; +static CRITICAL_SECTION criticalSection; + +void +TasksInit() { + InitializeCriticalSection(&criticalSection); + for (int i = 0; i < MAX_TASKS; ++i) + events[i] = new event; +} + + +void __cdecl +lRunTask(LPVOID param) { + TaskInfo *ti = (TaskInfo *)param; + + // Actually run the task. + // FIXME: like the tasks_gcd.cpp implementation, this is passing bogus + // values for the threadIndex and threadCount builtins, which in turn + // will cause bugs in code that uses those. FWIW this example doesn't + // use them... 
+ int threadIndex = 0; + int threadCount = 1; + ti->ispcFunc(ti->ispcData, threadIndex, threadCount); + + // Signal the event that this task is done + int taskNum = ti - &taskInfo[0]; + events[taskNum]->set(); +} + + +void +ISPCLaunch(void *func, void *data) { + // Get a TaskInfo struct for this task + EnterCriticalSection(&criticalSection); + TaskInfo *ti = &taskInfo[taskOffset++]; + assert(taskOffset < MAX_TASKS); + LeaveCriticalSection(&criticalSection); + + // And pass it on to the Concurrency Runtime... + ti->ispcFunc = (TaskFuncType)func; + ti->ispcData = data; + CurrentScheduler::ScheduleTask(lRunTask, ti); +} + + +void ISPCSync() { + event::wait_for_multiple(&events[0], taskOffset, true, + COOPERATIVE_TIMEOUT_INFINITE); + + for (int i = 0; i < taskOffset; ++i) + events[i]->reset(); + + taskOffset = 0; +} diff --git a/examples/mandelbrot_tasks/tasks_gcd.cpp b/examples/mandelbrot_tasks/tasks_gcd.cpp new file mode 100644 index 00000000..b8b8e80f --- /dev/null +++ b/examples/mandelbrot_tasks/tasks_gcd.cpp @@ -0,0 +1,90 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* A simple task system for ispc programs based on Apple's Grand Central + Dispatch. */ + +#include + +static dispatch_queue_t gcdQueue; +static dispatch_group_t gcdGroup; + +// ispc expects these functions to have C linkage / not be mangled +extern "C" { + void ISPCLaunch(void *f, void *data); + void ISPCSync(); +} + +struct TaskInfo { + void *func; + void *data; +}; + + +void +TasksInit() { + gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); + gcdGroup = dispatch_group_create(); +} + + +static void +lRunTask(void *ti) { + typedef void (*TaskFuncType)(void *, int, int); + TaskInfo *taskInfo = (TaskInfo *)ti; + + TaskFuncType func = (TaskFuncType)(taskInfo->func); + + // FIXME: these are bogus values; may cause bugs in code that depends + // on them having unique values in different threads. + int threadIndex = 0; + int threadCount = 1; + // Actually run the task + func(taskInfo->data, threadIndex, threadCount); + + // FIXME: taskInfo leaks... 
+} + + +void ISPCLaunch(void *func, void *data) { + TaskInfo *ti = new TaskInfo; + ti->func = func; + ti->data = data; + dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask); +} + + +void ISPCSync() { + // Wait for all of the tasks in the group to complete before returning + dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER); +} diff --git a/examples/mandelbrot_tasks/tasks_pthreads.cpp b/examples/mandelbrot_tasks/tasks_pthreads.cpp new file mode 100644 index 00000000..4a23c5dc --- /dev/null +++ b/examples/mandelbrot_tasks/tasks_pthreads.cpp @@ -0,0 +1,285 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ispc expects these functions to have C linkage / not be mangled +extern "C" { + void ISPCLaunch(void *f, void *data); + void ISPCSync(); +} + + +static int nThreads; +static pthread_t *threads; +static pthread_mutex_t taskQueueMutex; +static std::vector > taskQueue; +static sem_t *workerSemaphore; +static uint32_t numUnfinishedTasks; +static pthread_mutex_t tasksRunningConditionMutex; +static pthread_cond_t tasksRunningCondition; + +static void *lTaskEntry(void *arg); + +/** Figure out how many CPU cores there are in the system + */ +static int +lNumCPUCores() { +#if defined(__linux__) + return sysconf(_SC_NPROCESSORS_ONLN); +#else + // Mac + int mib[2]; + mib[0] = CTL_HW; + size_t length = 2; + if (sysctlnametomib("hw.logicalcpu", mib, &length) == -1) { + fprintf(stderr, "sysctlnametomib() filed. Guessing 2 cores."); + return 2; + } + assert(length == 2); + + int nCores = 0; + size_t size = sizeof(nCores); + + if (sysctl(mib, 2, &nCores, &size, NULL, 0) == -1) { + fprintf(stderr, "sysctl() to find number of cores present failed. 
Guessing 2."); + return 2; + } + return nCores; +#endif +} + +void +TasksInit() { + nThreads = lNumCPUCores(); + + threads = new pthread_t[nThreads]; + + int err; + if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) { + fprintf(stderr, "Error creating mutex: %s\n", strerror(err)); + exit(1); + } + + char name[32]; + sprintf(name, "mandelbrot.%d", (int)getpid()); + workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0); + if (!workerSemaphore) { + fprintf(stderr, "Error creating semaphore: %s\n", strerror(err)); + exit(1); + } + + if ((err = pthread_cond_init(&tasksRunningCondition, NULL)) != 0) { + fprintf(stderr, "Error creating condition variable: %s\n", strerror(err)); + exit(1); + } + + if ((err = pthread_mutex_init(&tasksRunningConditionMutex, NULL)) != 0) { + fprintf(stderr, "Error creating mutex: %s\n", strerror(err)); + exit(1); + } + + for (int i = 0; i < nThreads; ++i) { + err = pthread_create(&threads[i], NULL, &lTaskEntry, reinterpret_cast(i)); + if (err != 0) { + fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err)); + exit(1); + } + } +} + + +void +ISPCLaunch(void *f, void *d) { + // + // Acquire mutex, add task + // + int err; + if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + taskQueue.push_back(std::make_pair(f, d)); + + if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); + exit(1); + } + + // + // Update count of number of tasks left to run + // + if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + ++numUnfinishedTasks; + + if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + // + // Post to the worker semaphore to wake up 
worker threads that are + // sleeping waiting for tasks to show up + // + if ((err = sem_post(workerSemaphore)) != 0) { + fprintf(stderr, "Error from sem_post: %s\n", strerror(err)); + exit(1); + } +} + + +static void * +lTaskEntry(void *arg) { + int threadIndex = int(reinterpret_cast(arg)); + int threadCount = nThreads; + + while (true) { + int err; + if ((err = sem_wait(workerSemaphore)) != 0) { + fprintf(stderr, "Error from sem_wait: %s\n", strerror(err)); + exit(1); + } + + std::pair myTask; + // + // Acquire mutex, get task + // + if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + if (taskQueue.size() == 0) { + // + // Task queue is empty, go back and wait on the semaphore + // + if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); + exit(1); + } + continue; + } + + myTask = taskQueue.back(); + taskQueue.pop_back(); + + if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); + exit(1); + } + + // + // Do work for _myTask_ + // + typedef void (*TaskFunType)(void *, int, int); + TaskFunType func = (TaskFunType)myTask.first; + func(myTask.second, threadIndex, threadCount); + + // + // Decrement the number of unfinished tasks counter + // + if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + int unfinished = --numUnfinishedTasks; + if (unfinished == 0) { + // + // Signal the "no more tasks are running" condition if all of + // them are done. 
+ // + int err; + if ((err = pthread_cond_signal(&tasksRunningCondition)) != 0) { + fprintf(stderr, "Error from pthread_cond_signal: %s\n", strerror(err)); + exit(1); + } + } + + if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + } + + pthread_exit(NULL); + return 0; +} + + +void ISPCSync() { + int err; + if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + // As long as there are tasks running, wait on the condition variable; + // doing so causes this thread to go to sleep until someone signals on + // the tasksRunningCondition condition variable. + while (numUnfinishedTasks > 0) { + if ((err = pthread_cond_wait(&tasksRunningCondition, + &tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_cond_wait: %s\n", strerror(err)); + exit(1); + } + } + + // We acquire ownership of the condition variable mutex when the above + // pthread_cond_wait returns. + // FIXME: is there a lurking issue here if numUnfinishedTasks gets back + // to zero by the time we get to ISPCSync() and thence we're trying to + // unlock a mutex we don't have a lock on? 
+ if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } +} diff --git a/examples/options/Makefile b/examples/options/Makefile new file mode 100644 index 00000000..46be29e2 --- /dev/null +++ b/examples/options/Makefile @@ -0,0 +1,26 @@ + +CXX=g++ +CXXFLAGS=-Iobjs/ -g -Wall +ISPC=ispc +ISPCFLAGS=-O2 --target=sse4x2 + +default: options + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ options + +options: dirs objs/options.o objs/options_serial.o objs/options_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/options.o objs/options_ispc.o objs/options_serial.o -lm + +objs/%.o: %.cpp + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/options.o: objs/options_ispc.h options_defs.h + +objs/%_ispc.h objs/%_ispc.o: %.ispc options_defs.h + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/options/options.cpp b/examples/options/options.cpp new file mode 100644 index 00000000..241b32be --- /dev/null +++ b/examples/options/options.cpp @@ -0,0 +1,151 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include +#ifndef __APPLE__ +#include +#endif // !__APPLE__ +using std::max; + +#include "options_defs.h" +#include "../timing.h" + +#include "options_ispc.h" +using namespace ispc; + +// Allocate memory with 64-byte alignment. +float *AllocFloats(int count) { + int size = count * sizeof(float); +#if defined(_WIN32) || defined(_WIN64) + return (float *)_aligned_malloc(size, 64); +#elif defined (__APPLE__) + // Allocate excess memory to ensure an aligned pointer can be returned + void *mem = malloc(size + (64-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem += 64 - (reinterpret_cast(amem) & (64 - 1)); + ((void**)amem)[-1] = mem; + return (float *)amem; +#else + return (float *)memalign(64, size); +#endif +} + +extern void black_scholes_serial(float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count); + +extern void binomial_put_serial(float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count); + +int main() { + // Pointers passed to ispc code must have alignment of the target's + // vector width at minimum. 
+ float *S = AllocFloats(N_OPTIONS); + float *X = AllocFloats(N_OPTIONS); + float *T = AllocFloats(N_OPTIONS); + float *r = AllocFloats(N_OPTIONS); + float *v = AllocFloats(N_OPTIONS); + float *result = AllocFloats(N_OPTIONS); + + for (int i = 0; i < N_OPTIONS; ++i) { + S[i] = 100; // stock price + X[i] = 98; // option strike price + T[i] = 2; // time (years) + r[i] = .02; // risk-free interest rate + v[i] = 5; // volatility + } + + // + // Binomial options pricing model, ispc implementation + // + reset_and_start_timer(); + binomial_put_ispc(S, X, T, r, v, result, N_OPTIONS); + double binomial_ispc = get_elapsed_mcycles(); + float sum = 0.f; + for (int i = 0; i < N_OPTIONS; ++i) + sum += result[i]; + printf("[binomial ispc]:\t\t[%.3f] million cycles (avg %f)\n", + binomial_ispc, sum / N_OPTIONS); + + // + // Binomial options, serial implementation + // + reset_and_start_timer(); + binomial_put_serial(S, X, T, r, v, result, N_OPTIONS); + double binomial_serial = get_elapsed_mcycles(); + sum = 0.f; + for (int i = 0; i < N_OPTIONS; ++i) + sum += result[i]; + printf("[binomial serial]:\t\t[%.3f] million cycles (avg %f)\n", + binomial_serial, sum / N_OPTIONS); + + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", binomial_serial / binomial_ispc); + + // + // Black-Scholes options pricing model, ispc implementation + // + sum = 0.f; + reset_and_start_timer(); + for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) { + black_scholes_ispc(S, X, T, r, v, result, N_OPTIONS); + for (int i = 0; i < N_OPTIONS; ++i) + sum += result[i]; + } + double bs_ispc = get_elapsed_mcycles(); + printf("[black-scholes ispc]:\t\t[%.3f] million cycles (avg %f)\n", + bs_ispc, sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS)); + + // + // Black-Scholes options pricing model, serial implementation + // + sum = 0.f; + reset_and_start_timer(); + for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) { + black_scholes_serial(S, X, T, r, v, result, N_OPTIONS); + for (int i = 0; i < N_OPTIONS; ++i) + sum += result[i]; + 
} + double bs_serial = get_elapsed_mcycles(); + printf("[black-scholes serial]:\t\t[%.3f] million cycles (avg %f)\n", bs_serial, + sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS)); + + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", bs_serial / bs_ispc); + + return 0; +} diff --git a/examples/options/options.ispc b/examples/options/options.ispc new file mode 100644 index 00000000..89e53634 --- /dev/null +++ b/examples/options/options.ispc @@ -0,0 +1,103 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "options_defs.h" + +// Cumulative normal distribution function +static inline float +CND(float X) { + float L = abs(X); + + float k = 1.0 / (1.0 + 0.2316419 * L); + float k2 = k*k; + float k3 = k2*k; + float k4 = k2*k2; + float k5 = k3*k2; + + const float invSqrt2Pi = 0.39894228040f; + float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 + + -1.821255978f * k4 + 1.330274429f * k5); + w *= invSqrt2Pi * exp(-L * L * .5f); + + if (X > 0.f) + w = 1.0 - w; + return w; +} + +export void +black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], + uniform float ra[], uniform float va[], + uniform float result[], uniform int count) { + for (uniform int i = 0; i < count; i += programCount) { + float S = Sa[i + programIndex], X = Xa[i + programIndex]; + float T = Ta[i + programIndex], r = ra[i + programIndex]; + float v = va[i + programIndex]; + + float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T)); + float d2 = d1 - v * sqrt(T); + + result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2); + } +} + + +export void +binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], + uniform float ra[], uniform float va[], + uniform float result[], uniform int count) { + float V[BINOMIAL_NUM]; + + for (uniform int i = 0; i < count; i += programCount) { + float S = Sa[i + programIndex], X = Xa[i + programIndex]; + float T = Ta[i + programIndex], r = ra[i + programIndex]; + float 
v = va[i + programIndex]; + + float dt = T / BINOMIAL_NUM; + float u = exp(v * sqrt(dt)); + float d = 1. / u; + float disc = exp(r * dt); + float Pu = (disc - d) / (u - d); + + for (uniform int j = 0; j < BINOMIAL_NUM; ++j) { + float upow = pow(u, (float)(2*j-BINOMIAL_NUM)); + V[j] = max(0., X - S * upow); + } + + for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j) + for (uniform int k = 0; k < j; ++k) + V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; + + result[i + programIndex] = V[0]; + } +} diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj new file mode 100755 index 00000000..5b8f709b --- /dev/null +++ b/examples/options/options.vcxproj @@ -0,0 +1,168 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE} + Win32Proj + options + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + 4305 + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + 4305 + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + 4305 + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + 4305 + + + Console + true + true + true + + + + + + + + + Document + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 
--target=sse4x2 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + + + + + + + + + diff --git a/examples/options/options_defs.h b/examples/options/options_defs.h new file mode 100644 index 00000000..54b8ec81 --- /dev/null +++ b/examples/options/options_defs.h @@ -0,0 +1,42 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef OPTIONS_DEFS_H +#define OPTIONS_DEFS_H 1 + +#define BINOMIAL_NUM 64 +#define N_OPTIONS 65536 +#define N_BLACK_SCHOLES_ROUNDS 20 + + +#endif // OPTIONS_DEFS_H diff --git a/examples/options/options_serial.cpp b/examples/options/options_serial.cpp new file mode 100644 index 00000000..a2689b73 --- /dev/null +++ b/examples/options/options_serial.cpp @@ -0,0 +1,114 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include "options_defs.h" +#include +#include + +// Cumulative normal distribution function +static inline float +CND(float X) { + float L = fabsf(X); + + float k = 1.0 / (1.0 + 0.2316419 * L); + float k2 = k*k; + float k3 = k2*k; + float k4 = k2*k2; + float k5 = k3*k2; + + const float invSqrt2Pi = 0.39894228040f; + float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 + + -1.821255978f * k4 + 1.330274429f * k5); + w *= invSqrt2Pi * expf(-L * L * .5f); + + if (X > 0.f) + w = 1.0 - w; + return w; +} + + +void +black_scholes_serial(float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count) { + for (int i = 0; i < count; ++i) { + float S = Sa[i], X = Xa[i]; + float T = Ta[i], r = ra[i]; + float v = va[i]; + + float d1 = (logf(S/X) + (r + v * v * .5f) * T) / (v * sqrtf(T)); + float d2 = d1 - v * sqrtf(T); + + result[i] = S * CND(d1) - X * expf(-r * T) * CND(d2); + } +} + + +void +binomial_put_serial(float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count) { + float V[BINOMIAL_NUM]; + + for (int i = 0; i < count; ++i) { + float S = Sa[i], X = Xa[i]; + float T = Ta[i], r = ra[i]; + float v = va[i]; + + float dt = T / BINOMIAL_NUM; + float u = expf(v * sqrtf(dt)); + float d = 1. 
/ u; + float disc = expf(r * dt); + float Pu = (disc - d) / (u - d); + + for (int j = 0; j < BINOMIAL_NUM; ++j) { + float upow = powf(u, (float)(2*j-BINOMIAL_NUM)); + V[j] = std::max(0.f, X - S * upow); + } + + for (int j = BINOMIAL_NUM-1; j >= 0; --j) + for (int k = 0; k < j; ++k) + V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; + + result[i] = V[0]; + } +} + + diff --git a/examples/rt/Makefile b/examples/rt/Makefile new file mode 100644 index 00000000..7df58868 --- /dev/null +++ b/examples/rt/Makefile @@ -0,0 +1,24 @@ + +CXX=g++ +CXXFLAGS=-Iobjs/ -O3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 --target=sse4x2 + +default: rt + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ rt + +rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o -lm + +objs/%.o: %.cpp objs/rt_ispc.h + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/rt/cornell.bvh b/examples/rt/cornell.bvh new file mode 100644 index 00000000..f7e0f3dd Binary files /dev/null and b/examples/rt/cornell.bvh differ diff --git a/examples/rt/cornell.camera b/examples/rt/cornell.camera new file mode 100644 index 00000000..0fec1642 Binary files /dev/null and b/examples/rt/cornell.camera differ diff --git a/examples/rt/rt.cpp b/examples/rt/rt.cpp new file mode 100644 index 00000000..e589bd94 --- /dev/null +++ b/examples/rt/rt.cpp @@ -0,0 +1,244 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include +#include +#ifndef __APPLE__ +#include +#endif +#include "../timing.h" +#include "rt_ispc.h" + +using namespace ispc; + +typedef unsigned int uint; + +template +T *AllocAligned(int count) { + int size = count * sizeof(T); +#if defined(_WIN32) || defined(_WIN64) + return (T *)_aligned_malloc(size, 64); +#elif defined (__APPLE__) + // Allocate excess memory to ensure an aligned pointer can be returned + void *mem = malloc(size + (64-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem += 64 - (reinterpret_cast(amem) & (64 - 1)); + ((void**)amem)[-1] = mem; + return (T *)amem; +#else + return (T *)memalign(64, size); +#endif +} + +extern void raytrace_serial(int width, int height, const float raster2camera[4][4], + const float camera2world[4][4], float image[], + int id[], const LinearBVHNode nodes[], + const Triangle triangles[]); + + +static void writeImage(int *idImage, float *depthImage, int width, int height, + const char *filename) { + FILE *f = fopen(filename, "wb"); + if (!f) { + perror(filename); + exit(1); + } + + fprintf(f, "P6\n%d %d\n255\n", width, height); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + // use the bits from the object id of the hit object to make a + // random color + int id = idImage[y * width + x]; + unsigned char r = 0, g = 0, b = 0; + + for (int i = 0; i < 8; ++i) { + // extract bit 3*i for red, 3*i+1 for green, 3*i+2 for blue + int rbit = (id & (1 << (3*i))) >> (3*i); + int gbit = (id & (1 << (3*i+1))) >> (3*i+1); + int bbit = (id & (1 << (3*i+2))) >> (3*i+2); + // and then set the bits of the colors starting from the + // high bits... 
+ r |= rbit << (7-i); + g |= gbit << (7-i); + b |= bbit << (7-i); + } + fputc(r, f); + fputc(g, f); + fputc(b, f); + } + } + fclose(f); +} + + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "usage: rt \n"); + exit(1); + } + +#define READ(var, n) \ + if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \ + fprintf(stderr, "Unexpected EOF reading scene file\n"); \ + return 1; \ + } else /* eat ; */ + + // + // Read the camera specification information from the camera file + // + char fnbuf[1024]; + sprintf(fnbuf, "%s.camera", argv[1]); + FILE *f = fopen(fnbuf, "rb"); + if (!f) { + perror(argv[1]); + return 1; + } + + // + // Nothing fancy, and trouble if we run on a big-endian system, just + // fread in the bits + // + int width, height; + float camera2world[4][4], raster2camera[4][4]; + READ(width, 1); + READ(height, 1); + READ(camera2world[0][0], 16); + READ(raster2camera[0][0], 16); + + // + // Read in the serialized BVH + // + sprintf(fnbuf, "%s.bvh", argv[1]); + f = fopen(fnbuf, "rb"); + if (!f) { + perror(argv[2]); + return 1; + } + + // The BVH file starts with an int that gives the total number of BVH + // nodes + uint nNodes; + READ(nNodes, 1); + + LinearBVHNode *nodes = AllocAligned(nNodes); + for (unsigned int i = 0; i < nNodes; ++i) { + // Each node is 6x floats for a boox, then an integer for an offset + // to the second child node, then an integer that encodes the type + // of node, the total number of int it if a leaf node, etc. 
+ float b[6]; + READ(b[0], 6); + nodes[i].bounds[0].v[0] = b[0]; + nodes[i].bounds[0].v[1] = b[1]; + nodes[i].bounds[0].v[2] = b[2]; + nodes[i].bounds[1].v[0] = b[3]; + nodes[i].bounds[1].v[1] = b[4]; + nodes[i].bounds[1].v[2] = b[5]; + READ(nodes[i].offset, 1); + READ(nodes[i].primsAxis, 1); + } + + // And then read the triangles + uint nTris; + READ(nTris, 1); + Triangle *triangles = AllocAligned(nTris); + for (uint i = 0; i < nTris; ++i) { + // 9x floats for the 3 vertices + float v[9]; + READ(v[0], 9); + float *vp = v; + for (int j = 0; j < 3; ++j) { + triangles[i].p[j].v[0] = *vp++; + triangles[i].p[j].v[1] = *vp++; + triangles[i].p[j].v[2] = *vp++; + } + // And create an object id + triangles[i].id = i+1; + } + fclose(f); + + // round image resolution up to multiple of 4 to makethings easy for + // the code that assigns pixels to ispc program instances + height = (height + 3) & ~3; + width = (width + 3) & ~3; + + // allocate images; one to hold hit object ids, one to hold depth to + // the first interseciton + int *id = new int[width*height]; + float *image = new float[width*height]; + + // + // Run 3 iterations with ispc, record the minimum time + // + double minTimeISPC = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + raytrace(width, height, raster2camera, camera2world, + image, id, nodes, triangles); + double dt = get_elapsed_mcycles(); + minTimeISPC = std::min(dt, minTimeISPC); + } + printf("[rt ispc]:\t\t\t[%.3f] million cycles for %d x %d image\n", minTimeISPC, width, height); + + writeImage(id, image, width, height, "rt-ispc.ppm"); + + // + // And 3 iterations with the serial implementation, reporting the + // minimum time. 
+ // + double minTimeSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + raytrace_serial(width, height, raster2camera, camera2world, + image, id, nodes, triangles); + double dt = get_elapsed_mcycles(); + minTimeSerial = std::min(dt, minTimeSerial); + } + printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n", + minTimeSerial, width, height); + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC); + + writeImage(id, image, width, height, "rt-serial.ppm"); + + return 0; +} diff --git a/examples/rt/rt.ispc b/examples/rt/rt.ispc new file mode 100644 index 00000000..08dabb0e --- /dev/null +++ b/examples/rt/rt.ispc @@ -0,0 +1,273 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define bool int + +typedef float<3> float3; + +struct Ray { + float3 origin, dir, invDir; + uniform unsigned int dirIsNeg[3]; + float mint, maxt; + int hitId; +}; + +struct Triangle { + uniform float3 p[3]; + uniform int id; +}; + +struct LinearBVHNode { + uniform float3 bounds[2]; + uniform unsigned int offset; // num primitives for leaf, second child for interior + uniform unsigned int primsAxis; // 0:7 nPrimitives, 8:15 split axis, 16:31 padding +}; + +static inline uniform int nPrims(const reference LinearBVHNode node) { + return (node.primsAxis & 0xff); +} + +static inline uniform int axis(const reference LinearBVHNode node) { + return ((node.primsAxis >> 8) & 0xff); +} + +static inline uniform bool isInterior(const reference LinearBVHNode node) { + return nPrims(node) == 0; +} + +static inline float3 Cross(const float3 v1, const float3 v2) { + float v1x = v1.x, v1y = v1.y, v1z = v1.z; + float v2x = v2.x, v2y = v2.y, v2z = v2.z; + float3 ret; + ret.x = (v1y * v2z) - (v1z * v2y); + ret.y = (v1z * v2x) - (v1x * v2z); + ret.z = (v1x * v2y) - (v1y * v2x); + return ret; +} + +static inline float Dot(const float3 a, const float3 b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + + +static void generateRay(uniform const float raster2camera[4][4], + uniform const float camera2world[4][4], + float x, float y, reference Ray ray) { + ray.mint = 0.f; + ray.maxt = 1e30f; + + ray.hitId = 0; + + // transform raster coordinate (x, y, 0) to camera 
space + float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]; + float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]; + float camz = raster2camera[2][3]; + float camw = raster2camera[3][3]; + camx /= camw; + camy /= camw; + camz /= camw; + + ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz; + ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz; + ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz; + + ray.origin.x = camera2world[0][3] / camera2world[3][3]; + ray.origin.y = camera2world[1][3] / camera2world[3][3]; + ray.origin.z = camera2world[2][3] / camera2world[3][3]; + + ray.invDir = 1.f / ray.dir; + + ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0; + ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0; + ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0; +} + + +static inline bool BBoxIntersect(const reference uniform float3 bounds[2], + const reference Ray ray) { + float t0 = ray.mint, t1 = ray.maxt; + + // Check all three axis-aligned slabs. 
Don't try to early out; it's + // not worth the trouble + float3 tNear = (bounds[0] - ray.origin) * ray.invDir; + float3 tFar = (bounds[1] - ray.origin) * ray.invDir; + if (tNear.x > tFar.x) { + float tmp = tNear.x; + tNear.x = tFar.x; + tFar.x = tmp; + } + t0 = max(tNear.x, t0); + t1 = min(tFar.x, t1); + + if (tNear.y > tFar.y) { + float tmp = tNear.y; + tNear.y = tFar.y; + tFar.y = tmp; + } + t0 = max(tNear.y, t0); + t1 = min(tFar.y, t1); + + if (tNear.z > tFar.z) { + float tmp = tNear.z; + tNear.z = tFar.z; + tFar.z = tmp; + } + t0 = max(tNear.z, t0); + t1 = min(tFar.z, t1); + + return (t0 <= t1); +} + + + +static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) { + uniform float3 e1 = tri.p[1] - tri.p[0]; + uniform float3 e2 = tri.p[2] - tri.p[0]; + + float3 s1 = Cross(ray.dir, e2); + float divisor = Dot(s1, e1); + bool hit = true; + + if (divisor == 0.) + hit = false; + float invDivisor = 1.f / divisor; + + // Compute first barycentric coordinate + float3 d = ray.origin - tri.p[0]; + float b1 = Dot(d, s1) * invDivisor; + if (b1 < 0. || b1 > 1.) + hit = false; + + // Compute second barycentric coordinate + float3 s2 = Cross(d, e1); + float b2 = Dot(ray.dir, s2) * invDivisor; + if (b2 < 0. || b1 + b2 > 1.) 
+ hit = false; + + // Compute _t_ to intersection point + float t = Dot(e2, s2) * invDivisor; + if (t < ray.mint || t > ray.maxt) + hit = false; + + if (hit) { + ray.maxt = t; + ray.hitId = tri.id; + } + return hit; +} + + +bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[], + reference Ray r) { + Ray ray = r; + bool hit = false; + // Follow ray through BVH nodes to find primitive intersections + uniform int todoOffset = 0, nodeNum = 0; + uniform int todo[64]; + + while (true) { + // Check ray against BVH node + LinearBVHNode node = nodes[nodeNum]; + if (any(BBoxIntersect(node.bounds, ray))) { + uniform unsigned int nPrimitives = nPrims(node); + if (nPrimitives > 0) { + // Intersect ray with primitives in leaf BVH node + uniform unsigned int primitivesOffset = node.offset; + for (uniform unsigned int i = 0; i < nPrimitives; ++i) { + if (TriIntersect(tris[primitivesOffset+i], ray)) + hit = true; + } + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + else { + // Put far BVH node on _todo_ stack, advance to near node + if (r.dirIsNeg[axis(node)]) { + todo[todoOffset++] = nodeNum + 1; + nodeNum = node.offset; + } + else { + todo[todoOffset++] = node.offset; + nodeNum = nodeNum + 1; + } + } + } + else { + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + } + r.maxt = ray.maxt; + r.hitId = ray.hitId; + + return hit; +} + + +export void raytrace(uniform int width, uniform int height, + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform float image[], uniform int id[], + const LinearBVHNode nodes[], + const Triangle triangles[]) { + static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3, + 0, 1, 0, 1, 2, 3, 2, 3 }; + static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1, + 2, 2, 3, 3, 2, 2, 3, 3 }; + + // The outer loops are always over blocks of 4x4 pixels + for (uniform int y = 0; y < height; y += 4) { + for (uniform int x = 0; x < width; x += 4) { + // Now we 
have a block of 4x4=16 pixels to process; it will + // take 16/programCount iterations of this loop to process + // them. + for (uniform int o = 0; o < 16 / programCount; ++o) { + // Map program instances to samples in the udx/udy arrays + // to figure out which pixel each program instance is + // responsible for + const float dx = udx[o * programCount + programIndex]; + const float dy = udy[o * programCount + programIndex]; + + Ray ray; + generateRay(raster2camera, camera2world, x+dx, y+dy, ray); + BVHIntersect(nodes, triangles, ray); + + int offset = (y + (int)dy) * width + (x + (int)dx); + image[offset] = ray.maxt; + id[offset] = ray.hitId; + } + } + } +} diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj new file mode 100755 index 00000000..4a893a8f --- /dev/null +++ b/examples/rt/rt.vcxproj @@ -0,0 +1,165 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9} + Win32Proj + rt + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Document + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 + + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h 
%(Filename)_ispc.h --arch=x86 + + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + + + + + + + + + + diff --git a/examples/rt/rt_serial.cpp b/examples/rt/rt_serial.cpp new file mode 100644 index 00000000..53f7d4cb --- /dev/null +++ b/examples/rt/rt_serial.cpp @@ -0,0 +1,288 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include + +// Just enough of a float3 class to do what we need in this file. +#ifdef _MSC_VER +__declspec(align(16)) +#endif +struct float3 { + float3() { } + float3(float xx, float yy, float zz) { x = xx; y = yy; z = zz; } + + float3 operator*(float f) const { return float3(x*f, y*f, z*f); } + float3 operator-(const float3 &f2) const { + return float3(x-f2.x, y-f2.y, z-f2.z); + } + float3 operator*(const float3 &f2) const { + return float3(x*f2.x, y*f2.y, z*f2.z); + } + float x, y, z; + float pad; // match padding/alignment of ispc version +} +#ifndef _MSC_VER +__attribute__ ((aligned(16))) +#endif +; + +struct Ray { + float3 origin, dir, invDir; + unsigned int dirIsNeg[3]; + float mint, maxt; + int hitId; +}; + + +// Declare these in a namespace so the mangling matches +namespace ispc { + struct Triangle { + float3 p[3]; + int id; + }; + + struct LinearBVHNode { + float3 bounds[2]; + unsigned int offset; // primitives for leaf, second child for interior + unsigned int primsAxis; // 0:7 nPrimitives, 8:15 split axis, 16:31 padding + }; +} + +using namespace ispc; + +inline int nPrims(const LinearBVHNode &node) { + return (node.primsAxis & 0xff); +} + +inline int axis(const LinearBVHNode &node) { + return ((node.primsAxis >> 8) & 0xff); +} + +inline bool isInterior(const LinearBVHNode &node) { + return nPrims(node) == 0; +} + +inline float3 Cross(const float3 &v1, const float3 &v2) { + float v1x = v1.x, v1y = v1.y, v1z = v1.z; + float v2x = v2.x, v2y = v2.y, v2z = v2.z; + float3 ret; + ret.x = (v1y * v2z) - (v1z * v2y); + ret.y = (v1z * v2x) - (v1x * v2z); + ret.z = (v1x * v2y) - (v1y * v2x); + return ret; +} + +inline float Dot(const float3 &a, const float3 &b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + + +static void generateRay(const float raster2camera[4][4], + const float camera2world[4][4], + float x, 
float y, Ray &ray) { + ray.mint = 0.f; + ray.maxt = 1e30f; + + ray.hitId = 0; + + // transform raster coordinate (x, y, 0) to camera space + float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]; + float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]; + float camz = raster2camera[2][3]; + float camw = raster2camera[3][3]; + camx /= camw; + camy /= camw; + camz /= camw; + + ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz; + ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz; + ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz; + + ray.origin.x = camera2world[0][3] / camera2world[3][3]; + ray.origin.y = camera2world[1][3] / camera2world[3][3]; + ray.origin.z = camera2world[2][3] / camera2world[3][3]; + + ray.invDir.x = 1.f / ray.dir.x; + ray.invDir.y = 1.f / ray.dir.y; + ray.invDir.z = 1.f / ray.dir.z; + + ray.dirIsNeg[0] = (ray.invDir.x < 0) ? 1 : 0; + ray.dirIsNeg[1] = (ray.invDir.y < 0) ? 1 : 0; + ray.dirIsNeg[2] = (ray.invDir.z < 0) ? 
1 : 0; +} + + +static inline bool BBoxIntersect(const float3 bounds[2], + const Ray &ray) { + float t0 = ray.mint, t1 = ray.maxt; + + float3 tNear = (bounds[0] - ray.origin) * ray.invDir; + float3 tFar = (bounds[1] - ray.origin) * ray.invDir; + if (tNear.x > tFar.x) { + float tmp = tNear.x; + tNear.x = tFar.x; + tFar.x = tmp; + } + t0 = std::max(tNear.x, t0); + t1 = std::min(tFar.x, t1); + + if (tNear.y > tFar.y) { + float tmp = tNear.y; + tNear.y = tFar.y; + tFar.y = tmp; + } + t0 = std::max(tNear.y, t0); + t1 = std::min(tFar.y, t1); + + if (tNear.z > tFar.z) { + float tmp = tNear.z; + tNear.z = tFar.z; + tFar.z = tmp; + } + t0 = std::max(tNear.z, t0); + t1 = std::min(tFar.z, t1); + + return (t0 <= t1); +} + + + +inline bool TriIntersect(const Triangle &tri, Ray &ray) { + float3 e1 = tri.p[1] - tri.p[0]; + float3 e2 = tri.p[2] - tri.p[0]; + + float3 s1 = Cross(ray.dir, e2); + float divisor = Dot(s1, e1); + + if (divisor == 0.) + return false; + float invDivisor = 1.f / divisor; + + // Compute first barycentric coordinate + float3 d = ray.origin - tri.p[0]; + float b1 = Dot(d, s1) * invDivisor; + if (b1 < 0. || b1 > 1.) + return false; + + // Compute second barycentric coordinate + float3 s2 = Cross(d, e1); + float b2 = Dot(ray.dir, s2) * invDivisor; + if (b2 < 0. || b1 + b2 > 1.) 
+ return false; + + // Compute _t_ to intersection point + float t = Dot(e2, s2) * invDivisor; + if (t < ray.mint || t > ray.maxt) + return false; + + ray.maxt = t; + ray.hitId = tri.id; + return true; +} + + +bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[], + Ray &r) { + Ray ray = r; + bool hit = false; + // Follow ray through BVH nodes to find primitive intersections + int todoOffset = 0, nodeNum = 0; + int todo[64]; + + while (true) { + // Check ray against BVH node + const LinearBVHNode &node = nodes[nodeNum]; + if (BBoxIntersect(node.bounds, ray)) { + unsigned int nPrimitives = nPrims(node); + if (nPrimitives > 0) { + // Intersect ray with primitives in leaf BVH node + unsigned int primitivesOffset = node.offset; + for (unsigned int i = 0; i < nPrimitives; ++i) { + if (TriIntersect(tris[primitivesOffset+i], ray)) + hit = true; + } + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + else { + // Put far BVH node on _todo_ stack, advance to near node + if (r.dirIsNeg[axis(node)]) { + todo[todoOffset++] = nodeNum + 1; + nodeNum = node.offset; + } + else { + todo[todoOffset++] = node.offset; + nodeNum = nodeNum + 1; + } + } + } + else { + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + } + r.maxt = ray.maxt; + r.hitId = ray.hitId; + + return hit; +} + + +void raytrace_serial(int width, int height, + const float raster2camera[4][4], + const float camera2world[4][4], + float image[], + int id[], + const LinearBVHNode nodes[], + const Triangle triangles[]) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + Ray ray; + generateRay(raster2camera, camera2world, x, y, ray); + BVHIntersect(nodes, triangles, ray); + + int offset = y * width + x; + image[offset] = ray.maxt; + id[offset] = ray.hitId; + } + } +} diff --git a/examples/rt/sponza.bvh b/examples/rt/sponza.bvh new file mode 100644 index 00000000..e59bde24 Binary files /dev/null and b/examples/rt/sponza.bvh differ diff --git 
a/examples/rt/sponza.camera b/examples/rt/sponza.camera new file mode 100644 index 00000000..7d44ec23 Binary files /dev/null and b/examples/rt/sponza.camera differ diff --git a/examples/rt/teapot.bvh b/examples/rt/teapot.bvh new file mode 100644 index 00000000..efcd7807 Binary files /dev/null and b/examples/rt/teapot.bvh differ diff --git a/examples/rt/teapot.camera b/examples/rt/teapot.camera new file mode 100644 index 00000000..9a98e3f6 Binary files /dev/null and b/examples/rt/teapot.camera differ diff --git a/examples/simple/Makefile b/examples/simple/Makefile new file mode 100644 index 00000000..b00c6737 --- /dev/null +++ b/examples/simple/Makefile @@ -0,0 +1,25 @@ + +CXX=g++ +CXXFLAGS=-Iobjs/ -O3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 + +default: simple + +.PHONY: dirs clean +.PRECIOUS: objs/simple.h + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ simple + +simple: dirs objs/simple.o objs/simple_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/simple.o objs/simple_ispc.o + +objs/simple.o: simple.cpp objs/simple_ispc.h + $(CXX) $(CXXFLAGS) -c -o $@ $< + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp new file mode 100644 index 00000000..3b5bf028 --- /dev/null +++ b/examples/simple/simple.cpp @@ -0,0 +1,63 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include + +// Include the header file that the ispc compiler generates +#include "simple_ispc.h" +using namespace ispc; + +int main() { + // Pointers passed to ispc-compiled code are currently required to have + // alignment equal to the target's native vector size. Here we align + // to 32 bytes to be safe for both SSE and AVX targets. 
+#ifdef _MSC_VER + __declspec(align(32)) float vin[16], vout[16]; +#else + float vin[16] __attribute__((aligned(32))); + float vout[16] __attribute__((aligned(32))); +#endif + + // Initialize input buffer + for (int i = 0; i < 16; ++i) + vin[i] = (float)i; + + // Call simple() function from simple.ispc file + simple(vin, vout, 16); + + // Print results + for (int i = 0; i < 16; ++i) + printf("%d: simple(%f) = %f\n", i, vin[i], vout[i]); + + return 0; +} diff --git a/examples/simple/simple.ispc b/examples/simple/simple.ispc new file mode 100644 index 00000000..a44c29e5 --- /dev/null +++ b/examples/simple/simple.ispc @@ -0,0 +1,53 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +export void simple(uniform float vin[], uniform float vout[], + uniform int count) { + // Compute the result for 'programCount' values in parallel + for (uniform int i = 0; i < count; i += programCount) { + int index = i + programIndex; + // Load the appropriate input value for this program instance. + float v = vin[index]; + + // Do an arbitrary little computation, but at least make the + // computation dependent on the value being processed + if (v < 3.) + v = v * v; + else + v = sqrt(v); + + // And write the result to the output array. 
+ vout[index] = v; + } +} diff --git a/examples/simple/simple.vcxproj b/examples/simple/simple.vcxproj new file mode 100755 index 00000000..9723ed02 --- /dev/null +++ b/examples/simple/simple.vcxproj @@ -0,0 +1,164 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + Document + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 + + +cl /E /TP %(Filename).ispc | ispc -O2 -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 + + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + + + + {947C5311-8B78-4D05-BEE4-BCF342D4B367} + Win32Proj + simple + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + + diff --git a/examples/timing.h b/examples/timing.h new file mode 100644 index 00000000..a51ab372 --- /dev/null +++ b/examples/timing.h @@ -0,0 +1,67 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include + + +#ifdef WIN32 +#include +#define rdtsc __rdtsc +#else +extern "C" { + __inline__ uint64_t rdtsc() { + uint32_t low, high; + __asm__ __volatile__ ( + "xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); + __asm__ __volatile__ ( + "rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; + } +} +#endif + +static uint64_t start, end; + +static inline void reset_and_start_timer() +{ + start = rdtsc(); +} + +/* Returns the number of millions of elapsed processor cycles since the + last reset_and_start_timer() call. */ +static inline double get_elapsed_mcycles() +{ + end = rdtsc(); + return (end-start) / (1024. * 1024.); +} diff --git a/expr.cpp b/expr.cpp new file mode 100644 index 00000000..53b07bdf --- /dev/null +++ b/expr.cpp @@ -0,0 +1,4519 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file expr.cpp + @brief Implementations of expression classes +*/ + +#include "expr.h" +#include "type.h" +#include "sym.h" +#include "ctx.h" +#include "module.h" +#include "util.h" +#include "llvmutil.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +///////////////////////////////////////////////////////////////////////////////////// +// Expr + +llvm::Value * +Expr::GetLValue(FunctionEmitContext *ctx) const { + // Expressions that can't provide an lvalue can just return NULL + return NULL; +} + + +llvm::Constant * +Expr::GetConstant(const Type *type) const { + // The default is failure; just return NULL + return NULL; +} + + +Symbol * +Expr::GetBaseSymbol() const { + // Not all expressions can do this, so provide a generally-useful + // default + return NULL; +} + + +/** If a conversion from 'fromAtomicType' to 'toAtomicType' may cause lost + precision, issue a warning. Don't warn for conversions to bool and + conversions between signed and unsigned integers of the same size. 
+ */ +static void +lMaybeIssuePrecisionWarning(const AtomicType *toAtomicType, + const AtomicType *fromAtomicType, + SourcePos pos, const char *errorMsgBase) { + switch (toAtomicType->basicType) { + case AtomicType::TYPE_BOOL: + case AtomicType::TYPE_INT32: + case AtomicType::TYPE_UINT32: + case AtomicType::TYPE_FLOAT: + case AtomicType::TYPE_INT64: + case AtomicType::TYPE_UINT64: + case AtomicType::TYPE_DOUBLE: + if ((int)toAtomicType->basicType < (int)fromAtomicType->basicType && + toAtomicType->basicType != AtomicType::TYPE_BOOL && + !(toAtomicType->basicType == AtomicType::TYPE_INT32 && + fromAtomicType->basicType == AtomicType::TYPE_UINT32) && + !(toAtomicType->basicType == AtomicType::TYPE_INT64 && + fromAtomicType->basicType == AtomicType::TYPE_UINT64)) + Warning(pos, "Conversion from type \"%s\" to type \"%s\" for %s" + " may lose information.", + fromAtomicType->GetString().c_str(), toAtomicType->GetString().c_str(), + errorMsgBase); + break; + default: + FATAL("logic error in lMaybeIssuePrecisionWarning"); + } +} + + +Expr * +Expr::TypeConv(const Type *toType, const char *errorMsgBase, bool failureOk) { + /* This function is way too long and complex. Is type conversion stuff + always this messy, or can this be cleaned up somehow? 
*/ + assert(failureOk || errorMsgBase != NULL); + + const Type *fromType = GetType(); + if (toType == NULL || fromType == NULL) + return this; + + // The types are equal; there's nothing to do + if (Type::Equal(toType, fromType)) + return this; + + if (fromType == AtomicType::Void) { + if (!failureOk) + Error(pos, "Can't convert from \"void\" to \"%s\" for %s.", + toType->GetString().c_str(), errorMsgBase); + return NULL; + } + + if (toType == AtomicType::Void) { + if (!failureOk) + Error(pos, "Can't convert type \"%s\" to \"void\" for %s.", + fromType->GetString().c_str(), errorMsgBase); + return NULL; + } + + if (toType->IsUniformType() && fromType->IsVaryingType()) { + if (!failureOk) + Error(pos, "Can't convert from varying type \"%s\" to uniform " + "type \"%s\" for %s.", fromType->GetString().c_str(), + toType->GetString().c_str(), errorMsgBase); + return NULL; + } + + // Convert from type T -> const T; just return a TypeCast expr, which + // can handle this + if (Type::Equal(toType, fromType->GetAsConstType())) + return new TypeCastExpr(toType, this, pos); + + if (dynamic_cast(fromType)) { + if (dynamic_cast(toType)) { + // Convert from a reference to a type to a const reference to a type; + // this is handled by TypeCastExpr + if (Type::Equal(toType->GetReferenceTarget(), + fromType->GetReferenceTarget()->GetAsConstType())) + return new TypeCastExpr(toType, this, pos); +#if 0 + // FIXME: why is this commented out?? 
+ else { + Error(pos, "Can't convert between incompatible reference types \"%s\" " + "and \"%s\".", fromType->GetString().c_str(), + toType->GetString().c_str()); + return NULL; + } +#endif + } + else { + // convert from a reference T -> T + Expr *fromExpr = new DereferenceExpr(this, pos); + if (fromExpr->GetType() == NULL) + return NULL; + return fromExpr->TypeConv(toType, errorMsgBase, failureOk); + } + } + else if (dynamic_cast(toType)) { + // T -> reference T + Expr *fromExpr = new ReferenceExpr(this, pos); + if (fromExpr->GetType() == NULL) + return NULL; + return fromExpr->TypeConv(toType, errorMsgBase, failureOk); + } + else if (Type::Equal(toType, fromType->GetAsNonConstType())) + // convert: const T -> T (as long as T isn't a reference) + return new TypeCastExpr(toType, this, pos); + + fromType = fromType->GetReferenceTarget(); + toType = toType->GetReferenceTarget(); + // I don't think this is necessary +//CO if (Type::Equal(toType, fromType)) +//CO return fromExpr; + + const ArrayType *toArrayType = dynamic_cast(toType); + const ArrayType *fromArrayType = dynamic_cast(fromType); + if (toArrayType && fromArrayType) { + if (Type::Equal(toArrayType->GetElementType(), fromArrayType->GetElementType())) { + // the case of different element counts should have returned + // out earlier, yes?? 
+ assert(toArrayType->GetElementCount() != fromArrayType->GetElementCount()); + return new TypeCastExpr(new ReferenceType(toType, false), this, pos); + } + else if (Type::Equal(toArrayType->GetElementType(), + fromArrayType->GetElementType()->GetAsConstType())) { + // T[x] -> const T[x] + return new TypeCastExpr(new ReferenceType(toType, false), this, pos); + } + else { + if (!failureOk) + Error(pos, "Array type \"%s\" can't be converted to type \"%s\" for %s.", + fromType->GetString().c_str(), toType->GetString().c_str(), + errorMsgBase); + return NULL; + } + } + + const VectorType *toVectorType = dynamic_cast(toType); + const VectorType *fromVectorType = dynamic_cast(fromType); + if (toVectorType && fromVectorType) { + // converting e.g. int -> float + if (fromVectorType->GetElementCount() != toVectorType->GetElementCount()) { + if (!failureOk) + Error(pos, "Can't convert between differently sized vector types " + "\"%s\" -> \"%s\" for %s.", fromType->GetString().c_str(), + toType->GetString().c_str(), errorMsgBase); + return NULL; + } + return new TypeCastExpr(toType, this, pos); + } + + const StructType *toStructType = dynamic_cast(toType); + const StructType *fromStructType = dynamic_cast(fromType); + if (toStructType && fromStructType) { + if (!Type::Equal(toStructType->GetAsUniformType()->GetAsConstType(), + fromStructType->GetAsUniformType()->GetAsConstType())) { + if (!failureOk) + Error(pos, "Can't convert between different struct types " + "\"%s\" -> \"%s\".", fromStructType->GetString().c_str(), + toStructType->GetString().c_str()); + return NULL; + } + + return new TypeCastExpr(toType, this, pos); + } + + // from here on out, the from type can only be atomic something or + // other... 
+ const AtomicType *fromAtomicType = dynamic_cast(fromType); + if (fromAtomicType == NULL) { + if (!failureOk) + Error(pos, "Type conversion only possible from atomic types, not " + "from \"%s\" to \"%s\", for %s.", fromType->GetString().c_str(), + toType->GetString().c_str(), errorMsgBase); + return NULL; + } + + // scalar -> short-vector conversions + if (toVectorType != NULL) + return new TypeCastExpr(toType, this, pos); + + // ok, it better be a scalar->scalar conversion of some sort by now + const AtomicType *toAtomicType = dynamic_cast(toType); + if (toAtomicType == NULL) { + if (!failureOk) + Error(pos, "Type conversion only possible to atomic types, not " + "from \"%s\" to \"%s\", for %s.", + fromType->GetString().c_str(), toType->GetString().c_str(), + errorMsgBase); + return NULL; + } + + if (!failureOk) + lMaybeIssuePrecisionWarning(toAtomicType, fromAtomicType, pos, + errorMsgBase); + + return new TypeCastExpr(toType, this, pos); +} + + +/////////////////////////////////////////////////////////////////////////// + +/** Given an atomic or vector type, this returns a boolean type with the + same "shape". In other words, if the given type is a vector type of + three uniform ints, the returned type is a vector type of three uniform + bools. */ +static const Type * +lMatchingBoolType(const Type *type) { + bool uniformTest = type->IsUniformType(); + const AtomicType *boolBase = uniformTest ? 
AtomicType::UniformBool : + AtomicType::VaryingBool; + const VectorType *vt = dynamic_cast(type); + if (vt != NULL) + return new VectorType(boolBase, vt->GetElementCount()); + else { + assert(dynamic_cast(type) != NULL); + return boolBase; + } +} + +/////////////////////////////////////////////////////////////////////////// +// UnaryExpr + +static llvm::Constant * +lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) { + const AtomicType *atomicType = dynamic_cast(type); + const VectorType *vectorType = dynamic_cast(type); + + // This function is only called with, and only works for atomic and + // vector types. + assert(atomicType != NULL || vectorType != NULL); + + if (atomicType) { + // If it's an atomic type, then figure out which of the llvmutil.h + // functions to call to get the corresponding constant and then + // call it... + bool isUniform = type->IsUniformType(); + switch (atomicType->basicType) { + case AtomicType::TYPE_VOID: + FATAL("can't get constant value for void type"); + return NULL; + case AtomicType::TYPE_BOOL: + if (isUniform) + return (value != 0.) ? LLVMTrue : LLVMFalse; + else + return LLVMBoolVector(value != 0.); + case AtomicType::TYPE_UINT32: { + unsigned int i = (unsigned int)value; + return isUniform ? LLVMUInt32(i) : LLVMUInt32Vector(i); + } + case AtomicType::TYPE_INT32: { + int i = (int)value; + assert((double)i == value); + return isUniform ? LLVMInt32(i) : LLVMInt32Vector(i); + } + case AtomicType::TYPE_FLOAT: + return isUniform ? LLVMFloat((float)value) : + LLVMFloatVector((float)value); + case AtomicType::TYPE_UINT64: { + uint64_t i = (uint64_t)value; + assert(value == (int64_t)i); + return isUniform ? LLVMUInt64(i) : LLVMUInt64Vector(i); + } + case AtomicType::TYPE_INT64: { + int64_t i = (int64_t)value; + assert((double)i == value); + return isUniform ? LLVMInt64(i) : LLVMInt64Vector(i); + } + case AtomicType::TYPE_DOUBLE: + return isUniform ? 
LLVMDouble(value) : LLVMDoubleVector(value); + default: + FATAL("logic error in lLLVMConstantValue"); + return NULL; + } + } + + // For vector types, first get the LLVM constant for the basetype with + // a recursive call to lLLVMConstantValue(). + const Type *baseType = vectorType->GetBaseType(); + llvm::Constant *constElement = lLLVMConstantValue(baseType, ctx, value); + const llvm::Type *llvmVectorType = vectorType->LLVMType(ctx); + + // Now create a constant version of the corresponding LLVM type that we + // use to represent the VectorType. + // FIXME: this is a little ugly in that the fact that ispc represents + // uniform VectorTypes as LLVM VectorTypes and varying VectorTypes as + // LLVM ArrayTypes leaks into the code here; it feels like this detail + // should be better encapsulated? + if (baseType->IsUniformType()) { + const llvm::VectorType *lvt = + llvm::dyn_cast(llvmVectorType); + assert(lvt != NULL); + std::vector vals; + for (unsigned int i = 0; i < lvt->getNumElements(); ++i) + vals.push_back(constElement); + return llvm::ConstantVector::get(lvt, vals); + } + else { + const llvm::ArrayType *lat = + llvm::dyn_cast(llvmVectorType); + assert(lat != NULL); + std::vector vals; + for (unsigned int i = 0; i < lat->getNumElements(); ++i) + vals.push_back(constElement); + return llvm::ConstantArray::get(lat, vals); + } +} + + +/** Utility routine to emit code to do a {pre,post}-{inc,dec}rement of the + given expresion. 
+ */ +static llvm::Value * +lEmitPrePostIncDec(UnaryExpr::Op op, Expr *expr, SourcePos pos, + FunctionEmitContext *ctx) { + const Type *type = expr->GetType(); + + // Get both the lvalue and the rvalue of the given expression + llvm::Value *lvalue = NULL, *rvalue = NULL; + if (dynamic_cast(type) != NULL) { + type = type->GetReferenceTarget(); + lvalue = expr->GetValue(ctx); + + Expr *deref = new DereferenceExpr(expr, expr->pos); + rvalue = deref->GetValue(ctx); + } + else { + lvalue = expr->GetLValue(ctx); + rvalue = expr->GetValue(ctx); + } + + if (lvalue == NULL) { + // If we can't get a lvalue, then we have an error here + Error(expr->pos, "Can't %s-%s non-lvalues.", + (op == UnaryExpr::PreInc || op == UnaryExpr::PreDec) ? "pre" : "post", + (op == UnaryExpr::PreInc || op == UnaryExpr::PostInc) ? "increment" : "decrement"); + return NULL; + } + + // Emit code to do the appropriate addition/subtraction to the + // expression's old value + ctx->SetDebugPos(pos); + llvm::Value *binop = NULL; + int delta = (op == UnaryExpr::PreInc || op == UnaryExpr::PostInc) ? 1 : -1; + llvm::Constant *dval = lLLVMConstantValue(type, g->ctx, delta); + if (!type->IsFloatType()) + binop = ctx->BinaryOperator(llvm::Instruction::Add, rvalue, + dval, "val_inc_or_dec"); + else + binop = ctx->BinaryOperator(llvm::Instruction::FAdd, rvalue, + dval, "val_inc_or_dec"); + +#if 0 + if (type->IsUniformType()) { + if (ctx->VaryingCFDepth() > 0) + Warning(expr->pos, + "Modifying \"uniform\" value under \"varying\" control flow. Beware."); + } +#endif + + // And store the result out to the lvalue + ctx->StoreInst(binop, lvalue, ctx->GetMask(), type); + + // And then if it's a pre increment/decrement, return the final + // computed result; otherwise return the previously-grabbed expression + // value. + return (op == UnaryExpr::PreInc || op == UnaryExpr::PreDec) ? binop : rvalue; +} + + + +/** Utility routine to emit code to negate the given expression. 
+ */ +static llvm::Value * +lEmitNegate(Expr *arg, SourcePos pos, FunctionEmitContext *ctx) { + const Type *type = arg->GetType(); + llvm::Value *argVal = arg->GetValue(ctx); + if (type == NULL || argVal == NULL) + return NULL; + + // Negate by subtracting from zero... + llvm::Value *zero = lLLVMConstantValue(type, g->ctx, 0.); + ctx->SetDebugPos(pos); + if (type->IsFloatType()) + return ctx->BinaryOperator(llvm::Instruction::FSub, zero, argVal, "fnegate"); + else { + assert(type->IsIntType()); + return ctx->BinaryOperator(llvm::Instruction::Sub, zero, argVal, "fnegate"); + } +} + + +UnaryExpr::UnaryExpr(Op o, Expr *e, SourcePos p) + : Expr(p), op(o) { + expr = e; +} + + +llvm::Value * +UnaryExpr::GetValue(FunctionEmitContext *ctx) const { + if (expr == NULL) + return NULL; + + ctx->SetDebugPos(pos); + + switch (op) { + case PreInc: + case PreDec: + case PostInc: + case PostDec: + return lEmitPrePostIncDec(op, expr, pos, ctx); + case Negate: + return lEmitNegate(expr, pos, ctx); + case LogicalNot: { + llvm::Value *argVal = expr->GetValue(ctx); + return ctx->NotOperator(argVal, "logicalnot"); + } + case BitNot: { + llvm::Value *argVal = expr->GetValue(ctx); + return ctx->NotOperator(argVal, "bitnot"); + } + default: + FATAL("logic error"); + return NULL; + } +} + + +const Type * +UnaryExpr::GetType() const { + if (expr == NULL) + return NULL; + + const Type *type = expr->GetType(); + if (type == NULL) + return NULL; + + // For all unary expressions besides logical not, the returned type is + // the same as the source type. Logical not always returns a bool + // type, with the same shape as the input type. 
+ switch (op) { + case PreInc: + case PreDec: + case PostInc: + case PostDec: + case Negate: + case BitNot: + return type; + case LogicalNot: + return lMatchingBoolType(type); + default: + FATAL("error"); + return NULL; + } +} + + +Expr * +UnaryExpr::Optimize() { + if (!expr) + return NULL; + + expr = expr->Optimize(); + + ConstExpr *constExpr = dynamic_cast(expr); + // If the operand isn't a constant, then we can't do any optimization + // here... + if (constExpr == NULL) + return this; + + const Type *type = constExpr->GetType(); + + if (type == AtomicType::UniformInt64 || + type == AtomicType::VaryingInt64 || + type == AtomicType::UniformUInt64 || + type == AtomicType::VaryingUInt64 || + type == AtomicType::UniformConstInt64 || + type == AtomicType::VaryingConstInt64 || + type == AtomicType::UniformConstUInt64 || + type == AtomicType::VaryingConstUInt64) + // FIXME: should handle these at some point; for now we only do + // constant folding for bool, int32 and float types... + return this; + + switch (op) { + case PreInc: + case PreDec: + case PostInc: + case PostDec: + // this shouldn't happen--it's illegal to modify a contant value.. + // An error will be issued elsewhere... + return this; + case Negate: { + // Since we currently only handle int32 and floats here, it's safe + // to stuff whatever we have into a double, do the negate as a + // double, and then return a ConstExpr with the same type as the + // original... 
+ double v[ISPC_MAX_NVEC]; + int count = constExpr->AsDouble(v); + for (int i = 0; i < count; ++i) + v[i] = -v[i]; + return new ConstExpr(constExpr, v); + } + case BitNot: { + if (type == AtomicType::UniformInt32 || + type == AtomicType::VaryingInt32 || + type == AtomicType::UniformConstInt32 || + type == AtomicType::VaryingConstInt32) { + int32_t v[ISPC_MAX_NVEC]; + int count = constExpr->AsInt32(v); + for (int i = 0; i < count; ++i) + v[i] = ~v[i]; + return new ConstExpr(type, v, pos); + } + else if (type == AtomicType::UniformUInt32 || + type == AtomicType::VaryingUInt32 || + type == AtomicType::UniformConstUInt32 || + type == AtomicType::VaryingConstUInt32) { + uint32_t v[ISPC_MAX_NVEC]; + int count = constExpr->AsUInt32(v); + for (int i = 0; i < count; ++i) + v[i] = ~v[i]; + return new ConstExpr(type, v, pos); + } + else + FATAL("unexpected type in UnaryExpr::Optimize() / BitNot case"); + } + case LogicalNot: { + assert(type == AtomicType::UniformBool || + type == AtomicType::VaryingBool || + type == AtomicType::UniformConstBool || + type == AtomicType::VaryingConstBool); + bool v[ISPC_MAX_NVEC]; + int count = constExpr->AsBool(v); + for (int i = 0; i < count; ++i) + v[i] = !v[i]; + return new ConstExpr(type, v, pos); + } + default: + FATAL("unexpected op in UnaryExpr::Optimize()"); + return NULL; + } +} + + +Expr * +UnaryExpr::TypeCheck() { + if (expr != NULL) + expr = expr->TypeCheck(); + if (expr == NULL) + // something went wrong in type checking... 
+ return NULL; + + const Type *type = expr->GetType(); + if (type == NULL) + return NULL; + + if (op == PreInc || op == PreDec || op == PostInc || op == PostDec) { + if (!type->IsNumericType()) { + Error(expr->pos, "Can only pre/post increment float and integer " + "types, not \"%s\".", type->GetString().c_str()); + return NULL; + } + return this; + } + + // don't do this for pre/post increment/decrement + if (dynamic_cast(type)) { + expr = new DereferenceExpr(expr, pos); + type = expr->GetType(); + } + + if (op == Negate) { + if (!type->IsNumericType()) { + Error(expr->pos, "Negate not allowed for non-numeric type \"%s\".", + type->GetString().c_str()); + return NULL; + } + } + else if (op == LogicalNot) { + const Type *boolType = lMatchingBoolType(type); + expr = expr->TypeConv(boolType, "logical not"); + if (!expr) + return NULL; + } + else if (op == BitNot) { + if (!type->IsIntType()) { + Error(expr->pos, "~ operator can only be used with integer types, " + "not \"%s\".", type->GetString().c_str()); + return NULL; + } + } + return this; +} + + +void +UnaryExpr::Print() const { + if (!expr || !GetType()) + return; + + printf("[ %s ] (", GetType()->GetString().c_str()); + if (op == PreInc) printf("++"); + if (op == PreDec) printf("--"); + if (op == Negate) printf("-"); + if (op == LogicalNot) printf("!"); + if (op == BitNot) printf("~"); + printf("("); + expr->Print(); + printf(")"); + if (op == PostInc) printf("++"); + if (op == PostDec) printf("--"); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// BinaryExpr + +static const char * +lOpString(BinaryExpr::Op op) { + switch (op) { + case BinaryExpr::Add: return "+"; + case BinaryExpr::Sub: return "-"; + case BinaryExpr::Mul: return "*"; + case BinaryExpr::Div: return "/"; + case BinaryExpr::Mod: return "%"; + case BinaryExpr::Shl: return "<<"; + case BinaryExpr::Shr: return ">>"; + case BinaryExpr::Lt: return "<"; + case BinaryExpr::Gt: return 
">"; + case BinaryExpr::Le: return "<="; + case BinaryExpr::Ge: return ">="; + case BinaryExpr::Equal: return "=="; + case BinaryExpr::NotEqual: return "!="; + case BinaryExpr::BitAnd: return "&"; + case BinaryExpr::BitXor: return "^"; + case BinaryExpr::BitOr: return "|"; + case BinaryExpr::LogicalAnd: return "&&"; + case BinaryExpr::LogicalOr: return "||"; + case BinaryExpr::Comma: return ","; + default: + FATAL("unimplemented case in lOpString()"); + return ""; + } +} + + +/** Utility routine to emit the binary bitwise operator corresponding to + the given BinaryExpr::Op. +*/ +static llvm::Value * +lEmitBinaryBitOp(BinaryExpr::Op op, llvm::Value *arg0Val, + llvm::Value *arg1Val, FunctionEmitContext *ctx) { + llvm::Instruction::BinaryOps inst; + switch (op) { + case BinaryExpr::Shl: inst = llvm::Instruction::Shl; break; + case BinaryExpr::Shr: inst = llvm::Instruction::AShr; break; + case BinaryExpr::BitAnd: inst = llvm::Instruction::And; break; + case BinaryExpr::BitXor: inst = llvm::Instruction::Xor; break; + case BinaryExpr::BitOr: inst = llvm::Instruction::Or; break; + default: + FATAL("logic error in lEmitBinaryBitOp()"); + return NULL; + } + + return ctx->BinaryOperator(inst, arg0Val, arg1Val, "bitop"); +} + + +/** Utility routine to emit binary arithmetic operator based on the given + BinaryExpr::Op. +*/ +static llvm::Value * +lEmitBinaryArith(BinaryExpr::Op op, llvm::Value *e0Val, llvm::Value *e1Val, + const Type *type, FunctionEmitContext *ctx, SourcePos pos) { + llvm::Instruction::BinaryOps inst; + bool isFloatOp = type->IsFloatType(); + bool isUnsignedOp = type->IsUnsignedType(); + + switch (op) { + case BinaryExpr::Add: + inst = isFloatOp ? llvm::Instruction::FAdd : llvm::Instruction::Add; + break; + case BinaryExpr::Sub: + inst = isFloatOp ? llvm::Instruction::FSub : llvm::Instruction::Sub; + break; + case BinaryExpr::Mul: + inst = isFloatOp ? 
llvm::Instruction::FMul : llvm::Instruction::Mul; + break; + case BinaryExpr::Div: + if (type->IsVaryingType() && !isFloatOp) + PerformanceWarning(pos, "Division with varying integer types is " + "very inefficient."); + inst = isFloatOp ? llvm::Instruction::FDiv : + (isUnsignedOp ? llvm::Instruction::UDiv : llvm::Instruction::SDiv); + break; + case BinaryExpr::Mod: + if (type->IsVaryingType() && !isFloatOp) + PerformanceWarning(pos, "Modulus operator with varying types is " + "very inefficient."); + inst = isFloatOp ? llvm::Instruction::FRem : + (isUnsignedOp ? llvm::Instruction::URem : llvm::Instruction::SRem); + break; + default: + FATAL("Invalid op type passed to lEmitBinaryArith()"); + return NULL; + } + + return ctx->BinaryOperator(inst, e0Val, e1Val, "binop"); +} + + +/** Utility routine to emit a binary comparison operator based on the given + BinaryExpr::Op. + */ +static llvm::Value * +lEmitBinaryCmp(BinaryExpr::Op op, llvm::Value *e0Val, llvm::Value *e1Val, + const Type *type, FunctionEmitContext *ctx, SourcePos pos) { + bool isFloatOp = type->IsFloatType(); + bool isUnsignedOp = type->IsUnsignedType(); + + llvm::CmpInst::Predicate pred; + switch (op) { + case BinaryExpr::Lt: + pred = isFloatOp ? llvm::CmpInst::FCMP_OLT : + (isUnsignedOp ? llvm::CmpInst::ICMP_ULT : llvm::CmpInst::ICMP_SLT); + break; + case BinaryExpr::Gt: + pred = isFloatOp ? llvm::CmpInst::FCMP_OGT : + (isUnsignedOp ? llvm::CmpInst::ICMP_UGT : llvm::CmpInst::ICMP_SGT); + break; + case BinaryExpr::Le: + pred = isFloatOp ? llvm::CmpInst::FCMP_OLE : + (isUnsignedOp ? llvm::CmpInst::ICMP_ULE : llvm::CmpInst::ICMP_SLE); + break; + case BinaryExpr::Ge: + pred = isFloatOp ? llvm::CmpInst::FCMP_OGE : + (isUnsignedOp ? llvm::CmpInst::ICMP_UGE : llvm::CmpInst::ICMP_SGE); + break; + case BinaryExpr::Equal: + pred = isFloatOp ? llvm::CmpInst::FCMP_OEQ : llvm::CmpInst::ICMP_EQ; + break; + case BinaryExpr::NotEqual: + pred = isFloatOp ? 
llvm::CmpInst::FCMP_ONE : llvm::CmpInst::ICMP_NE; + break; + default: + FATAL("error in lEmitBinaryCmp()"); + return NULL; + } + + llvm::Value *cmp = ctx->CmpInst(isFloatOp ? llvm::Instruction::FCmp : + llvm::Instruction::ICmp, + pred, e0Val, e1Val, "bincmp"); + // This is a little ugly: CmpInst returns i1 values, but we use vectors + // of i32s for varying bool values; type convert the result here if + // needed. + if (type->IsVaryingType()) + cmp = ctx->I1VecToBoolVec(cmp); + + return cmp; +} + + +BinaryExpr::BinaryExpr(Op o, Expr *a, Expr *b, SourcePos p) + : Expr(p), op(o) { + arg0 = a; + arg1 = b; +} + + +llvm::Value * +BinaryExpr::GetValue(FunctionEmitContext *ctx) const { + if (!arg0 || !arg1) + return NULL; + + llvm::Value *e0Val = arg0->GetValue(ctx); + llvm::Value *e1Val = arg1->GetValue(ctx); + ctx->SetDebugPos(pos); + + switch (op) { + case Add: + case Sub: + case Mul: + case Div: + case Mod: + return lEmitBinaryArith(op, e0Val, e1Val, arg0->GetType(), ctx, pos); + case Lt: + case Gt: + case Le: + case Ge: + case Equal: + case NotEqual: + return lEmitBinaryCmp(op, e0Val, e1Val, arg0->GetType(), ctx, pos); + case Shl: + case Shr: + case BitAnd: + case BitXor: + case BitOr: { + if (op == Shr && arg1->GetType()->IsVaryingType() && + dynamic_cast(arg1) == NULL) + PerformanceWarning(pos, "Shift right is extremely inefficient for " + "varying shift amounts."); + return lEmitBinaryBitOp(op, e0Val, e1Val, ctx); + } + case LogicalAnd: + return ctx->BinaryOperator(llvm::Instruction::And, e0Val, e1Val, + "logical_and"); + case LogicalOr: + return ctx->BinaryOperator(llvm::Instruction::Or, e0Val, e1Val, + "logical_or"); + case Comma: + return e1Val; + default: + FATAL("logic error"); + return NULL; + } +} + + +const Type * +BinaryExpr::GetType() const { + if (arg0 == NULL || arg1 == NULL) + return NULL; + + const Type *type0 = arg0->GetType(), *type1 = arg1->GetType(); + if (type0 == NULL || type1 == NULL) + return NULL; + + if (!type0->IsBoolType() && 
!type0->IsNumericType()) { + Error(arg0->pos, "First operand to binary operator \"%s\" is of invalid " + "type \"%s\".", lOpString(op), type0->GetString().c_str()); + return NULL; + } + if (!type1->IsBoolType() && !type1->IsNumericType()) { + Error(arg1->pos, + "Second operand to binary operator \"%s\" is of invalid " + "type \"%s\".", lOpString(op), type1->GetString().c_str()); + return NULL; + } + + const Type *promotedType = Type::MoreGeneralType(type0, type1, pos, + lOpString(op)); + // I don't think that MoreGeneralType should be able to fail after the + // type checks above. + assert(promotedType != NULL); + + switch (op) { + case Add: + case Sub: + case Mul: + case Div: + case Mod: + return promotedType; + case Lt: + case Gt: + case Le: + case Ge: + case Equal: + case NotEqual: + case LogicalAnd: + case LogicalOr: + return lMatchingBoolType(promotedType); + case Shl: + case Shr: + case BitAnd: + case BitXor: + case BitOr: + return promotedType; + case Comma: + return arg1->GetType(); + default: + FATAL("logic error in BinaryExpr::GetType()"); + return NULL; + } +} + + +#define FOLD_OP(O, E) \ + case O: \ + for (int i = 0; i < count; ++i) \ + result[i] = (v0[i] E v1[i]); \ + break + +/** Constant fold the binary integer operations that aren't also applicable + to floating-point types. +*/ +template static ConstExpr * +lConstFoldBinIntOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *carg0) { + T result[ISPC_MAX_NVEC]; + int count = carg0->Count(); + + switch (op) { + FOLD_OP(BinaryExpr::Mod, %); + FOLD_OP(BinaryExpr::Shl, <<); + FOLD_OP(BinaryExpr::Shr, >>); + FOLD_OP(BinaryExpr::BitAnd, &); + FOLD_OP(BinaryExpr::BitXor, ^); + FOLD_OP(BinaryExpr::BitOr, |); + default: + return NULL; + } + + return new ConstExpr(carg0->GetType(), result, carg0->pos); +} + + +/** Constant fold the binary logical ops. 
+ */ +template static ConstExpr * +lConstFoldBinLogicalOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *carg0) { + bool result[ISPC_MAX_NVEC]; + int count = carg0->Count(); + + switch (op) { + FOLD_OP(BinaryExpr::Lt, <); + FOLD_OP(BinaryExpr::Gt, >); + FOLD_OP(BinaryExpr::Le, <=); + FOLD_OP(BinaryExpr::Ge, >=); + FOLD_OP(BinaryExpr::Equal, ==); + FOLD_OP(BinaryExpr::NotEqual, !=); + FOLD_OP(BinaryExpr::LogicalAnd, &&); + FOLD_OP(BinaryExpr::LogicalOr, ||); + default: + return NULL; + } + + const Type *rType = carg0->GetType()->IsUniformType() ? AtomicType::UniformBool : + AtomicType::VaryingBool; + return new ConstExpr(rType, result, carg0->pos); +} + + +/** Constant fold binary arithmetic ops. + */ +template static ConstExpr * +lConstFoldBinArithOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *carg0) { + T result[ISPC_MAX_NVEC]; + int count = carg0->Count(); + + switch (op) { + FOLD_OP(BinaryExpr::Add, +); + FOLD_OP(BinaryExpr::Sub, -); + FOLD_OP(BinaryExpr::Mul, *); + FOLD_OP(BinaryExpr::Div, /); + default: + return NULL; + } + + return new ConstExpr(carg0->GetType(), result, carg0->pos); +} + + +/** Constant fold the various boolean binary ops. 
+ */ +static ConstExpr * +lConstFoldBoolBinOp(BinaryExpr::Op op, const bool *v0, const bool *v1, + ConstExpr *carg0) { + bool result[ISPC_MAX_NVEC]; + int count = carg0->Count(); + + switch (op) { + FOLD_OP(BinaryExpr::BitAnd, &); + FOLD_OP(BinaryExpr::BitXor, ^); + FOLD_OP(BinaryExpr::BitOr, |); + FOLD_OP(BinaryExpr::Lt, <); + FOLD_OP(BinaryExpr::Gt, >); + FOLD_OP(BinaryExpr::Le, <=); + FOLD_OP(BinaryExpr::Ge, >=); + FOLD_OP(BinaryExpr::Equal, ==); + FOLD_OP(BinaryExpr::NotEqual, !=); + FOLD_OP(BinaryExpr::LogicalAnd, &&); + FOLD_OP(BinaryExpr::LogicalOr, ||); + default: + return NULL; + } + + return new ConstExpr(carg0->GetType(), result, carg0->pos); +} + + +Expr * +BinaryExpr::Optimize() { + if (arg0 != NULL) + arg0 = arg0->Optimize(); + if (arg1 != NULL) + arg1 = arg1->Optimize(); + + if (!arg0 || !arg1) + return NULL; + + ConstExpr *constArg0 = dynamic_cast(arg0); + ConstExpr *constArg1 = dynamic_cast(arg1); + + if (g->opt.fastMath) { + // optimizations related to division by floats.. 
+ + // transform x / const -> x * (1/const) + if (op == Div && constArg1 != NULL) { + const Type *type1 = constArg1->GetType(); + if (Type::Equal(type1, AtomicType::UniformFloat) || + Type::Equal(type1, AtomicType::VaryingFloat) || + Type::Equal(type1, AtomicType::UniformConstFloat) || + Type::Equal(type1, AtomicType::VaryingConstFloat)) { + float inv[ISPC_MAX_NVEC]; + int count = constArg1->AsFloat(inv); + for (int i = 0; i < count; ++i) + inv[i] = 1.f / inv[i]; + Expr *einv = new ConstExpr(type1, inv, constArg1->pos); + Expr *e = new BinaryExpr(Mul, arg0, einv, pos); + e = e->TypeCheck(); + if (e == NULL) + return NULL; + return e->Optimize(); + } + } + + // transform x / y -> x * rcp(y) + if (op == Div) { + const Type *type1 = arg1->GetType(); + if (Type::Equal(type1, AtomicType::UniformFloat) || + Type::Equal(type1, AtomicType::VaryingFloat) || + Type::Equal(type1, AtomicType::UniformConstFloat) || + Type::Equal(type1, AtomicType::VaryingConstFloat)) { + // Get the symbol for the appropriate builtin + std::vector *rcpFuns = + m->symbolTable->LookupFunction("rcp"); + if (rcpFuns != NULL) { + assert(rcpFuns->size() == 2); + Expr *rcpSymExpr = new FunctionSymbolExpr(rcpFuns, pos); + ExprList *args = new ExprList(arg1, arg1->pos); + Expr *rcpCall = new FunctionCallExpr(rcpSymExpr, args, + arg1->pos, false); + rcpCall = rcpCall->TypeCheck(); + if (rcpCall == NULL) + return NULL; + rcpCall = rcpCall->Optimize(); + if (rcpCall == NULL) + return NULL; + + Expr *ret = new BinaryExpr(Mul, arg0, rcpCall, pos); + ret = ret->TypeCheck(); + if (ret == NULL) + return NULL; + return ret->Optimize(); + } + else + Warning(pos, "rcp() not found from stdlib. Can't apply " + "fast-math rcp optimization."); + } + } + } + + // From here on out, we're just doing constant folding, so if both args + // aren't constants then we're done... 
+ if (constArg0 == NULL || constArg1 == NULL) + return this; + + assert(Type::Equal(arg0->GetType()->GetAsNonConstType(), + arg1->GetType()->GetAsNonConstType())); + const Type *type = arg0->GetType()->GetAsNonConstType(); + if (type == AtomicType::UniformFloat || type == AtomicType::VaryingFloat) { + float v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC]; + constArg0->AsFloat(v0); + constArg1->AsFloat(v1); + ConstExpr *ret; + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) + return ret; + else + return this; + } + if (type == AtomicType::UniformDouble || type == AtomicType::VaryingDouble) { + double v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC]; + constArg0->AsDouble(v0); + constArg1->AsDouble(v1); + ConstExpr *ret; + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) + return ret; + else + return this; + } + if (type == AtomicType::UniformInt32 || type == AtomicType::VaryingInt32) { + int32_t v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC]; + constArg0->AsInt32(v0); + constArg1->AsInt32(v1); + ConstExpr *ret; + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinIntOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) + return ret; + else + return this; + } + else if (type == AtomicType::UniformUInt32 || type == AtomicType::VaryingUInt32) { + uint32_t v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC]; + constArg0->AsUInt32(v0); + constArg1->AsUInt32(v1); + ConstExpr *ret; + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinIntOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) + return ret; + else + return this; + } + else if (type == 
AtomicType::UniformBool || type == AtomicType::VaryingBool) { + bool v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC]; + constArg0->AsBool(v0); + constArg1->AsBool(v1); + ConstExpr *ret; + if ((ret = lConstFoldBoolBinOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) + return ret; + else + return this; + } + else + return this; +} + + +Expr * +BinaryExpr::TypeCheck() { + if (arg0 != NULL) + arg0 = arg0->TypeCheck(); + if (arg1 != NULL) + arg1 = arg1->TypeCheck(); + + if (arg0 == NULL || arg1 == NULL) + return NULL; + + const Type *type0 = arg0->GetType(), *type1 = arg1->GetType(); + if (type0 == NULL || type1 == NULL) + return NULL; + + switch (op) { + case Shl: + case Shr: + case BitAnd: + case BitXor: + case BitOr: { + // Must have integer or bool-typed operands for these bit-related + // ops; don't do any implicit conversions from floats here... + if (!type0->IsIntType() && !type0->IsBoolType()) { + Error(arg0->pos, "First operand to binary operator \"%s\" must be " + "an integer or bool.", lOpString(op)); + return NULL; + } + if (!type1->IsIntType() && !type1->IsBoolType()) { + Error(arg1->pos, "Second operand to binary operator \"%s\" must be " + "an integer or bool.", lOpString(op)); + return NULL; + } + + const Type *promotedType = Type::MoreGeneralType(type0, type1, arg0->pos, + "binary bit op"); + if (promotedType == NULL) + return NULL; + + arg0 = arg0->TypeConv(promotedType, "binary bit op"); + arg1 = arg1->TypeConv(promotedType, "binary bit op"); + if (arg0 == NULL || arg1 == NULL) + return NULL; + return this; + } + case Add: + case Sub: + case Mul: + case Div: + case Mod: + case Lt: + case Gt: + case Le: + case Ge: { + // Must be numeric type for these. 
(And mod is special--can't be float) + if (!type0->IsNumericType() || (op == Mod && type0->IsFloatType())) { + Error(arg0->pos, "First operand to binary operator \"%s\" is of " + "invalid type \"%s\".", lOpString(op), + type0->GetString().c_str()); + return NULL; + } + if (!type1->IsNumericType() || (op == Mod && type1->IsFloatType())) { + Error(arg1->pos, "First operand to binary operator \"%s\" is of " + "invalid type \"%s\".", lOpString(op), + type1->GetString().c_str()); + return NULL; + } + + const Type *promotedType = Type::MoreGeneralType(type0, type1, arg0->pos, + lOpString(op)); + if (promotedType == NULL) + return NULL; + + arg0 = arg0->TypeConv(promotedType, lOpString(op)); + arg1 = arg1->TypeConv(promotedType, lOpString(op)); + if (!arg0 || !arg1) + return NULL; + return this; + } + case Equal: + case NotEqual: { + if (!type0->IsBoolType() && !type0->IsNumericType()) { + Error(arg0->pos, + "First operand to equality operator \"%s\" is of " + "non-comparable type \"%s\".", lOpString(op), + type0->GetString().c_str()); + return NULL; + } + if (!type1->IsBoolType() && !type1->IsNumericType()) { + Error(arg1->pos, + "Second operand to equality operator \"%s\" is of " + "non-comparable type \"%s\".", lOpString(op), + type1->GetString().c_str()); + return NULL; + } + + const Type *promotedType = + Type::MoreGeneralType(type0, type1, arg0->pos, lOpString(op)); + if (promotedType == NULL) + return NULL; + + arg0 = arg0->TypeConv(promotedType, lOpString(op)); + arg1 = arg1->TypeConv(promotedType, lOpString(op)); + if (!arg0 || !arg1) + return NULL; + return this; + } + case LogicalAnd: + case LogicalOr: { + // We need to type convert to a boolean type of the more general + // shape of the two types + bool isUniform = (type0->IsUniformType() && type1->IsUniformType()); + const AtomicType *boolType = isUniform ? 
AtomicType::UniformBool : + AtomicType::VaryingBool; + const Type *destType = NULL; + const VectorType *vtype0 = dynamic_cast(type0); + const VectorType *vtype1 = dynamic_cast(type1); + if (vtype0 && vtype1) { + int sz0 = vtype0->GetElementCount(), sz1 = vtype1->GetElementCount(); + if (sz0 != sz1) { + Error(pos, "Can't do logical operation \"%s\" between vector types of " + "different sizes (%d vs. %d).", lOpString(op), sz0, sz1); + return NULL; + } + destType = new VectorType(boolType, sz0); + } + else if (vtype0) + destType = new VectorType(boolType, vtype0->GetElementCount()); + else if (vtype1) + destType = new VectorType(boolType, vtype1->GetElementCount()); + else + destType = boolType; + + arg0 = arg0->TypeConv(destType, lOpString(op)); + arg1 = arg1->TypeConv(destType, lOpString(op)); + if (!arg0 || !arg1) + return NULL; + return this; + } + case Comma: + return this; + default: + FATAL("logic error"); + return NULL; + } +} + + +void +BinaryExpr::Print() const { + if (!arg0 || !arg1 || !GetType()) + return; + + printf("[ %s ] (", GetType()->GetString().c_str()); + arg0->Print(); + printf(" %s ", lOpString(op)); + arg1->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// AssignExpr + + +/** Store the result of an assignment to the given location. + */ +static void +lStoreAssignResult(llvm::Value *rv, llvm::Value *lv, const Type *type, + FunctionEmitContext *ctx, Symbol *baseSym) { + assert(baseSym->varyingCFDepth <= ctx->VaryingCFDepth()); + if (!g->opt.disableMaskedStoreToStore && + baseSym->varyingCFDepth == ctx->VaryingCFDepth() && + baseSym->isStatic == false && + dynamic_cast(baseSym->type) == NULL) { + // If the variable is declared at the same varying control flow + // depth as where it's being assigned, then we don't need to do any + // masking but can just do the assignment as if all the lanes were + // known to be on. 
While this may lead to random/garbage values + // written into the lanes that are off, by definition they will + // never be accessed, since those lanes aren't executing, and won't + // be executing at this scope or any other one before the variable + // goes out of scope. + ctx->StoreInst(rv, lv, LLVMMaskAllOn, type); + } + else + ctx->StoreInst(rv, lv, ctx->GetMask(), type); +} + + +/** Emit code to do an "assignment + operation" operator, e.g. "+=". + */ +static llvm::Value * +lEmitOpAssign(AssignExpr::Op op, Expr *arg0, Expr *arg1, const Type *type, + Symbol *baseSym, SourcePos pos, FunctionEmitContext *ctx) { + llvm::Value *lv = arg0->GetLValue(ctx); + if (!lv) { + // FIXME: I think this test is unnecessary and that this case + // should be caught during typechecking + Error(pos, "Can't assign to left-hand side of expression."); + return NULL; + } + + // Get the value on the right-hand side of the assignment+operation + // operator and load the current value on the left-hand side. + llvm::Value *rvalue = arg1->GetValue(ctx); + ctx->SetDebugPos(pos); + llvm::Value *oldLHS = ctx->LoadInst(lv, type, "opassign_load"); + + // Map the operator to the corresponding BinaryExpr::Op operator + BinaryExpr::Op basicop; + switch (op) { + case AssignExpr::MulAssign: basicop = BinaryExpr::Mul; break; + case AssignExpr::DivAssign: basicop = BinaryExpr::Div; break; + case AssignExpr::ModAssign: basicop = BinaryExpr::Mod; break; + case AssignExpr::AddAssign: basicop = BinaryExpr::Add; break; + case AssignExpr::SubAssign: basicop = BinaryExpr::Sub; break; + case AssignExpr::ShlAssign: basicop = BinaryExpr::Shl; break; + case AssignExpr::ShrAssign: basicop = BinaryExpr::Shr; break; + case AssignExpr::AndAssign: basicop = BinaryExpr::BitAnd; break; + case AssignExpr::XorAssign: basicop = BinaryExpr::BitXor; break; + case AssignExpr::OrAssign: basicop = BinaryExpr::BitOr; break; + default: + FATAL("logic error in lEmitOpAssign()"); + return NULL; + } + + // Emit the code to compute 
the new value + llvm::Value *newValue = NULL; + switch (op) { + case AssignExpr::MulAssign: + case AssignExpr::DivAssign: + case AssignExpr::ModAssign: + case AssignExpr::AddAssign: + case AssignExpr::SubAssign: + newValue = lEmitBinaryArith(basicop, oldLHS, rvalue, type, ctx, pos); + break; + case AssignExpr::ShlAssign: + case AssignExpr::ShrAssign: + case AssignExpr::AndAssign: + case AssignExpr::XorAssign: + case AssignExpr::OrAssign: + newValue = lEmitBinaryBitOp(basicop, oldLHS, rvalue, ctx); + break; + default: + FATAL("logic error in lEmitOpAssign"); + return NULL; + } + + // And store the result back to the lvalue. + lStoreAssignResult(newValue, lv, type, ctx, baseSym); + + return newValue; +} + + +AssignExpr::AssignExpr(AssignExpr::Op o, Expr *a, Expr *b, SourcePos p) + : Expr(p), op(o) { + lvalue = a; + rvalue = b; +} + + +llvm::Value * +AssignExpr::GetValue(FunctionEmitContext *ctx) const { + const Type *type = NULL; + if (lvalue == NULL || rvalue == NULL || (type = GetType()) == NULL) + return NULL; + + ctx->SetDebugPos(pos); + +#if 0 + if (ctx->VaryingCFDepth() > 0 && type->IsUniformType()) + Warning(pos, "Modifying \"uniform\" value under \"varying\" control flow. Beware."); +#endif + + Symbol *baseSym = lvalue->GetBaseSymbol(); + if (!baseSym) { + // FIXME: I think that this check also is unnecessary and that this + // case should be covered during type checking. + Error(pos, "Left hand side of assignment statement can't be assigned to."); + return NULL; + } + + switch (op) { + case Assign: { + llvm::Value *lv = lvalue->GetLValue(ctx); + if (!lv) { + // FIXME: another, I believe, now unnecessary test? 
+ Error(lvalue->pos, "Can't assign to left-hand side of expression."); + return NULL; + } + + llvm::Value *rv = rvalue->GetValue(ctx); + if (rv == NULL) + return NULL; + + ctx->SetDebugPos(pos); + + // Warn if we're assigning a large array + const ArrayType *at = dynamic_cast(type); + if (at && at->TotalElementCount() > 4) + PerformanceWarning(pos, "Copying %d element array in assignment expression.", + at->TotalElementCount()); + +#if 0 + const StructType *st = dynamic_cast(type); + if (st != NULL) { + bool anyUniform = false; + for (int i = 0; i < st->NumElements(); ++i) { + if (st->GetMemberType(i)->IsUniformType()) + anyUniform = true; + } + + if (anyUniform && ctx->VaryingCFDepth() > 0) + Warning(pos, "Modifying \"uniform\" value under \"varying\" " + "control flow. Beware."); + } +#endif + + lStoreAssignResult(rv, lv, type, ctx, baseSym); + + return rv; + } + case MulAssign: + case DivAssign: + case ModAssign: + case AddAssign: + case SubAssign: + case ShlAssign: + case ShrAssign: + case AndAssign: + case XorAssign: + case OrAssign: { + // This should be caught during type checking + assert(!dynamic_cast(type) && + !dynamic_cast(type)); + return lEmitOpAssign(op, lvalue, rvalue, type, baseSym, pos, ctx); + } + default: + FATAL("logic error in AssignExpr::GetValue()"); + return NULL; + } +} + + +Expr * +AssignExpr::Optimize() { + if (lvalue) + lvalue = lvalue->Optimize(); + if (rvalue) + rvalue = rvalue->Optimize(); + if (lvalue == NULL || rvalue == NULL) + return NULL; + + return this; +} + + +const Type * +AssignExpr::GetType() const { + return lvalue ? lvalue->GetType() : NULL; +} + + +Expr * +AssignExpr::TypeCheck() { + bool lvalueIsReference = lvalue && + dynamic_cast(lvalue->GetType()) != NULL; + bool rvalueIsReference = rvalue && + dynamic_cast(rvalue->GetType()) != NULL; + + // hack to allow asigning array references e.g. in a struct... 
+ if (lvalueIsReference && + !(rvalueIsReference && + dynamic_cast(rvalue->GetType()->GetReferenceTarget()))) + lvalue = new DereferenceExpr(lvalue, lvalue->pos); + + if (lvalue != NULL) + lvalue = lvalue->TypeCheck(); + if (rvalue != NULL) + rvalue = rvalue->TypeCheck(); + if (rvalue != NULL && lvalue != NULL) + rvalue = rvalue->TypeConv(lvalue->GetType(), "operator ="); + if (rvalue == NULL || lvalue == NULL) + return NULL; + + if (lvalue->GetType()->IsConstType()) { + Error(pos, "Can't assign to type \"%s\" on left-hand size of " + "expression.", lvalue->GetType()->GetString().c_str()); + return NULL; + } + + return this; +} + + +void +AssignExpr::Print() const { + if (!lvalue || !rvalue || !GetType()) + return; + + printf("[%s] assign (", GetType()->GetString().c_str()); + lvalue->Print(); + printf(" "); + if (op == Assign) printf("="); + if (op == MulAssign) printf("*="); + if (op == DivAssign) printf("/="); + if (op == ModAssign) printf("%%="); + if (op == AddAssign) printf("+="); + if (op == SubAssign) printf("-="); + if (op == ShlAssign) printf("<<="); + if (op == ShrAssign) printf(">>="); + if (op == AndAssign) printf("&="); + if (op == XorAssign) printf("^="); + if (op == OrAssign) printf("|="); + printf(" "); + rvalue->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// SelectExpr + +SelectExpr::SelectExpr(Expr *t, Expr *e1, Expr *e2, SourcePos p) + : Expr(p) { + test = t; + expr1 = e1; + expr2 = e2; +} + + +/** Emit code to select between two varying values based on a varying test + value. 
+ */ +static llvm::Value * +lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, + llvm::Value *expr1, llvm::Value *expr2, + const Type *type) { + llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp"); + // Don't need to worry about masking here + ctx->StoreInst(expr2, resultPtr); + // Use masking to conditionally store the expr1 values + ctx->StoreInst(expr1, resultPtr, test, type); + return ctx->LoadInst(resultPtr, type, "selectexpr_final"); +} + + +llvm::Value * +SelectExpr::GetValue(FunctionEmitContext *ctx) const { + if (!expr1 || !expr2 || !test) + return NULL; + + ctx->SetDebugPos(pos); + + const Type *testType = test->GetType()->GetAsNonConstType(); + // This should be taken care of during typechecking + assert(testType->GetBaseType() == AtomicType::UniformBool || + testType->GetBaseType() == AtomicType::VaryingBool); + + const Type *type = expr1->GetType(); + // Type checking should also make sure this is the case + assert(Type::Equal(type->GetAsNonConstType(), + expr2->GetType()->GetAsNonConstType())); + + if (testType == AtomicType::UniformBool) { + // Simple case of a single uniform bool test expression; we just + // want one of the two expressions. In this case, we can be + // careful to evaluate just the one of the expressions that we need + // the value of so that if the other one has side-effects or + // accesses invalid memory, it doesn't execute. + llvm::Value *testVal = test->GetValue(ctx); + llvm::BasicBlock *testTrue = ctx->CreateBasicBlock("select_true"); + llvm::BasicBlock *testFalse = ctx->CreateBasicBlock("select_false"); + llvm::BasicBlock *testDone = ctx->CreateBasicBlock("select_done"); + ctx->BranchInst(testTrue, testFalse, testVal); + + ctx->SetCurrentBasicBlock(testTrue); + llvm::Value *expr1Val = expr1->GetValue(ctx); + // Note that truePred won't be necessarily equal to testTrue, in + // case the expr1->GetValue() call changes the current basic block. 
+ llvm::BasicBlock *truePred = ctx->GetCurrentBasicBlock(); + ctx->BranchInst(testDone); + + ctx->SetCurrentBasicBlock(testFalse); + llvm::Value *expr2Val = expr2->GetValue(ctx); + // See comment above truePred for why we can't just assume we're in + // the testFalse basic block here. + llvm::BasicBlock *falsePred = ctx->GetCurrentBasicBlock(); + ctx->BranchInst(testDone); + + ctx->SetCurrentBasicBlock(testDone); + llvm::PHINode *ret = ctx->PhiNode(expr1Val->getType(), 2, "select"); + ret->addIncoming(expr1Val, truePred); + ret->addIncoming(expr2Val, falsePred); + return ret; + } + else if (dynamic_cast(testType) == NULL) { + // if the test is a varying bool type, then evaluate both of the + // value expressions with the mask set appropriately and then do an + // element-wise select to get the result + llvm::Value *testVal = test->GetValue(ctx); + assert(testVal->getType() == LLVMTypes::MaskType); + llvm::Value *oldMask = ctx->GetMask(); + ctx->MaskAnd(oldMask, testVal); + llvm::Value *expr1Val = expr1->GetValue(ctx); + ctx->MaskAndNot(oldMask, testVal); + llvm::Value *expr2Val = expr2->GetValue(ctx); + ctx->SetMask(oldMask); + + return lEmitVaryingSelect(ctx, testVal, expr1Val, expr2Val, type); + } + else { + // FIXME? Short-circuiting doesn't work in the case of + // vector-valued test expressions. (We could also just prohibit + // these and place the issue in the user's hands...) 
+ llvm::Value *testVal = test->GetValue(ctx); + llvm::Value *expr1Val = expr1->GetValue(ctx); + llvm::Value *expr2Val = expr2->GetValue(ctx); + + ctx->SetDebugPos(pos); + const VectorType *vt = dynamic_cast(type); + // Things that typechecking should have caught + assert(vt != NULL); + assert(dynamic_cast(testType) != NULL && + (dynamic_cast(testType)->GetElementCount() == + vt->GetElementCount())); + + // Do an element-wise select + llvm::Value *result = llvm::UndefValue::get(type->LLVMType(g->ctx)); + for (int i = 0; i < vt->GetElementCount(); ++i) { + llvm::Value *ti = ctx->ExtractInst(testVal, i, ""); + llvm::Value *e1i = ctx->ExtractInst(expr1Val, i, ""); + llvm::Value *e2i = ctx->ExtractInst(expr2Val, i, ""); + llvm::Value *sel = NULL; + if (testType->IsUniformType()) + sel = ctx->SelectInst(ti, e1i, e2i); + else + sel = lEmitVaryingSelect(ctx, ti, e1i, e2i, vt->GetElementType()); + result = ctx->InsertInst(result, sel, i, ""); + } + return result; + } +} + + +const Type * +SelectExpr::GetType() const { + if (!test || !expr1 || !expr2) + return NULL; + + const Type *testType = test->GetType(); + const Type *expr1Type = expr1->GetType(); + const Type *expr2Type = expr2->GetType(); + + if (!testType || !expr1Type || !expr2Type) + return NULL; + + bool becomesVarying = (testType->IsVaryingType() || expr1Type->IsVaryingType() || + expr2Type->IsVaryingType()); + // if expr1 and expr2 have different vector sizes, typechecking should fail... + int testVecSize = dynamic_cast(testType) != NULL ? + dynamic_cast(testType)->GetElementCount() : 0; + int expr1VecSize = dynamic_cast(expr1Type) != NULL ? + dynamic_cast(expr1Type)->GetElementCount() : 0; +//CO int expr2VecSize = dynamic_cast(expr2Type) != NULL ? +//CO dynamic_cast(expr2Type)->GetElementCount() : 0; +//CO assert(testVecSize == expr1VecSize && expr1VecSize == expr2VecSize); + // REMOVE? 
old test + assert(!(testVecSize != 0 && expr1VecSize != 0 && testVecSize != expr1VecSize)); + + int vectorSize = std::max(testVecSize, expr1VecSize); + return Type::MoreGeneralType(expr1Type, expr2Type, pos, "select expression", + becomesVarying, vectorSize); +} + + +Expr * +SelectExpr::Optimize() { + if (test) + test = test->Optimize(); + if (expr1) + expr1 = expr1->Optimize(); + if (expr2) + expr2 = expr2->Optimize(); + if (test == NULL || expr1 == NULL || expr2 == NULL) + return NULL; + + return this; +} + + +Expr * +SelectExpr::TypeCheck() { + if (test) + test = test->TypeCheck(); + if (expr1) + expr1 = expr1->TypeCheck(); + if (expr2) + expr2 = expr2->TypeCheck(); + + if (test == NULL || expr1 == NULL || expr2 == NULL) + return NULL; + + const Type *type1 = expr1->GetType(), *type2 = expr2->GetType(); + if (!type1 || !type2) + return NULL; + + if (dynamic_cast(type1)) { + Error(pos, "Array type \"%s\" can't be used in select expression", + type1->GetString().c_str()); + return NULL; + } + if (dynamic_cast(type2)) { + Error(pos, "Array type \"%s\" can't be used in select expression", + type2->GetString().c_str()); + return NULL; + } + + const Type *testType = test->GetType(); + if (testType == NULL) + return NULL; + test = test->TypeConv(lMatchingBoolType(testType), "select"); + if (testType == NULL) + return NULL; + testType = test->GetType(); + + int testVecSize = dynamic_cast(testType) ? + dynamic_cast(testType)->GetElementCount() : 0; + const Type *promotedType = Type::MoreGeneralType(type1, type2, pos, "select expression", + testType->IsVaryingType(), testVecSize); + if (promotedType == NULL) + return NULL; + + expr1 = expr1->TypeConv(promotedType, "select"); + expr2 = expr2->TypeConv(promotedType, "select"); + if (!expr1 || !expr2) + return NULL; + + return this; +} + + +void +SelectExpr::Print() const { + if (!test || !expr1 || !expr2 || !GetType()) + return; + + printf("[%s] (", GetType()->GetString().c_str()); + test->Print(); + printf(" ? 
"); + expr1->Print(); + printf(" : "); + expr2->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// FunctionCallExpr + +static void +lPrintFunctionOverloads(const std::vector &matches) { + for (unsigned int i = 0; i < matches.size(); ++i) { + const FunctionType *t = dynamic_cast(matches[i]->type); + assert(t != NULL); + fprintf(stderr, "\t%s\n", t->GetString().c_str()); + } +} + + +/** Helper function used for function overload resolution: returns true if + the call argument's type exactly matches the function argument type + (modulo a conversion to a const type if needed). + */ +static bool +lExactMatch(Expr *callArg, const Type *funcArgType) { + const Type *callType = callArg->GetType(); + if (dynamic_cast(callType) == NULL) + callType = callType->GetAsNonConstType(); + + return Type::Equal(callType, funcArgType); +} + +/** Helper function used for function overload resolution: returns true if + the call argument type and the function argument type match, modulo + conversion to a reference type if needed. + */ +static bool +lMatchIgnoringReferences(Expr *callArg, const Type *funcArgType) { + const Type *callType = callArg->GetType()->GetReferenceTarget(); + if (funcArgType->IsConstType()) + callType = callType->GetAsConstType(); + + return Type::Equal(callType, + funcArgType->GetReferenceTarget()); +} + + +/** Helper function used for function overload resolution: returns true if + the call argument type and the function argument type match if we only + do a uniform -> varying type conversion but otherwise have exactly the + same type. 
+ */ +static bool +lMatchIgnoringUniform(Expr *callArg, const Type *funcArgType) { + const Type *callType = callArg->GetType(); + if (dynamic_cast(callType) == NULL) + callType = callType->GetAsNonConstType(); + + if (Type::Equal(callType, funcArgType)) + return true; + + return (callType->IsUniformType() && + funcArgType->IsVaryingType() && + Type::Equal(callType->GetAsVaryingType(), funcArgType)); +} + + +/** Helper function used for function overload resolution: returns true if + we can type convert from the call argument type to the function + argument type, but without doing a uniform -> varying conversion. + */ +static bool +lMatchWithTypeConvSameVariability(Expr *callArg, const Type *funcArgType) { + Expr *te = callArg->TypeConv(funcArgType, + "function call argument", true); + return (te != NULL && + te->GetType()->IsUniformType() == callArg->GetType()->IsUniformType()); +} + + +/** Helper function used for function overload resolution: returns true if + there is any type conversino that gets us from the caller argument type + to the function argument type. + */ +static bool +lMatchWithTypeConv(Expr *callArg, const Type *funcArgType) { + Expr *te = callArg->TypeConv(funcArgType, + "function call argument", true); + return (te != NULL); +} + + +/** See if we can find a single function from the set of overload options + based on the predicate function passed in. Returns true if no more + tries should be made to find a match, either due to success from + finding a single overloaded function that matches or failure due to + finding multiple ambiguous matches. 
+ */ +bool +FunctionCallExpr::tryResolve(bool (*matchFunc)(Expr *, const Type *)) { + FunctionSymbolExpr *fse = dynamic_cast(func); + if (!fse) + // error will be issued later if not calling an actual function + return false; + + const char *funName = fse->candidateFunctions->front()->name.c_str(); + std::vector &callArgs = args->exprs; + + std::vector matches; + std::vector::iterator iter; + for (iter = fse->candidateFunctions->begin(); + iter != fse->candidateFunctions->end(); ++iter) { + // Loop over the set of candidate functions and try each one + Symbol *candidateFunction = *iter; + const FunctionType *ft = + dynamic_cast(candidateFunction->type); + assert(ft != NULL); + const std::vector &candArgTypes = ft->GetArgumentTypes(); + const std::vector &argumentDefaults = ft->GetArgumentDefaults(); + + // There's no way to match if the caller is passing more arguments + // than this function instance takes. + if (callArgs.size() > candArgTypes.size()) + continue; + + unsigned int i; + // Note that we're looping over the caller arguments, not the + // function arguments; it may be ok to have more arguments to the + // function than are passed, if the function has default argument + // values. This case is handled below. + for (i = 0; i < callArgs.size(); ++i) { + // This may happen if there's an error earlier in compilation. + // It's kind of a silly to redundantly discover this for each + // potential match versus detecting this earlier in the + // matching process and just giving up. + if (!callArgs[i] || !callArgs[i]->GetType() || !candArgTypes[i]) + return false; + + // See if this caller argument matches the type of the + // corresponding function argument according to the given + // predicate function. If not, break out and stop trying. + if (!matchFunc(callArgs[i], candArgTypes[i])) + break; + } + if (i == callArgs.size()) { + // All of the arguments matched! 
+ if (i == candArgTypes.size()) + // And we have exactly as many arguments as the function + // wants, so we're done. + matches.push_back(candidateFunction); + else if (i < candArgTypes.size() && argumentDefaults[i] != NULL) + // Otherwise we can still make it if there are default + // arguments for the rest of the arguments! Because in + // Module::AddFunction() we have verified that once the + // default arguments start, then all of the following ones + // have them as well. Therefore, we just need to check if + // the arg we stopped at has a default value and we're + // done. + matches.push_back(candidateFunction); + // otherwise, we don't have a match + } + } + + if (matches.size() == 0) + return false; + else if (matches.size() == 1) { + fse->matchingFunc = matches[0]; + + // fill in any function defaults required + const FunctionType *ft = + dynamic_cast(fse->matchingFunc->type); + assert(ft != NULL); + const std::vector &argumentDefaults = ft->GetArgumentDefaults(); + const std::vector &argTypes = ft->GetArgumentTypes(); + for (unsigned int i = callArgs.size(); i < argTypes.size(); ++i) { + assert(argumentDefaults[i] != NULL); + args->exprs.push_back(argumentDefaults[i]); + } + return true; + } + else { + Error(fse->pos, "Multiple overloaded instances of function \"%s\" matched.", + funName); + lPrintFunctionOverloads(matches); + // Stop trying to find more matches after failure + return true; + } +} + + +void +FunctionCallExpr::resolveFunctionOverloads() { + FunctionSymbolExpr *fse = dynamic_cast(func); + if (!fse) + // error will be issued later if not calling an actual function + return; + assert(args); + + // Try to find the best overload for the function... + + // Is there an exact match that doesn't require any argument type + // conversion at all? 
+ if (tryResolve(lExactMatch)) + return; + + // Try to find a single match ignoring references + if (tryResolve(lMatchIgnoringReferences)) + return; + + // TODO: next, try to find an exact match via type promotion--i.e. char + // -> int, etc--things that don't lose data + + // Next try to see if there's a match via just uniform -> varying + // promotions. TODO: look for one with a minimal number of them? + if (tryResolve(lMatchIgnoringUniform)) + return; + + // Try to find a match via type conversion, but don't change + // unif->varying + if (tryResolve(lMatchWithTypeConvSameVariability)) + return; + + // Last chance: try to find a match via arbitrary type conversion. + if (tryResolve(lMatchWithTypeConv)) + return; + + // failure :-( + const char *funName = fse->candidateFunctions->front()->name.c_str(); + Error(pos, "Unable to find matching overload for call to function \"%s\". " + "Candidates are:", funName); + lPrintFunctionOverloads(*fse->candidateFunctions); + fprintf(stderr, "Passed types: %s(", funName); + for (unsigned int i = 0; i < args->exprs.size(); ++i) { + const Type *t = args->exprs[i]->GetType(); + if (t) + fprintf(stderr, "%s%s", t->GetString().c_str(), + (i < args->exprs.size()-1) ? ", " : ")\n"); + else + fprintf(stderr, "(unknown type)%s", + (i < args->exprs.size()-1) ? ", " : ")\n"); + } +} + + +FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, bool il) + : Expr(p) { + func = f; + args = a; + isLaunch = il; + + resolveFunctionOverloads(); +} + + +/** Starting from the function initialFunction, we're calling into + calledFunc. The question is: is this a recursive call back to + initialFunc? If it definitely is or if it may be, then return true. + Return false if it definitely is not. 
+ */ +static bool +lMayBeRecursiveCall(llvm::Function *calledFunc, + llvm::Function *initialFunc, + std::set &seenFuncs) { + // Easy case: intrinsics aren't going to call functions themselves + if (calledFunc->isIntrinsic()) + return false; + + std::string name = calledFunc->getName(); + if (name.size() > 2 && name[0] == '_' && name[1] == '_') + // builtin stdlib function; none of these are recursive... + return false; + + if (calledFunc->isDeclaration()) + // There's visibility into what the called function does without a + // definition, so we have to be conservative + return true; + + if (calledFunc == initialFunc) + // hello recursive call + return true; + + // Otherwise iterate over all of the instructions in the function. If + // any of them is a function call then check recursively.. + llvm::inst_iterator iter; + for (iter = llvm::inst_begin(calledFunc); + iter != llvm::inst_end(calledFunc); ++iter) { + llvm::Instruction *inst = &*iter; + llvm::CallInst *ci = llvm::dyn_cast(inst); + if (ci != NULL) { + llvm::Function *nextCalledFunc = ci->getCalledFunction(); + // Don't repeatedly test functions we've seen before + if (seenFuncs.find(nextCalledFunc) == seenFuncs.end()) { + seenFuncs.insert(nextCalledFunc); + if (lMayBeRecursiveCall(nextCalledFunc, initialFunc, + seenFuncs)) + return true; + } + } + } + return false; +} + + +llvm::Value * +FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const { + if (!func || !args) + return NULL; + + ctx->SetDebugPos(pos); + + FunctionSymbolExpr *fse = dynamic_cast(func); + if (!fse) { + Error(pos, "Invalid function name for function call."); + return NULL; + } + + if (!fse->matchingFunc) + // no overload match was found, get out of here.. 
+ return NULL; + + Symbol *funSym = fse->matchingFunc; + llvm::Function *callee = funSym->function; + if (!callee) { + Error(pos, "Symbol \"%s\" is not a function.", funSym->name.c_str()); + return NULL; + } + + const FunctionType *ft = dynamic_cast(funSym->type); + assert(ft != NULL); + bool isVoidFunc = (ft->GetReturnType() == AtomicType::Void); + + // Automatically convert function call args to references if needed. + // FIXME: this should move to the TypeCheck() method... (but the + // GetLValue call below needs a FunctionEmitContext, which is + // problematic...) + std::vector callargs = args->exprs; + const std::vector &argTypes = ft->GetArgumentTypes(); + bool err = false; + for (unsigned int i = 0; i < callargs.size(); ++i) { + Expr *argExpr = callargs[i]; + if (!argExpr) + continue; + + // All arrays should already have been converted to reference types + assert(dynamic_cast(argTypes[i]) == NULL); + + if (dynamic_cast(argTypes[i])) { + if (!dynamic_cast(argExpr->GetType())) { + // The function wants a reference type but the argument + // being passed isn't already a reference. + if (argExpr->GetLValue(ctx) == NULL) { + // If it doesn't have an lvalue, then we can't make it + // a reference, so issue an error. + // FIXME: for const reference parameters, we could + // store the expr's value to alloca'ed memory and then + // pass a reference to that... + Error(pos, "Can't pass non-lvalue as \"reference\" parameter \"%s\" " + "to function \"%s\".", ft->GetArgumentName(i).c_str(), + funSym->name.c_str()); + err = true; + } + else + argExpr = new ReferenceExpr(argExpr, argExpr->pos); + } + } + + // Do whatever type conversion is needed + argExpr = argExpr->TypeConv(argTypes[i], "function call argument"); + // The function overload resolution code should have ensured that + // we can successfully do any type conversions needed here. 
+ assert(argExpr != NULL); + callargs[i] = argExpr; + } + if (err) + return NULL; + + // Now evaluate the values of all of the parameters being passed. We + // need to evaluate these first here, since their GetValue() calls may + // change the current basic block (e.g. if one of these is itself a + // function call expr...); we need to basic blocks to stay consistent + // below when we emit the code that does the actual funciton call. + std::vector argVals; + std::vector storedArgValPtrs, argValLValues; + for (unsigned int i = 0; i < callargs.size(); ++i) { + Expr *argExpr = callargs[i]; + if (!argExpr) + // give up; we hit an error earlier + return NULL; + + llvm::Value *argValue = argExpr->GetValue(ctx); + if (!argValue) + // something went wrong in evaluating the argument's + // expression, so give up on this + return NULL; + + if (dynamic_cast(argTypes[i]) && + !llvm::isa(argValue->getType())) { + assert(llvm::isa(argValue->getType())); + // if the parameter is a reference and the lvalue needs a + // gather to pull it together, then do the gather here and + // store the result to local memory, so that we can pass the + // single pointer to the local memory that is needed for the + // reference. Below, we'll copy the result back to the varying + // lvalue pointer we have here. 
(== pass by value/result) + const ReferenceType *rt = + dynamic_cast(argExpr->GetType()); + assert(rt != NULL); + const Type *type = rt->GetReferenceTarget(); + + llvm::Value *ptr = ctx->AllocaInst(type->LLVMType(g->ctx), "arg"); + llvm::Value *val = ctx->LoadInst(argValue, type); + ctx->StoreInst(val, ptr); + storedArgValPtrs.push_back(ptr); + argValLValues.push_back(argValue); + argVals.push_back(ptr); + } + else { + argVals.push_back(argValue); + storedArgValPtrs.push_back(NULL); + argValLValues.push_back(NULL); + } + } + + // We sometimes need to check to see if the mask is all off here; + // specifically, if the mask is all off and we call a recursive + // function, then we will probably have an unsesirable infinite loop. + ctx->SetDebugPos(pos); + llvm::BasicBlock *bDoCall = ctx->CreateBasicBlock("funcall_mask_ok"); + llvm::BasicBlock *bSkip = ctx->CreateBasicBlock("funcall_mask_off"); + llvm::BasicBlock *bAfter = ctx->CreateBasicBlock("after_funcall"); + llvm::Function *currentFunc = ctx->GetCurrentBasicBlock()->getParent(); + + // If we need to check the mask (it may be a recursive call, possibly + // transitively), or we're launching a task, which is expensive and + // thus probably always worth checking, then use the mask to choose + // whether to go to the bDoCallBlock or the bSkip block + std::set seenFuncs; + seenFuncs.insert(currentFunc); + if (ft->isTask || lMayBeRecursiveCall(callee, currentFunc, seenFuncs)) { + Debug(pos, "Checking mask before function call \"%s\".", funSym->name.c_str()); + ctx->BranchIfMaskAny(bDoCall, bSkip); + } + else + // If we don't need to check the mask, then always to the call; + // just jump to bDoCall + ctx->BranchInst(bDoCall); + + // And the bSkip block just jumps immediately to bAfter. So why do we + // need it? 
So the phi node below can easily tell what paths are + // going into it + ctx->SetCurrentBasicBlock(bSkip); + ctx->BranchInst(bAfter); + + // Emit the code to do the function call + ctx->SetCurrentBasicBlock(bDoCall); + + llvm::Value *retVal = NULL; + ctx->SetDebugPos(pos); + if (ft->isTask) + ctx->LaunchInst(callee, argVals); + else { + // Most of the time, the mask is passed as the last argument. this + // isn't the case for things like SSE intrinsics and extern "C" + // functions from the application. + assert(callargs.size() + 1 == callee->arg_size() || + callargs.size() == callee->arg_size()); + + if (callargs.size() + 1 == callee->arg_size()) + argVals.push_back(ctx->GetMask()); + + retVal = ctx->CallInst(callee, argVals, isVoidFunc ? "" : "calltmp"); + } + + // For anything we had to do as pass by value/result, copy the + // corresponding reference values back out + for (unsigned int i = 0; i < storedArgValPtrs.size(); ++i) { + llvm::Value *ptr = storedArgValPtrs[i]; + if (ptr != NULL) { + const ReferenceType *rt = + dynamic_cast(callargs[i]->GetType()); + assert(rt != NULL); + llvm::Value *load = ctx->LoadInst(ptr, rt->GetReferenceTarget(), + "load_ref"); + // FIXME: apply the "don't do blending" optimization here if + // appropriate? + ctx->StoreInst(load, argValLValues[i], ctx->GetMask(), + rt->GetReferenceTarget()); + } + } + + // And jump out to the 'after funciton call' basic block + ctx->BranchInst(bAfter); + ctx->SetCurrentBasicBlock(bAfter); + + if (isVoidFunc) + return NULL; + + // The return value for the non-void case is either undefined or the + // function return value, depending on whether we actually ran the code + // path that called the function or not. 
+ const llvm::Type *lrType = ft->GetReturnType()->LLVMType(g->ctx); + llvm::PHINode *ret = ctx->PhiNode(lrType, 2, "fun_ret"); + assert(retVal != NULL); + ret->addIncoming(llvm::UndefValue::get(lrType), bSkip); + ret->addIncoming(retVal, bDoCall); + return ret; +} + + +const Type * +FunctionCallExpr::GetType() const { + FunctionSymbolExpr *fse = dynamic_cast(func); + if (fse && fse->matchingFunc) { + const FunctionType *ft = + dynamic_cast(fse->matchingFunc->type); + assert(ft != NULL); + return ft->GetReturnType(); + } + else + return NULL; +} + + +Expr * +FunctionCallExpr::Optimize() { + if (func) + func = func->Optimize(); + if (args) + args = args->Optimize(); + if (!func || !args) + return NULL; + + return this; +} + + +Expr * +FunctionCallExpr::TypeCheck() { + if (func) { + func = func->TypeCheck(); + if (func != NULL) { + const FunctionType *ft = dynamic_cast(func->GetType()); + if (ft != NULL) { + if (ft->isTask) { + if (!isLaunch) + Error(pos, "\"launch\" expression needed to call function " + "with \"task\" qualifier."); + } + else if (isLaunch) + Error(pos, "\"launch\" expression illegal with non-\"task\"-" + "qualified function."); + } + else + Error(pos, "Valid function name must be used for function call."); + } + } + + if (args) + args = args->TypeCheck(); + + if (!func || !args) + return NULL; + return this; +} + + +void +FunctionCallExpr::Print() const { + if (!func || !args || !GetType()) + return; + + printf("[%s] funcall %s ", GetType()->GetString().c_str(), + isLaunch ? 
"launch" : ""); + func->Print(); + printf(" args ("); + args->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// ExprList + +llvm::Value * +ExprList::GetValue(FunctionEmitContext *ctx) const { + FATAL("ExprList::GetValue() should never be called"); + return NULL; +} + + +const Type * +ExprList::GetType() const { + FATAL("ExprList::GetType() should never be called"); + return NULL; +} + + +ExprList * +ExprList::Optimize() { + for (unsigned int i = 0; i < exprs.size(); ++i) + if (exprs[i]) + exprs[i] = exprs[i]->Optimize(); + return this; +} + + +ExprList * +ExprList::TypeCheck() { + for (unsigned int i = 0; i < exprs.size(); ++i) + if (exprs[i]) + exprs[i] = exprs[i]->TypeCheck(); + return this; +} + + +llvm::Constant * +ExprList::GetConstant(const Type *type) const { + const StructType *structType = dynamic_cast(type); + const SequentialType *sequentialType = + dynamic_cast(type); + + if (structType != NULL) { + // We can potentially return an llvm::ConstantStruct if we have the + // same number of elements in the ExprList as the struct has + // members (and the various elements line up with the shape of the + // corresponding struct elements). 
+ if ((int)exprs.size() != structType->NumElements()) { + Error(pos, "Initializer list for struct \"%s\" must have %d " + "elements (has %d).", structType->GetString().c_str(), + (int)exprs.size(), structType->NumElements()); + return NULL; + } + + std::vector cv; + for (unsigned int i = 0; i < exprs.size(); ++i) { + if (exprs[i] == NULL) + return NULL; + const Type *elementType = structType->GetMemberType(i); + llvm::Constant *c = exprs[i]->GetConstant(elementType); + if (c == NULL) + // If this list element couldn't convert to the right + // constant type for the corresponding struct member, then + // give up + return NULL; + cv.push_back(c); + } + +#if defined(LLVM_2_8) || defined(LLVM_2_9) + return llvm::ConstantStruct::get(*g->ctx, cv, false); +#else + const llvm::StructType *llvmStructType = + llvm::dyn_cast(structType->LLVMType(g->ctx)); + assert(llvmStructType != NULL); + return llvm::ConstantStruct::get(llvmStructType, cv); +#endif + } + else if (sequentialType) { + // Similarly, if we have an array or vector type, we may be able to + // return the corresponding llvm constant value. + if ((int)exprs.size() != sequentialType->GetElementCount()) { + bool isArray = (dynamic_cast(type) != NULL); + Error(pos, "Initializer list for %s \"%s\" must have %d elements (has %d).", + isArray ? "array" : "vector", sequentialType->GetString().c_str(), + (int)exprs.size(), sequentialType->GetElementCount()); + return NULL; + } + + std::vector cv; + for (unsigned int i = 0; i < exprs.size(); ++i) { + if (exprs[i] == NULL) + return NULL; + const Type *elementType = sequentialType->GetElementType(); + llvm::Constant *c = exprs[i]->GetConstant(elementType); + if (c == NULL) + return NULL; + cv.push_back(c); + } + + const llvm::Type *lt = type->LLVMType(g->ctx); + const llvm::ArrayType *lat = llvm::dyn_cast(lt); + // FIXME: should the assert below validly fail for uniform vectors + // now? 
+ assert(lat != NULL); + return llvm::ConstantArray::get(lat, cv); + } + return NULL; +} + + +void +ExprList::Print() const { + printf("expr list ("); + for (unsigned int i = 0; i < exprs.size(); ++i) { + if (exprs[i] != NULL) + exprs[i]->Print(); + printf("%s", (i == exprs.size() - 1) ? ")" : ", "); + } + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// IndexExpr + +IndexExpr::IndexExpr(Expr *a, Expr *i, SourcePos p) + : Expr(p) { + arrayOrVector = a; + index = i; +} + + +// FIXME: This is an ugly hack--if we're indexing into a uniform ispc +// VectorType, then this bitcasts the corresponding llvm::VectorType value +// to be a pointer to the vector's element type, so that a GEP to index +// from the pointer indices elements of the llvm::VectorType and doesn't +// incorrectly try to index into an array of llvm::VectorType instances. + +static llvm::Value * +lCastUniformVectorBasePtr(llvm::Value *ptr, FunctionEmitContext *ctx) { + const llvm::PointerType *baseType = + llvm::dyn_cast(ptr->getType()); + if (!baseType) + return ptr; + + const llvm::VectorType *baseEltVecType = + llvm::dyn_cast(baseType->getElementType()); + if (!baseEltVecType) + return ptr; + + const llvm::Type *vecEltType = baseEltVecType->getElementType(); + int numElts = baseEltVecType->getNumElements(); + const llvm::Type *castType = + llvm::PointerType::get(llvm::ArrayType::get(vecEltType, numElts), 0); + return ctx->BitCastInst(ptr, castType); +} + + +llvm::Value * +IndexExpr::GetValue(FunctionEmitContext *ctx) const { + const Type *arrayOrVectorType; + if (arrayOrVector == NULL || index == NULL || + ((arrayOrVectorType = arrayOrVector->GetType()) == NULL)) + return NULL; + + ctx->SetDebugPos(pos); + llvm::Value *lvalue = GetLValue(ctx); + if (!lvalue) { + // We may be indexing into a temporary that hasn't hit memory, so + // get the full value and stuff it into temporary alloca'd space so + // that we can index from there... 
+ llvm::Value *val = arrayOrVector->GetValue(ctx); + if (val == NULL) { + assert(m->errorCount > 0); + return NULL; + } + ctx->SetDebugPos(pos); + llvm::Value *ptr = ctx->AllocaInst(arrayOrVectorType->LLVMType(g->ctx), + "array_tmp"); + ctx->StoreInst(val, ptr); + ptr = lCastUniformVectorBasePtr(ptr, ctx); + lvalue = ctx->GetElementPtrInst(ptr, LLVMInt32(0), index->GetValue(ctx)); + } + + ctx->SetDebugPos(pos); + return ctx->LoadInst(lvalue, GetType(), "index"); +} + + +const Type * +IndexExpr::GetType() const { + const Type *arrayOrVectorType, *indexType; + if (!arrayOrVector || !index || + ((arrayOrVectorType = arrayOrVector->GetType()) == NULL) || + ((indexType = index->GetType()) == NULL)) + return NULL; + + const SequentialType *sequentialType = + dynamic_cast(arrayOrVectorType->GetReferenceTarget()); + // Typechecking should have caught this... + assert(sequentialType != NULL); + + const Type *elementType = sequentialType->GetElementType(); + if (indexType->IsUniformType()) + // If the index is uniform, the resulting type is just whatever the + // element type is + return elementType; + else + // A varying index into uniform array/vector -> varying type (and + // same for varying array of course...) + return elementType->GetAsVaryingType(); +} + + +Symbol * +IndexExpr::GetBaseSymbol() const { + return arrayOrVector ? 
arrayOrVector->GetBaseSymbol() : NULL; +} + + +llvm::Value * +IndexExpr::GetLValue(FunctionEmitContext *ctx) const { + const Type *type; + if (!arrayOrVector || !index || ((type = arrayOrVector->GetType()) == NULL)) + return NULL; + + ctx->SetDebugPos(pos); + llvm::Value *basePtr = NULL; + if (dynamic_cast(type) || + dynamic_cast(type)) + basePtr = arrayOrVector->GetLValue(ctx); + else { + type = type->GetReferenceTarget(); + assert(dynamic_cast(type) || + dynamic_cast(type)); + basePtr = arrayOrVector->GetValue(ctx); + } + if (!basePtr) + return NULL; + + basePtr = lCastUniformVectorBasePtr(basePtr, ctx); + + ctx->SetDebugPos(pos); + return ctx->GetElementPtrInst(basePtr, LLVMInt32(0), index->GetValue(ctx)); +} + + +Expr * +IndexExpr::Optimize() { + if (arrayOrVector) + arrayOrVector = arrayOrVector->Optimize(); + if (index) + index = index->Optimize(); + if (arrayOrVector == NULL || index == NULL) + return NULL; + + return this; +} + + +Expr * +IndexExpr::TypeCheck() { + if (arrayOrVector) + arrayOrVector = arrayOrVector->TypeCheck(); + if (index) + index = index->TypeCheck(); + + if (!arrayOrVector || !index || !index->GetType()) + return NULL; + + const Type *arrayOrVectorType = arrayOrVector->GetType(); + if (!arrayOrVectorType) + return NULL; + + if (dynamic_cast(arrayOrVectorType->GetReferenceTarget()) == NULL) { + Error(pos, "Trying to index into non-array or vector type \"%s\".", + arrayOrVectorType->GetString().c_str()); + return NULL; + } + + bool isUniform = (index->GetType()->IsUniformType() && + !g->opt.disableUniformMemoryOptimizations); + const Type *indexType = isUniform ? 
AtomicType::UniformInt32 : + AtomicType::VaryingInt32; + index = index->TypeConv(indexType, "array index"); + if (!index) + return NULL; + + return this; +} + + +void +IndexExpr::Print() const { + if (!arrayOrVector || !index || !GetType()) + return; + + printf("[%s] index ", GetType()->GetString().c_str()); + arrayOrVector->Print(); + printf("["); + index->Print(); + printf("]"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// MemberExpr + +MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos) + : Expr(p), identifierPos(idpos) { + expr = e; + identifier = id; +} + + +llvm::Value * +MemberExpr::GetValue(FunctionEmitContext *ctx) const { + if (!expr) + return NULL; + + llvm::Value *lvalue = GetLValue(ctx); + if (!lvalue) { + // As in the array case, this may be a temporary that hasn't hit + // memory; get the full value and stuff it into a temporary array + // so that we can index from there... + llvm::Value *val = expr->GetValue(ctx); + if (!val) { + assert(m->errorCount > 0); + return NULL; + } + ctx->SetDebugPos(pos); + const Type *exprType = expr->GetType(); + llvm::Value *ptr = ctx->AllocaInst(exprType->LLVMType(g->ctx), + "struct_tmp"); + ctx->StoreInst(val, ptr); + + int elementNumber = getElementNumber(); + if (elementNumber == -1) + return NULL; + lvalue = ctx->GetElementPtrInst(ptr, 0, elementNumber); + } + + ctx->SetDebugPos(pos); + return ctx->LoadInst(lvalue, GetType(), "structelement"); +} + + +const Type * +MemberExpr::GetType() const { + if (!expr) + return NULL; + + const Type *exprType = expr->GetType(); + if (!exprType) + return NULL; + + const StructType *structType = dynamic_cast(exprType); + const VectorType *vectorType = dynamic_cast(exprType); + if (!structType && !vectorType) { + const ReferenceType *referenceType = + dynamic_cast(exprType); + const Type *refTarget = (referenceType == NULL) ? 
NULL : + referenceType->GetReferenceTarget(); + if ((structType = dynamic_cast(refTarget)) == NULL && + (vectorType = dynamic_cast(refTarget)) == NULL) { + Error(pos, "Can't access member of non-struct/vector type \"%s\".", + exprType->GetString().c_str()); + return NULL; + } + } + + if (vectorType != NULL) + // only one-element vector selection is supported for now (i.e. no + // swizzling "foo.xxy"), so the result type is always just the + // element type. + return vectorType->GetElementType(); + else { + // Otherwise it's a struct, and the result type is the element + // type, possibly promoted to varying if the struct type / lvalue + // is varying. + const Type *elementType = structType->GetMemberType(identifier); + if (!elementType) + Error(identifierPos, "Element name \"%s\" not present in struct type \"%s\".%s", + identifier.c_str(), structType->GetString().c_str(), + getCandidateNearMatches().c_str()); + + if (exprType->IsVaryingType()) + return elementType->GetAsVaryingType(); + else + return elementType; + } +} + + +Symbol * +MemberExpr::GetBaseSymbol() const { + return expr ? expr->GetBaseSymbol() : NULL; +} + + +/** Map one character ids to vector element numbers. Allow a few different + conventions--xyzw, rgba, uv. + */ +static int +lIdentifierToVectorElement(char id) { + switch (id) { + case 'x': + case 'r': + case 'u': + return 0; + case 'y': + case 'g': + case 'v': + return 1; + case 'z': + case 'b': + return 2; + case 'w': + case 'a': + return 3; + default: + return -1; + } +} + + +int +MemberExpr::getElementNumber() const { + const Type *exprType; + if (!expr || ((exprType = expr->GetType()) == NULL)) + return -1; + + const StructType *structType = dynamic_cast(exprType); + const VectorType *vectorType = dynamic_cast(exprType); + if (!structType && !vectorType) { + const ReferenceType *referenceType = + dynamic_cast(exprType); + const Type *refTarget = (referenceType == NULL) ? 
NULL : + referenceType->GetReferenceTarget() ; + if ((structType = dynamic_cast(refTarget)) == NULL && + (vectorType = dynamic_cast(refTarget)) == NULL) + // FIXME: I think we shouldn't ever get here and that + // typechecking should have caught this case + return -1; + } + + int elementNumber = -1; + if (vectorType) { + if (identifier.size() != 1) { + Error(pos, "Only single-character vector element accessors are currently " + "supported--\"%s\" is invalid. Sorry.", identifier.c_str()); + } + else { + elementNumber = lIdentifierToVectorElement(identifier[0]); + if (elementNumber == -1) + Error(pos, "Vector element identifier \"%s\" unknown.", + identifier.c_str()); + } + } + else { + elementNumber = structType->GetMemberNumber(identifier); + if (elementNumber == -1) + Error(identifierPos, "Element name \"%s\" not present in struct type \"%s\".%s", + identifier.c_str(), structType->GetString().c_str(), + getCandidateNearMatches().c_str()); + } + return elementNumber; +} + + + +llvm::Value * +MemberExpr::GetLValue(FunctionEmitContext *ctx) const { + const Type *exprType; + if (!expr || ((exprType = expr->GetType()) == NULL)) + return NULL; + + ctx->SetDebugPos(pos); + const StructType *structType = dynamic_cast(exprType); + const VectorType *vectorType = dynamic_cast(exprType); + llvm::Value *basePtr = NULL; + if (structType || vectorType) + basePtr = expr->GetLValue(ctx); + else { + const ReferenceType *referenceType = dynamic_cast(exprType); + // FIXME: store structType and vectorType as members, or do all + // this in a separate function? This code to figure out + // struct/vectorType is replicated a bunch of times in + // MemberExpr... + const Type *refTarget = (referenceType == NULL) ? 
NULL : + referenceType->GetReferenceTarget() ; + if ((structType = dynamic_cast(refTarget)) == NULL && + (vectorType = dynamic_cast(refTarget)) == NULL) { + // FIXME: again I think typechecking should have caught this + Error(pos, "Can't access member of non-struct/vector type \"%s\".", + exprType->GetString().c_str()); + return NULL; + } + basePtr = expr->GetValue(ctx); + } + if (!basePtr) + return NULL; + + int elementNumber = getElementNumber(); + if (elementNumber == -1) + return NULL; + + ctx->SetDebugPos(pos); + return ctx->GetElementPtrInst(basePtr, 0, elementNumber); +} + + +Expr * +MemberExpr::TypeCheck() { + if (expr) + expr = expr->TypeCheck(); + return expr ? this : NULL; +} + + +Expr * +MemberExpr::Optimize() { + if (expr) + expr = expr->Optimize(); + return expr ? this : NULL; +} + + +void +MemberExpr::Print() const { + if (!expr || !GetType()) + return; + + printf("[%s] member (", GetType()->GetString().c_str()); + expr->Print(); + printf(" . %s)", identifier.c_str()); + pos.Print(); +} + + +/** There is no structure member with the name we've got in "identifier". + Use the approximate string matching routine to see if the identifier is + a minor misspelling of one of the ones that is there. 
+ */ +std::string +MemberExpr::getCandidateNearMatches() const { + const StructType *structType = + dynamic_cast(expr->GetType()); + if (!structType) + return ""; + + std::vector elementNames; + for (int i = 0; i < structType->NumElements(); ++i) + elementNames.push_back(structType->GetElementName(i)); + std::vector alternates = MatchStrings(identifier, elementNames); + if (!alternates.size()) + return ""; + + std::string ret = " Did you mean "; + for (unsigned int i = 0; i < alternates.size(); ++i) { + ret += std::string("\"") + alternates[i] + std::string("\""); + if (i < alternates.size() - 1) ret += ", or "; + } + ret += "?"; + return ret; +} + + +/////////////////////////////////////////////////////////////////////////// +// ConstExpr + +ConstExpr::ConstExpr(const Type *t, int32_t i, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstInt32); + int32Val[0] = i; +} + + +ConstExpr::ConstExpr(const Type *t, int32_t *i, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstInt32 || + type == AtomicType::VaryingConstInt32); + for (int j = 0; j < Count(); ++j) + int32Val[j] = i[j]; +} + + +ConstExpr::ConstExpr(const Type *t, uint32_t u, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstUInt32); + uint32Val[0] = u; +} + + +ConstExpr::ConstExpr(const Type *t, uint32_t *u, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstUInt32 || + type == AtomicType::VaryingConstUInt32); + for (int j = 0; j < Count(); ++j) + uint32Val[j] = u[j]; +} + + +ConstExpr::ConstExpr(const Type *t, float f, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = 
type->GetAsConstType(); + assert(type == AtomicType::UniformConstFloat); + floatVal[0] = f; +} + + +ConstExpr::ConstExpr(const Type *t, float *f, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstFloat || + type == AtomicType::VaryingConstFloat); + for (int j = 0; j < Count(); ++j) + floatVal[j] = f[j]; +} + + +ConstExpr::ConstExpr(const Type *t, int64_t i, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstInt64); + int64Val[0] = i; +} + + +ConstExpr::ConstExpr(const Type *t, int64_t *i, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstInt64 || + type == AtomicType::VaryingConstInt64); + for (int j = 0; j < Count(); ++j) + int64Val[j] = i[j]; +} + + +ConstExpr::ConstExpr(const Type *t, uint64_t u, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformUInt64); + uint64Val[0] = u; +} + + +ConstExpr::ConstExpr(const Type *t, uint64_t *u, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstUInt64 || + type == AtomicType::VaryingConstUInt64); + for (int j = 0; j < Count(); ++j) + uint64Val[j] = u[j]; +} + + +ConstExpr::ConstExpr(const Type *t, double f, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstDouble); + doubleVal[0] = f; +} + + +ConstExpr::ConstExpr(const Type *t, double *f, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstDouble || + type == AtomicType::VaryingConstDouble); 
+ for (int j = 0; j < Count(); ++j) + doubleVal[j] = f[j]; +} + + +ConstExpr::ConstExpr(const Type *t, bool b, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstBool); + boolVal[0] = b; +} + + +ConstExpr::ConstExpr(const Type *t, bool *b, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstBool || + type == AtomicType::VaryingConstBool); + for (int j = 0; j < Count(); ++j) + boolVal[j] = b[j]; +} + + +ConstExpr::ConstExpr(ConstExpr *old, double *v) + : Expr(old->pos) { + type = old->type; + switch (type->basicType) { + case AtomicType::TYPE_BOOL: + for (int i = 0; i < Count(); ++i) + boolVal[i] = (v[i] != 0.); + break; + case AtomicType::TYPE_INT32: + for (int i = 0; i < Count(); ++i) + int32Val[i] = (int)v[i]; + break; + case AtomicType::TYPE_UINT32: + for (int i = 0; i < Count(); ++i) + uint32Val[i] = (unsigned int)v[i]; + break; + case AtomicType::TYPE_FLOAT: + for (int i = 0; i < Count(); ++i) + floatVal[i] = (float)v[i]; + break; + case AtomicType::TYPE_DOUBLE: + for (int i = 0; i < Count(); ++i) + doubleVal[i] = v[i]; + break; + case AtomicType::TYPE_INT64: + case AtomicType::TYPE_UINT64: + FATAL("fixme; we need another constructor so that we're not trying to pass " + "double values to init an int64 type..."); + default: + FATAL("unimplemented const type"); + } +} + + +const Type * +ConstExpr::GetType() const { + return type; +} + + +llvm::Value * +ConstExpr::GetValue(FunctionEmitContext *ctx) const { + ctx->SetDebugPos(pos); + bool isVarying = type->IsVaryingType(); + + // ConstExpr only represents atomic types; just dispatch out to the + // appropriate utility routine to get the llvm constant value of the + // type we need. + switch (type->basicType) { + case AtomicType::TYPE_BOOL: + if (isVarying) + return LLVMBoolVector(boolVal); + else + return boolVal[0] ? 
LLVMTrue : LLVMFalse; + case AtomicType::TYPE_INT32: + return isVarying ? LLVMInt32Vector(int32Val) : + LLVMInt32(int32Val[0]); + case AtomicType::TYPE_UINT32: + return isVarying ? LLVMUInt32Vector(uint32Val) : + LLVMUInt32(uint32Val[0]); + case AtomicType::TYPE_FLOAT: + return isVarying ? LLVMFloatVector(floatVal) : + LLVMFloat(floatVal[0]); + case AtomicType::TYPE_INT64: + return isVarying ? LLVMInt64Vector(int64Val) : + LLVMInt64(int64Val[0]); + case AtomicType::TYPE_UINT64: + return isVarying ? LLVMUInt64Vector(uint64Val) : + LLVMUInt64(uint64Val[0]); + case AtomicType::TYPE_DOUBLE: + return isVarying ? LLVMDoubleVector(doubleVal) : + LLVMDouble(doubleVal[0]); + default: + FATAL("unimplemented const type"); + return NULL; + } +} + + +/* Type conversion templates: take advantage of C++ function overloading + rules to get the one we want to match. */ + +/* First the most general case, just use C++ type conversion if nothing + else matches */ +template static inline void +lConvertElement(From from, To *to) { + *to = (To)from; +} + + +/** When converting from bool types to numeric types, make sure the result + is one or zero. + FIXME: this is a different rule than we use elsewhere, where we sign extend + the bool. We should fix the other case to just zero extend and then + patch up places in the stdlib that depend on sign extension to call a + routine to make that happen. + */ +template static inline void +lConvertElement(bool from, To *to) { + *to = from ? (To)1 : (To)0; +} + + +/** When converting numeric types to bool, compare to zero. (Do we + actually need this one??) 
*/ +template static inline void +lConvertElement(From from, bool *to) { + *to = (from != 0); +} + + +/** And bool -> bool is just assignment */ +static inline void +lConvertElement(bool from, bool *to) { + *to = from; +} + + +/** Type conversion utility function + */ +template static void +lConvert(const From *from, To *to, int count, bool forceVarying) { + for (int i = 0; i < count; ++i) + lConvertElement(from[i], &to[i]); + + if (forceVarying && count == 1) + for (int i = 1; i < g->target.vectorWidth; ++i) + to[i] = to[0]; +} + + +int +ConstExpr::AsInt64(int64_t *ip, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, ip, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsUInt64(uint64_t *up, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, up, Count(), forceVarying); 
break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsDouble(double *d, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, d, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, d, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, d, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, d, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, d, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, d, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, d, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsFloat(float *fp, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, fp, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, fp, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, fp, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, fp, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, fp, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, fp, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, fp, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsBool(bool *b, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, b, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, b, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, b, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: 
lConvert(floatVal, b, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, b, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, b, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, b, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsInt32(int32_t *ip, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, ip, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsUInt32(uint32_t *up, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, up, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::Count() const { + return GetType()->IsVaryingType() ? 
        g->target.vectorWidth : 1;
}


/** Return an llvm::Constant holding this ConstExpr's value(s), converted
    to the given (atomic) target type. */
llvm::Constant *
ConstExpr::GetConstant(const Type *type) const {
    // Caller shouldn't be trying to stuff a varying value here into a
    // constant type.
    if (type->IsUniformType())
        assert(Count() == 1);

    // Const-ness is irrelevant for the llvm constant value; compare
    // against the non-const variants of the atomic types below.
    type = type->GetAsNonConstType();
    if (type == AtomicType::UniformBool || type == AtomicType::VaryingBool) {
        bool bv[ISPC_MAX_NVEC];
        AsBool(bv, type->IsVaryingType());
        if (type->IsUniformType())
            return bv[0] ? LLVMTrue : LLVMFalse;
        else
            return LLVMBoolVector(bv);
    }
    else if (type == AtomicType::UniformInt32 || type == AtomicType::VaryingInt32) {
        int32_t iv[ISPC_MAX_NVEC];
        AsInt32(iv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMInt32(iv[0]);
        else
            return LLVMInt32Vector(iv);
    }
    else if (type == AtomicType::UniformUInt32 || type == AtomicType::VaryingUInt32) {
        uint32_t uiv[ISPC_MAX_NVEC];
        AsUInt32(uiv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMUInt32(uiv[0]);
        else
            return LLVMUInt32Vector(uiv);
    }
    else if (type == AtomicType::UniformFloat || type == AtomicType::VaryingFloat) {
        float fv[ISPC_MAX_NVEC];
        AsFloat(fv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMFloat(fv[0]);
        else
            return LLVMFloatVector(fv);
    }
    else if (type == AtomicType::UniformInt64 || type == AtomicType::VaryingInt64) {
        int64_t iv[ISPC_MAX_NVEC];
        AsInt64(iv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMInt64(iv[0]);
        else
            return LLVMInt64Vector(iv);
    }
    else if (type == AtomicType::UniformUInt64 || type == AtomicType::VaryingUInt64) {
        uint64_t uiv[ISPC_MAX_NVEC];
        AsUInt64(uiv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMUInt64(uiv[0]);
        else
            return LLVMUInt64Vector(uiv);
    }
    else if (type == AtomicType::UniformDouble || type == AtomicType::VaryingDouble) {
        double dv[ISPC_MAX_NVEC];
        AsDouble(dv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMDouble(dv[0]);
        else
            return LLVMDoubleVector(dv);
    }
    else {
        FATAL("unexpected type in ConstExpr::GetConstant()");
        return NULL;
    }
}


// A constant is already fully optimized.
Expr *
ConstExpr::Optimize() {
    return this;
}


// A constant always typechecks successfully as-is.
Expr *
ConstExpr::TypeCheck() {
    return this;
}



/** Print the constant's type and per-lane values for debugging. */
void
ConstExpr::Print() const {
    printf("[%s] (", GetType()->GetString().c_str());
    for (int i = 0; i < Count(); ++i) {
        switch (type->basicType) {
        case AtomicType::TYPE_BOOL:
            printf("%s", boolVal[i] ? "true" : "false");
            break;
        case AtomicType::TYPE_INT32:
            printf("%d", int32Val[i]);
            break;
        case AtomicType::TYPE_UINT32:
            printf("%u", uint32Val[i]);
            break;
        case AtomicType::TYPE_FLOAT:
            printf("%f", floatVal[i]);
            break;
        case AtomicType::TYPE_INT64:
            // 64-bit format specifiers differ between platforms.
#ifdef ISPC_IS_LINUX
            printf("%ld", int64Val[i]);
#else
            printf("%lld", int64Val[i]);
#endif
            break;
        case AtomicType::TYPE_UINT64:
#ifdef ISPC_IS_LINUX
            printf("%lu", uint64Val[i]);
#else
            printf("%llu", uint64Val[i]);
#endif
            break;
        case AtomicType::TYPE_DOUBLE:
            printf("%f", doubleVal[i]);
            break;
        default:
            FATAL("unimplemented const type");
        }
        if (i != Count() - 1)
            printf(", ");
    }
    printf(")");
    pos.Print();
}


///////////////////////////////////////////////////////////////////////////
// TypeCastExpr

TypeCastExpr::TypeCastExpr(const Type *t, Expr *e, SourcePos p)
  : Expr(p) {
    type = t;
    expr = e;
}


/** Handle all the grungy details of type conversion between atomic types.
    Given an input value in exprVal of type fromType, convert it to the
    llvm::Value with type toType.
 */
static llvm::Value *
lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
                const AtomicType *toType, const AtomicType *fromType,
                SourcePos pos) {
    llvm::Value *cast = NULL;

    switch (toType->basicType) {
    case AtomicType::TYPE_FLOAT: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ?
            LLVMTypes::FloatType :
            LLVMTypes::FloatVectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                // If we have a bool vector of i32 elements, first truncate
                // down to a single bit
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            // And then do an unsigned int->float cast
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
                                 exprVal, targetType, "bool2float");
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_INT64:
            cast = ctx->CastInst(llvm::Instruction::SIToFP, // signed int to float
                                 exprVal, targetType, "int2float");
            break;
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_UINT64:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from unsigned int to float is slow. "
                                   "Use \"int\" if possible");
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to float
                                 exprVal, targetType, "uint2float");
            break;
        case AtomicType::TYPE_FLOAT:
            // No-op cast.
            cast = exprVal;
            break;
        case AtomicType::TYPE_DOUBLE:
            cast = ctx->FPCastInst(exprVal, targetType, "double2float");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_DOUBLE: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ?
            LLVMTypes::DoubleType :
            LLVMTypes::DoubleVectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                // truncate i32 bool vector values to i1s
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double
                                 exprVal, targetType, "bool2double");
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_INT64:
            cast = ctx->CastInst(llvm::Instruction::SIToFP, // signed int
                                 exprVal, targetType, "int2double");
            break;
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_UINT64:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from unsigned int64 to float is slow. "
                                   "Use \"int64\" if possible");
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
                                 exprVal, targetType, "uint2double");
            break;
        case AtomicType::TYPE_FLOAT:
            cast = ctx->FPCastInst(exprVal, targetType, "float2double");
            break;
        case AtomicType::TYPE_DOUBLE:
            // No-op cast.
            cast = exprVal;
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_INT32: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ? LLVMTypes::Int32Type :
                                        LLVMTypes::Int32VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            // FIXME: we're currently doing sign extension rather than zero
            // extension here, which means that ints will have the value
            // 0xffffffff for 'true' bools (versus the value 1).  There is
            // some code in stdlib.ispc that depends on bool->int conversions
            // having this behavior that needs to be cleaned up (e.g. to
            // call a __sext() builtin to do bool->int conversions) if we
            // are going to fix this here.
            cast = ctx->SExtInst(exprVal, targetType, "bool2int");
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32:
            // Same bit width and representation; no conversion needed.
            cast = exprVal;
            break;
        case AtomicType::TYPE_FLOAT:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "float2int");
            break;
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = ctx->TruncInst(exprVal, targetType, "int64_to_int32");
            break;
        case AtomicType::TYPE_DOUBLE:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "double2int");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_UINT32: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ? LLVMTypes::Int32Type :
                                        LLVMTypes::Int32VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            // FIXME: See comments above w.r.t. fixing this to be a
            // ZExtInst rather than an SExtInst...
            cast = ctx->SExtInst(exprVal, targetType, "bool2uint");
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32:
            cast = exprVal;
            break;
        case AtomicType::TYPE_FLOAT:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from float to unsigned int is slow. "
                                   "Use \"int\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "float2uint");
            break;
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = ctx->TruncInst(exprVal, targetType, "int64_to_uint32");
            break;
        case AtomicType::TYPE_DOUBLE:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from double to unsigned int is slow. "
                                   "Use \"int\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "double2uint");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_INT64: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ? LLVMTypes::Int64Type :
                                        LLVMTypes::Int64VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->SExtInst(exprVal, targetType, "bool2int64");
            break;
        case AtomicType::TYPE_INT32:
            cast = ctx->SExtInst(exprVal, targetType, "int32_to_int64");
            break;
        case AtomicType::TYPE_UINT32:
            cast = ctx->ZExtInst(exprVal, targetType, "uint32_to_int64");
            break;
        case AtomicType::TYPE_FLOAT:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "float2int64");
            break;
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = exprVal;
            break;
        case AtomicType::TYPE_DOUBLE:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "double2int");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_UINT64: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ?
            LLVMTypes::Int64Type :
            LLVMTypes::Int64VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->SExtInst(exprVal, targetType, "bool2uint");
            break;
        case AtomicType::TYPE_INT32:
            cast = ctx->SExtInst(exprVal, targetType, "int32_to_uint64");
            break;
        case AtomicType::TYPE_UINT32:
            cast = ctx->ZExtInst(exprVal, targetType, "uint32_to_uint64");
            break;
        case AtomicType::TYPE_FLOAT:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from float to unsigned int64 is slow. "
                                   "Use \"int64\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "float2uint");
            break;
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = exprVal;
            break;
        case AtomicType::TYPE_DOUBLE:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from double to unsigned int64 is slow. "
                                   "Use \"int64\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "double2uint");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_BOOL: {
        // Conversions to bool compare the value against zero.
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            cast = exprVal;
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32: {
            llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt32(0) :
                (llvm::Value *)LLVMInt32Vector(0);
            cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
                                exprVal, zero, "cmpi0");
            break;
        }
        case AtomicType::TYPE_FLOAT: {
            llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMFloat(0.f) :
                (llvm::Value *)LLVMFloatVector(0.f);
            cast = ctx->CmpInst(llvm::Instruction::FCmp, llvm::CmpInst::FCMP_ONE,
                                exprVal, zero, "cmpf0");
            break;
        }
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64: {
            llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt64(0) :
                (llvm::Value *)LLVMInt64Vector((int64_t)0);
            cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
                                exprVal, zero, "cmpi0");
            break;
        }
        case AtomicType::TYPE_DOUBLE: {
            llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMDouble(0.) :
                (llvm::Value *)LLVMDoubleVector(0.);
            cast = ctx->CmpInst(llvm::Instruction::FCmp, llvm::CmpInst::FCMP_ONE,
                                exprVal, zero, "cmpd0");
            break;
        }
        default:
            FATAL("unimplemented");
        }

        if (fromType->IsUniformType()) {
            if (toType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) {
                // extend out to i32 bool values from i1 here.  then we'll
                // turn into a vector below, the way it does for everyone
                // else...
                cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(),
                                     "i1bool_to_i32bool");
            }
        }
        else
            // fromType->IsVaryingType())
            cast = ctx->I1VecToBoolVec(cast);

        break;
    }
    default:
        FATAL("unimplemented");
    }

    // If we also want to go from uniform to varying, replicate out the
    // value across the vector elements..
    if (toType->IsVaryingType() && fromType->IsUniformType()) {
        const llvm::Type *vtype = toType->LLVMType(g->ctx);
        llvm::Value *castVec = llvm::UndefValue::get(vtype);
        for (int i = 0; i < g->target.vectorWidth; ++i)
            castVec = ctx->InsertInst(castVec, cast, i, "smearinsert");
        return castVec;
    }
    else
        return cast;
}


/** Converts the given value of the given type to be the varying
    equivalent, returning the resulting value.
+ */ +static llvm::Value * +lUniformValueToVarying(FunctionEmitContext *ctx, llvm::Value *value, + const Type *type) { + // nothing to do if it's already varying + if (type->IsVaryingType()) + return value; + + const llvm::Type *llvmType = type->GetAsVaryingType()->LLVMType(g->ctx); + llvm::Value *retValue = llvm::UndefValue::get(llvmType); + + // for structs, just recursively make their elements varying (if + // needed) and populate the return struct + const StructType *structType = dynamic_cast(type); + if (structType != NULL) { + for (int i = 0; i < structType->NumElements(); ++i) { + llvm::Value *v = ctx->ExtractInst(value, i, "struct_element"); + v = lUniformValueToVarying(ctx, v, structType->GetMemberType(i)); + retValue = ctx->InsertInst(retValue, v, i, "set_struct_element"); + } + return retValue; + } + + // And similarly do the elements of arrays and vectors individually + const SequentialType *sequentialType = + dynamic_cast(type); + if (sequentialType != NULL) { + for (int i = 0; i < sequentialType->GetElementCount(); ++i) { + llvm::Value *v = ctx->ExtractInst(value, i, "get_element"); + v = lUniformValueToVarying(ctx, v, sequentialType->GetElementType()); + retValue = ctx->InsertInst(retValue, v, i, "set_element"); + } + return retValue; + } + + // Otherwise we must have a uniform AtomicType, so smear its value + // across the vector lanes. 
+ assert(dynamic_cast(type) != NULL); + for (int i = 0; i < g->target.vectorWidth; ++i) + retValue = ctx->InsertInst(retValue, value, i, "smearinsert"); + return retValue; +} + + + +llvm::Value * +TypeCastExpr::GetValue(FunctionEmitContext *ctx) const { + if (!expr) + return NULL; + + ctx->SetDebugPos(pos); + const Type *toType = GetType(), *fromType = expr->GetType(); + if (!toType || !fromType || toType == AtomicType::Void || + fromType == AtomicType::Void) + // an error should have been issued elsewhere in this case + return NULL; + + if (Type::Equal(toType->GetAsConstType(), fromType->GetAsConstType())) + // There's nothing to do, just return the value. (LLVM's type + // system doesn't worry about constiness.) + return expr->GetValue(ctx); + + // This also should be caught during typechecking + assert(!(toType->IsUniformType() && fromType->IsVaryingType())); + + const ReferenceType *toReference = dynamic_cast(toType); + const ReferenceType *fromReference = dynamic_cast(fromType); + if (toReference && fromReference) { + const Type *toTarget = toReference->GetReferenceTarget(); + const Type *fromTarget = fromReference->GetReferenceTarget(); + + const ArrayType *toArray = dynamic_cast(toTarget); + const ArrayType *fromArray = dynamic_cast(fromTarget); + if (toArray && fromArray) { + // cast array pointer from [n x foo] to [0 x foo] if needed to be able + // to pass to a function that takes an unsized array as a parameter + if(toArray->GetElementCount() != 0 && + (toArray->GetElementCount() != fromArray->GetElementCount())) + Warning(pos, "Type-converting array of length %d to length %d", + fromArray->GetElementCount(), toArray->GetElementCount()); + assert(Type::Equal(toArray->GetBaseType()->GetAsConstType(), + fromArray->GetBaseType()->GetAsConstType())); + llvm::Value *v = expr->GetValue(ctx); + const llvm::Type *ptype = toType->LLVMType(g->ctx); + return ctx->BitCastInst(v, ptype); //, "array_cast_0size"); + } + + assert(Type::Equal(toTarget, fromTarget) || + 
Type::Equal(toTarget, fromTarget->GetAsConstType())); + return expr->GetValue(ctx); + } + + const StructType *toStruct = dynamic_cast(toType); + const StructType *fromStruct = dynamic_cast(fromType); + if (toStruct && fromStruct) { + // The only legal type conversions for structs are to go from a + // uniform to a varying instance of the same struct type. + assert(toStruct->IsVaryingType() && fromStruct->IsUniformType() && + Type::Equal(toStruct, fromStruct->GetAsVaryingType())); + + llvm::Value *origValue = expr->GetValue(ctx); + if (!origValue) + return NULL; + return lUniformValueToVarying(ctx, origValue, fromType); + } + + const VectorType *toVector = dynamic_cast(toType); + const VectorType *fromVector = dynamic_cast(fromType); + if (toVector && fromVector) { + // this should be caught during typechecking + assert(toVector->GetElementCount() == fromVector->GetElementCount()); + + llvm::Value *exprVal = expr->GetValue(ctx); + if (!exprVal) + return NULL; + + // Emit instructions to do type conversion of each of the elements + // of the vector. + // FIXME: since uniform vectors are represented as + // llvm::VectorTypes, we should just be able to issue the + // corresponding vector type convert, which should be more + // efficient by avoiding serialization! + llvm::Value *cast = llvm::UndefValue::get(toType->LLVMType(g->ctx)); + for (int i = 0; i < toVector->GetElementCount(); ++i) { + llvm::Value *ei = ctx->ExtractInst(exprVal, i); + + llvm::Value *conv = lTypeConvAtomic(ctx, ei, toVector->GetElementType(), + fromVector->GetElementType(), pos); + if (!conv) + return NULL; + cast = ctx->InsertInst(cast, conv, i); + } + return cast; + } + + const AtomicType *fromAtomic = dynamic_cast(fromType); + // at this point, coming from an atomic type is all that's left... 
+ assert(fromAtomic != NULL); + + llvm::Value *exprVal = expr->GetValue(ctx); + if (!exprVal) + return NULL; + + if (toVector) { + // scalar -> short vector conversion + llvm::Value *conv = lTypeConvAtomic(ctx, exprVal, toVector->GetElementType(), + fromAtomic, pos); + if (!conv) + return NULL; + + llvm::Value *cast = llvm::UndefValue::get(toType->LLVMType(g->ctx)); + for (int i = 0; i < toVector->GetElementCount(); ++i) + cast = ctx->InsertInst(cast, conv, i); + return cast; + } + else { + const AtomicType *toAtomic = dynamic_cast(toType); + // typechecking should ensure this is the case + assert(toAtomic != NULL); + + return lTypeConvAtomic(ctx, exprVal, toAtomic, fromAtomic, pos); + } +} + + +const Type * +TypeCastExpr::GetType() const { + return type; +} + + +Expr * +TypeCastExpr::TypeCheck() { + if (expr != NULL) + expr = expr->TypeCheck(); + if (expr == NULL) + return NULL; + + const Type *toType = GetType(), *fromType = expr->GetType(); + if (toType == NULL || fromType == NULL) + return NULL; + + const char *toTypeString = toType->GetString().c_str(); + const char *fromTypeString = fromType->GetString().c_str(); + + // It's an error to attempt to convert from varying to uniform + if (toType->IsUniformType() && !fromType->IsUniformType()) { + Error(pos, "Can't assign 'varying' value to 'uniform' type \"%s\".", + toTypeString); + return NULL; + } + + // And any kind of void type in a type cast doesn't make sense + if (toType == AtomicType::Void || fromType == AtomicType::Void) { + Error(pos, "Void type illegal in type cast from type \"%s\" to " + "type \"%s\".", fromTypeString, toTypeString); + return NULL; + } + + // FIXME: do we need to worry more about references here? 
+ + if (dynamic_cast(fromType) != NULL) { + // Starting from a vector type; the result type must be a vector + // type as well + if (dynamic_cast(toType) == NULL) { + Error(pos, "Can't convert vector type \"%s\" to non-vector type \"%s\".", + fromTypeString, toTypeString); + return NULL; + } + + // And the two vectors must have the same number of elements + if (dynamic_cast(toType)->GetElementCount() != + dynamic_cast(fromType)->GetElementCount()) { + Error(pos, "Can't convert vector type \"%s\" to differently-sized " + "vector type \"%s\".", fromTypeString, toTypeString); + return NULL; + } + + // And we're ok; since vectors can only hold AtomicTypes, we know + // that type converting the elements will work. + return this; + } + else if (dynamic_cast(fromType) != NULL) { + FATAL("Shouldn't ever get here"); + return this; + } + else { + assert(dynamic_cast(fromType) != NULL); + // If we're going from an atomic type, the only possible result is + // another atomic type + if (dynamic_cast(toType) == NULL) { + Error(pos, "Can't convert from non-atomic type \"%s\" to \"%s\".", + fromTypeString, toTypeString); + return NULL; + } + + return this; + } + +} + + +Expr * +TypeCastExpr::Optimize() { + if (expr != NULL) + expr = expr->Optimize(); + if (expr == NULL) + return NULL; + + ConstExpr *constExpr = dynamic_cast(expr); + if (!constExpr) + // We can't do anything if this isn't a const expr + return this; + + const Type *toType = GetType(); + const AtomicType *toAtomic = dynamic_cast(toType); + // If we're not casting to an atomic type, we can't do anything here, + // since ConstExprs can only represent atomic types. (So e.g. we're + // casting from an int to an int<4>.) + if (toAtomic == NULL) + return this; + + bool forceVarying = toType->IsVaryingType(); + + // All of the type conversion smarts we need is already in the + // ConstExpr::AsBool(), etc., methods, so we just need to call the + // appropriate one for the type that this cast is converting to. 
+ switch (toAtomic->basicType) { + case AtomicType::TYPE_BOOL: { + bool bv[ISPC_MAX_NVEC]; + constExpr->AsBool(bv, forceVarying); + return new ConstExpr(toType, bv, pos); + } + case AtomicType::TYPE_INT32: { + int32_t iv[ISPC_MAX_NVEC]; + constExpr->AsInt32(iv, forceVarying); + return new ConstExpr(toType, iv, pos); + } + case AtomicType::TYPE_UINT32: { + uint32_t uv[ISPC_MAX_NVEC]; + constExpr->AsUInt32(uv, forceVarying); + return new ConstExpr(toType, uv, pos); + } + case AtomicType::TYPE_FLOAT: { + float fv[ISPC_MAX_NVEC]; + constExpr->AsFloat(fv, forceVarying); + return new ConstExpr(toType, fv, pos); + } + case AtomicType::TYPE_INT64: { + int64_t iv[ISPC_MAX_NVEC]; + constExpr->AsInt64(iv, forceVarying); + return new ConstExpr(toType, iv, pos); + } + case AtomicType::TYPE_UINT64: { + uint64_t uv[ISPC_MAX_NVEC]; + constExpr->AsUInt64(uv, forceVarying); + return new ConstExpr(toType, uv, pos); + } + case AtomicType::TYPE_DOUBLE: { + double dv[ISPC_MAX_NVEC]; + constExpr->AsDouble(dv, forceVarying); + return new ConstExpr(toType, dv, pos); + } + default: + FATAL("unimplemented"); + } + return this; + +} + + +void +TypeCastExpr::Print() const { + printf("[%s] type cast (", GetType()->GetString().c_str()); + expr->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// ReferenceExpr + +ReferenceExpr::ReferenceExpr(Expr *e, SourcePos p) + : Expr(p) { + expr = e; +} + + +llvm::Value * +ReferenceExpr::GetValue(FunctionEmitContext *ctx) const { + ctx->SetDebugPos(pos); + return expr ? expr->GetLValue(ctx) : NULL; +} + + +Symbol * +ReferenceExpr::GetBaseSymbol() const { + return expr ? 
expr->GetBaseSymbol() : NULL; +} + + +const Type * +ReferenceExpr::GetType() const { + if (!expr) + return NULL; + + const Type *type = expr->GetType(); + if (!type) + return NULL; + + return new ReferenceType(type, false); +} + + +Expr * +ReferenceExpr::Optimize() { + if (expr) + expr = expr->Optimize(); + if (expr == NULL) + return NULL; + + return this; +} + + +Expr * +ReferenceExpr::TypeCheck() { + if (expr != NULL) + expr = expr->TypeCheck(); + if (expr == NULL) + return NULL; + return this; +} + + +void +ReferenceExpr::Print() const { + if (expr == NULL || GetType() == NULL) + return; + + printf("[%s] &(", GetType()->GetString().c_str()); + expr->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// DereferenceExpr + +DereferenceExpr::DereferenceExpr(Expr *e, SourcePos p) + : Expr(p) { + expr = e; +} + + +llvm::Value * +DereferenceExpr::GetValue(FunctionEmitContext *ctx) const { + if (expr == NULL) + return NULL; + llvm::Value *ptr = expr->GetValue(ctx); + if (ptr == NULL) + return NULL; + const Type *type = GetType(); + if (type == NULL) + return NULL; + + ctx->SetDebugPos(pos); + return ctx->LoadInst(ptr, type, "reference_load"); +} + + +llvm::Value * +DereferenceExpr::GetLValue(FunctionEmitContext *ctx) const { + if (expr == NULL) + return NULL; + return expr->GetValue(ctx); +} + + +Symbol * +DereferenceExpr::GetBaseSymbol() const { + return expr ? expr->GetBaseSymbol() : NULL; +} + + +const Type * +DereferenceExpr::GetType() const { + return (expr && expr->GetType()) ? 
expr->GetType()->GetReferenceTarget() : + NULL; +} + + +Expr * +DereferenceExpr::TypeCheck() { + if (expr != NULL) + expr = expr->TypeCheck(); + if (expr == NULL) + return NULL; + return this; +} + + +Expr * +DereferenceExpr::Optimize() { + if (expr != NULL) + expr = expr->Optimize(); + if (expr == NULL) + return NULL; + return this; +} + + +void +DereferenceExpr::Print() const { + if (expr == NULL || GetType() == NULL) + return; + + printf("[%s] *(", GetType()->GetString().c_str()); + expr->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// SymbolExpr + +SymbolExpr::SymbolExpr(Symbol *s, SourcePos p) + : Expr(p) { + symbol = s; +} + + +llvm::Value * +SymbolExpr::GetValue(FunctionEmitContext *ctx) const { + // storagePtr may be NULL due to an earlier compilation error + if (!symbol || !symbol->storagePtr) + return NULL; + ctx->SetDebugPos(pos); + return ctx->LoadInst(symbol->storagePtr, GetType(), symbol->name.c_str()); +} + + +llvm::Value * +SymbolExpr::GetLValue(FunctionEmitContext *ctx) const { + if (symbol == NULL) + return NULL; + ctx->SetDebugPos(pos); + return symbol->storagePtr; +} + + +Symbol * +SymbolExpr::GetBaseSymbol() const { + return symbol; +} + + +const Type * +SymbolExpr::GetType() const { + return symbol ? 
symbol->type : NULL; +} + + +Expr * +SymbolExpr::TypeCheck() { + return this; +} + + +Expr * +SymbolExpr::Optimize() { + if (symbol == NULL) + return NULL; + else if (symbol->constValue != NULL) { + assert(GetType()->IsConstType()); + return symbol->constValue; + } + else + return this; +} + + +void +SymbolExpr::Print() const { + if (symbol == NULL || GetType() == NULL) + return; + + printf("[%s] sym: (%s)", GetType()->GetString().c_str(), + symbol->name.c_str()); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// FunctionSymbolExpr + +FunctionSymbolExpr::FunctionSymbolExpr(std::vector *candidates, + SourcePos p) + : Expr(p) { + matchingFunc = NULL; + candidateFunctions = candidates; +} + + +const Type * +FunctionSymbolExpr::GetType() const { + return matchingFunc ? matchingFunc->type : NULL; +} + + +llvm::Value * +FunctionSymbolExpr::GetValue(FunctionEmitContext *ctx) const { + assert("!should not call FunctionSymbolExpr::GetValue()"); + return NULL; +} + + +Symbol * +FunctionSymbolExpr::GetBaseSymbol() const { + return matchingFunc; +} + + +Expr * +FunctionSymbolExpr::TypeCheck() { + return this; +} + + +Expr * +FunctionSymbolExpr::Optimize() { + return this; +} + + +void +FunctionSymbolExpr::Print() const { + if (!matchingFunc || !GetType()) + return; + + printf("[%s] fun sym (%s)", GetType()->GetString().c_str(), + matchingFunc->name.c_str()); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// SyncExpr + +const Type * +SyncExpr::GetType() const { + return AtomicType::Void; +} + + +llvm::Value * +SyncExpr::GetValue(FunctionEmitContext *ctx) const { + ctx->SetDebugPos(pos); + std::vector noArg; + llvm::Function *fsync = m->module->getFunction("ISPCSync"); + if (fsync == NULL) { + FATAL("Couldn't find ISPCSync declaration?!"); + return NULL; + } + + return ctx->CallInst(fsync, noArg, ""); +} + + +void +SyncExpr::Print() const { + printf("sync"); + pos.Print(); 
+} + + +Expr * +SyncExpr::TypeCheck() { + return this; +} + + +Expr * +SyncExpr::Optimize() { + return this; +} diff --git a/expr.h b/expr.h new file mode 100644 index 00000000..ae59b101 --- /dev/null +++ b/expr.h @@ -0,0 +1,543 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/** @file expr.h + @brief Expr abstract base class and expression implementations +*/ + +#ifndef ISPC_EXPR_H +#define ISPC_EXPR_H 1 + +#include "ispc.h" + +class FunctionSymbolExpr; + +/** @brief Expr is the abstract base class that defines the interface that + all expression types must implement. + */ +class Expr : public ASTNode { +public: + Expr(SourcePos p) : ASTNode(p) { } + + /** This is the main method for Expr implementations to implement. It + should call methods in the FunctionEmitContext to emit LLVM IR + instructions to the current basic block in order to generate an + llvm::Value that represents the expression's value. */ + virtual llvm::Value *GetValue(FunctionEmitContext *ctx) const = 0; + + /** For expressions that can provide an lvalue (e.g. array indexing), + this function should emit IR that computes the expression's lvalue + and returns the corresponding llvm::Value. Expressions that can't + provide an lvalue should leave this unimplemented; the default + implementation returns NULL. */ + virtual llvm::Value *GetLValue(FunctionEmitContext *ctx) const; + + /** Returns the Type of the expression. */ + virtual const Type *GetType() const = 0; + + /** For expressions that have values based on a symbol (e.g. regular + symbol references, array indexing, etc.), this returns a pointer to + that symbol. */ + virtual Symbol *GetBaseSymbol() const; + + /** If this is a constant expression that can be converted to a + constant of the given type, this method should return the + corresponding llvm::Constant value. Otherwise it should return + NULL. */ + virtual llvm::Constant *GetConstant(const Type *type) const; + + /** This method should perform early optimizations of the expression + (constant folding, etc.) and return a pointer to the resulting + expression. If an error is encountered during optimization, NULL + should be returned. 
*/ + virtual Expr *Optimize() = 0; + + /** This method should perform type checking of the expression and + return a pointer to the resulting expression. If an error is + encountered, NULL should be returned. */ + virtual Expr *TypeCheck() = 0; + + /** Prints the expression to standard output (used for debugging). */ + virtual void Print() const = 0; + + /** This method tries to convert the expression to the given type. In + the event of failure, if the failureOk parameter is true, then no + error is issued. If failureOk is false, then an error is printed + that incorporates the given error message string. In either + failure case, NULL is returned. */ + Expr *TypeConv(const Type *type, const char *errorMsgBase = NULL, + bool failureOk = false); +}; + + +/** @brief Unary expression */ +class UnaryExpr : public Expr { +public: + enum Op { + PreInc, ///< Pre-increment + PreDec, ///< Pre-decrement + PostInc, ///< Post-increment + PostDec, ///< Post-decrement + Negate, ///< Negation + LogicalNot, ///< Logical not + BitNot, ///< Bit not + }; + + UnaryExpr(Op op, Expr *expr, SourcePos pos); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + Expr *Optimize(); + Expr *TypeCheck(); + +private: + const Op op; + Expr *expr; +}; + + +/** @brief Binary expression */ +class BinaryExpr : public Expr { +public: + enum Op { + Add, ///< Addition + Sub, ///< Subtraction + Mul, ///< Multiplication + Div, ///< Division + Mod, ///< Modulus + Shl, ///< Shift left + Shr, ///< Shift right + + Lt, ///< Less than + Gt, ///< Greater than + Le, ///< Less than or equal + Ge, ///< Greater than or equal + Equal, ///< Equal + NotEqual, ///< Not equal + + BitAnd, ///< Bitwise AND + BitXor, ///< Bitwise XOR + BitOr, ///< Bitwise OR + LogicalAnd, ///< Logical AND + LogicalOr, ///< Logical OR + + Comma, ///< Comma operator + }; + + BinaryExpr(Op o, Expr *a, Expr *b, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + 
const Type *GetType() const; + void Print() const; + + Expr *Optimize(); + Expr *TypeCheck(); + +private: + const Op op; + Expr *arg0, *arg1; +}; + + +/** @brief Assignment expression */ +class AssignExpr : public Expr { +public: + enum Op { + Assign, ///< Regular assignment + MulAssign, ///< *= assignment + DivAssign, ///< /= assignment + ModAssign, ///< %= assignment + AddAssign, ///< += assignment + SubAssign, ///< -= assignment + ShlAssign, ///< <<= assignment + ShrAssign, ///< >>= assignment + AndAssign, ///< &= assignment + XorAssign, ///< ^= assignment + OrAssign, ///< |= assignment + }; + + AssignExpr(Op o, Expr *a, Expr *b, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + + Expr *Optimize(); + Expr *TypeCheck(); + +private: + const Op op; + Expr *lvalue, *rvalue; +}; + + +/** @brief Selection expression, corresponding to "test ? a : b". + + Returns the value of "a" or "b", depending on the value of "test". +*/ +class SelectExpr : public Expr { +public: + SelectExpr(Expr *test, Expr *a, Expr *b, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + + Expr *Optimize(); + Expr *TypeCheck(); + +private: + Expr *test, *expr1, *expr2; +}; + + +/** @brief A list of expressions. + + These are mostly used for representing curly-brace delimited + initializers for initializers for complex types and for representing + the arguments passed to a function call. + */ +class ExprList : public Expr { +public: + ExprList(SourcePos p) : Expr(p) { } + ExprList(Expr *e, SourcePos p) : Expr(p) { exprs.push_back(e); } + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + llvm::Constant *GetConstant(const Type *type) const; + ExprList *Optimize(); + ExprList *TypeCheck(); + + std::vector exprs; +}; + + +/** @brief Expression representing a function call. 
+ */ +class FunctionCallExpr : public Expr { +public: + FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + + Expr *Optimize(); + Expr *TypeCheck(); + +private: + Expr *func; + ExprList *args; + bool isLaunch; + + void resolveFunctionOverloads(); + bool tryResolve(bool (*matchFunc)(Expr *, const Type *)); +}; + + +/** @brief Expression representing indexing into something with an integer + offset. + + This is used for both array indexing and indexing into VectorTypes. +*/ +class IndexExpr : public Expr { +public: + IndexExpr(Expr *arrayOrVector, Expr *index, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + llvm::Value *GetLValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + void Print() const; + + Expr *Optimize(); + Expr *TypeCheck(); + +private: + Expr *arrayOrVector, *index; +}; + + +/** @brief Expression representing member selection ("foo.bar"). + */ +class MemberExpr : public Expr { +public: + MemberExpr(Expr *expr, const char *identifier, SourcePos pos, + SourcePos identifierPos); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + llvm::Value *GetLValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + void Print() const; + Expr *Optimize(); + Expr *TypeCheck(); + +private: + std::string getCandidateNearMatches() const; + int getElementNumber() const; + + Expr *expr; + std::string identifier; + const SourcePos identifierPos; +}; + + +/** @brief Expression representing a compile-time constant value. + + This class can currently represent compile-time constants of anything + that is an AtomicType; for anything more complex, we don't currently + have a representation of a compile-time constant that can be further + reasoned about. 
+ */ +class ConstExpr : public Expr { +public: + /** Create a ConstExpr from a uniform int32 value */ + ConstExpr(const Type *t, int32_t i, SourcePos p); + /** Create a ConstExpr from a varying int32 value */ + ConstExpr(const Type *t, int32_t *i, SourcePos p); + /** Create a ConstExpr from a uniform uint32 value */ + ConstExpr(const Type *t, uint32_t u, SourcePos p); + /** Create a ConstExpr from a varying uint32 value */ + ConstExpr(const Type *t, uint32_t *u, SourcePos p); + /** Create a ConstExpr from a uniform float value */ + ConstExpr(const Type *t, float f, SourcePos p); + /** Create a ConstExpr from a varying float value */ + ConstExpr(const Type *t, float *f, SourcePos p); + /** Create a ConstExpr from a uniform double value */ + ConstExpr(const Type *t, double d, SourcePos p); + /** Create a ConstExpr from a varying double value */ + ConstExpr(const Type *t, double *d, SourcePos p); + /** Create a ConstExpr from a uniform int64 value */ + ConstExpr(const Type *t, int64_t i, SourcePos p); + /** Create a ConstExpr from a varying int64 value */ + ConstExpr(const Type *t, int64_t *i, SourcePos p); + /** Create a ConstExpr from a uniform uint64 value */ + ConstExpr(const Type *t, uint64_t i, SourcePos p); + /** Create a ConstExpr from a varying uint64 value */ + ConstExpr(const Type *t, uint64_t *i, SourcePos p); + /** Create a ConstExpr from a uniform bool value */ + ConstExpr(const Type *t, bool b, SourcePos p); + /** Create a ConstExpr from a varying bool value */ + ConstExpr(const Type *t, bool *b, SourcePos p); + /** Create a ConstExpr of the same type as the given old ConstExpr, + with values given by the "vales" parameter. 
*/ + ConstExpr(ConstExpr *old, double *values); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + llvm::Constant *GetConstant(const Type *type) const; + + Expr *TypeCheck(); + Expr *Optimize(); + + /** Return the ConstExpr's values as booleans, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsBool(bool *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as int32s, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsInt32(int32_t *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as uint32s, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsUInt32(uint32_t *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as floats, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsFloat(float *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as int64s, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsInt64(int64_t *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as uint64s, doing type conversion + from the actual type if needed. 
If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsUInt64(uint64_t *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as doubles, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsDouble(double *, bool forceVarying = false) const; + + /** Return the number of values in the ConstExpr; should be either 1, + if it has uniform type, or the target's vector width if it's + varying. */ + int Count() const; + +private: + const AtomicType *type; + union { + int32_t int32Val[ISPC_MAX_NVEC]; + uint32_t uint32Val[ISPC_MAX_NVEC]; + bool boolVal[ISPC_MAX_NVEC]; + float floatVal[ISPC_MAX_NVEC]; + double doubleVal[ISPC_MAX_NVEC]; + int64_t int64Val[ISPC_MAX_NVEC]; + uint64_t uint64Val[ISPC_MAX_NVEC]; + }; +}; + + +/** @brief Expression representing a type cast of the given expression to a + probably-different type. */ +class TypeCastExpr : public Expr { +public: + TypeCastExpr(const Type *t, Expr *e, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + Expr *TypeCheck(); + Expr *Optimize(); + +private: + const Type *type; + Expr *expr; +}; + + +/** @brief Expression that represents taking a reference of a (non-reference) + variable. */ +class ReferenceExpr : public Expr { +public: + ReferenceExpr(Expr *e, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + void Print() const; + Expr *TypeCheck(); + Expr *Optimize(); + +private: + Expr *expr; +}; + + +/** @brief Expression that represents dereferencing a reference to get its + value. 
*/ +class DereferenceExpr : public Expr { +public: + DereferenceExpr(Expr *e, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + llvm::Value *GetLValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + void Print() const; + Expr *TypeCheck(); + Expr *Optimize(); + +private: + Expr *expr; +}; + + +/** @brief Expression representing a symbol reference in the program */ +class SymbolExpr : public Expr { +public: + SymbolExpr(Symbol *s, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + llvm::Value *GetLValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + Expr *TypeCheck(); + Expr *Optimize(); + void Print() const; + +private: + Symbol *symbol; +}; + + +/** @brief Expression representing a function symbol in the program (generally + used for a function call). + */ +class FunctionSymbolExpr : public Expr { +public: + FunctionSymbolExpr(std::vector *candidateFunctions, + SourcePos pos); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + Expr *TypeCheck(); + Expr *Optimize(); + void Print() const; + +private: + friend class FunctionCallExpr; + + /** All of the functions with the name given in the function call; + there may be more then one, in which case we need to resolve which + overload is the best match. */ + std::vector *candidateFunctions; + + /** The actual matching function found after overload resolution; this + value is set by FunctionCallExpr::resolveFunctionOverloads() */ + Symbol *matchingFunc; +}; + + +/** @brief A sync statement in the program (waits for all launched tasks before + proceeding). 
*/ +class SyncExpr : public Expr { +public: + SyncExpr(SourcePos p) : Expr(p) { } + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Expr *TypeCheck(); + Expr *Optimize(); + void Print() const; +}; + +#endif // ISPC_EXPR_H diff --git a/failing_tests/max-uint-1.ispc b/failing_tests/max-uint-1.ispc new file mode 100644 index 00000000..d86126e6 --- /dev/null +++ b/failing_tests/max-uint-1.ispc @@ -0,0 +1,19 @@ +static float float4(uniform float a, uniform float b, uniform float c, + uniform float d) { + float ret = 0; + for (uniform int i = 0; i < programCount; i += 4) { + ret = insert(ret, i + 0, a); + ret = insert(ret, i + 1, b); + ret = insert(ret, i + 2, c); + ret = insert(ret, i + 3, d); + } + return ret; +} + +export float f_f(float a) { + unsigned int i = (unsigned int)a; + return max((unsigned int)2, i); +} + +export float result() { return float4(2,2,3,4); } + diff --git a/failing_tests/max-uint.ispc b/failing_tests/max-uint.ispc new file mode 100644 index 00000000..145aa707 --- /dev/null +++ b/failing_tests/max-uint.ispc @@ -0,0 +1,8 @@ + +export float f_f(float a) { + unsigned int i = (unsigned int)a; + return max((unsigned int)10, i); +} + +export float result() { return 10; } + diff --git a/failing_tests/min-uint-1.ispc b/failing_tests/min-uint-1.ispc new file mode 100644 index 00000000..018b20d6 --- /dev/null +++ b/failing_tests/min-uint-1.ispc @@ -0,0 +1,19 @@ +static float float4(uniform float a, uniform float b, uniform float c, + uniform float d) { + float ret = 0; + for (uniform int i = 0; i < programCount; i += 4) { + ret = insert(ret, i + 0, a); + ret = insert(ret, i + 1, b); + ret = insert(ret, i + 2, c); + ret = insert(ret, i + 3, d); + } + return ret; +} + +export float f_f(float a) { + unsigned int i = (unsigned int)a; + return min((unsigned int)2, i); +} + +export float result() { return float4(1,2,2,2); } + diff --git a/failing_tests/min-uint-2.ispc b/failing_tests/min-uint-2.ispc new file mode 100644 
index 00000000..5b5f0539 --- /dev/null +++ b/failing_tests/min-uint-2.ispc @@ -0,0 +1,19 @@ +static float float4(uniform float a, uniform float b, uniform float c, + uniform float d) { + float ret = 0; + for (uniform int i = 0; i < programCount; i += 4) { + ret = insert(ret, i + 0, a); + ret = insert(ret, i + 1, b); + ret = insert(ret, i + 2, c); + ret = insert(ret, i + 3, d); + } + return ret; +} + +export float f_f(float a) { + unsigned int i = (unsigned int)a; + return min((unsigned int)20, i); +} + +export float result() { return float4(1,2,3,4); } + diff --git a/failing_tests/struct-array-assign.ispc b/failing_tests/struct-array-assign.ispc new file mode 100644 index 00000000..8dc09543 --- /dev/null +++ b/failing_tests/struct-array-assign.ispc @@ -0,0 +1,11 @@ + +struct Foo { + float f; +}; + + +export float foo(Foo f[], int i, uniform int j) { + Foo x = f[i]; + return x.f; +} + diff --git a/ispc.cpp b/ispc.cpp new file mode 100644 index 00000000..506846f0 --- /dev/null +++ b/ispc.cpp @@ -0,0 +1,137 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file ispc.cpp + @brief ispc global definitions +*/ + +#include "ispc.h" +#include "module.h" +#include "util.h" +#include +#ifdef ISPC_IS_WINDOWS +#include +#include +#endif +#include +#include +#ifndef LLVM_2_8 +#include +#endif +#include +#include + +Globals *g; +Module *m; + +/////////////////////////////////////////////////////////////////////////// +// Target + +Target::Target() { + arch = "x86-64"; + cpu = "nehalem"; + isa = SSE4; + nativeVectorWidth = 4; + vectorWidth = 4; +} + +/////////////////////////////////////////////////////////////////////////// +// Opt + +Opt::Opt() { + level = 1; + fastMath = false; + disableBlendedMaskedStores = false; + disableCoherentControlFlow = false; + disableUniformControlFlow = false; + disableGatherScatterOptimizations = false; + disableMaskedStoreToStore = false; + disableGatherScatterFlattening = false; + disableUniformMemoryOptimizations = false; + disableMaskedStoreOptimizations = false; +} + +/////////////////////////////////////////////////////////////////////////// +// Globals + +Globals::Globals() { + mathLib = Globals::Math_ISPC; + + includeStdlib = true; + runCPP = true; + debugPrint = false; + disableWarnings = false; + 
emitPerfWarnings = true; + emitInstrumentation = false; + generateDebuggingSymbols = false; + + ctx = new llvm::LLVMContext; + +#ifdef ISPC_IS_WINDOWS + _getcwd(currentDirectory, sizeof(currentDirectory)); +#else + getcwd(currentDirectory, sizeof(currentDirectory)); +#endif +} + +/////////////////////////////////////////////////////////////////////////// +// ASTNode + +ASTNode::~ASTNode() { +} + +/////////////////////////////////////////////////////////////////////////// +// SourcePos + +SourcePos::SourcePos(const char *n, int l, int c) { + name = n ? n : m->module->getModuleIdentifier().c_str(); + first_line = last_line = l; + first_column = last_column = c; +} + +llvm::DIFile SourcePos::GetDIFile() const { +#ifdef LLVM_2_8 + return llvm::DIFile(); +#else + std::string directory, filename; + GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename); + return m->diBuilder->createFile(filename, directory); +#endif // LLVM_2_8 +} + + +void +SourcePos::Print() const { + printf(" @ [%s:%d.%d - %d.%d] ", name, first_line, first_column, + last_line, last_column); +} diff --git a/ispc.h b/ispc.h new file mode 100644 index 00000000..2c4ec158 --- /dev/null +++ b/ispc.h @@ -0,0 +1,313 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file ispc.h + @brief Main ispc.header file +*/ + +#ifndef ISPC_H +#define ISPC_H + +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#include +#include +#include +#include + +/** @def ISPC_MAX_NVEC maximum vector size of any of the compliation + targets. + */ +#define ISPC_MAX_NVEC 16 + +// Forward declarations of a number of widely-used LLVM types +namespace llvm { + class BasicBlock; + class Constant; + class ConstantValue; + class DIBuilder; + class DIDescriptor; + class DIFile; + class DIType; + class Function; + class FunctionType; + class LLVMContext; + class Module; + class Type; + class Value; +} + +class ArrayType; +class AtomicType; +class DeclSpecs; +class Declaration; +class Declarator; +class FunctionEmitContext; +class Expr; +class ExprList; +class FunctionType; +class GatherBuffer; +class Module; +class Stmt; +class Symbol; +class SymbolTable; +class Type; + +/** @brief Representation of a range of positions in a source file. + + This class represents a range of characters in a source file + (e.g. 
those that span a token's definition), from starting line and + column to ending line and column. (These values are tracked by the + lexing code). Both lines and columns are counted starting from one. + */ +struct SourcePos { + SourcePos(const char *n = NULL, int l = 0, int c = 0); + + const char *name; + int first_line; + int first_column; + int last_line; + int last_column; + + /** Prints the filename and line/column range to standard output. */ + void Print() const; + + /** Returns a LLVM DIFile object that represents the SourcePos's file */ + llvm::DIFile GetDIFile() const; +}; + + +/** @brief Abstract base class for nodes in the abstract syntax tree (AST). + + This class defines a basic interface that all abstract syntax tree + (AST) nodes must implement. The base classes for both expressions + (Expr) and statements (Stmt) inherit from this class. +*/ +class ASTNode { +public: + ASTNode(SourcePos p) : pos(p) { } + virtual ~ASTNode(); + + /** The Optimize() method should perform any appropriate early-stage + optimizations on the node (e.g. constant folding). The caller + should use the returned ASTNode * in place of the original node. + This method may return NULL if an error is encountered during + optimization. */ + virtual ASTNode *Optimize() = 0; + + /** Type checking should be performed by the node when this method is + called. In the event of an error, a NULL value may be returned. + As with ASTNode::Optimize(), the caller should store the returned + pointer in place of the original ASTNode *. */ + virtual ASTNode *TypeCheck() = 0; + + /** All AST nodes must track the file position where they are + defined. */ + const SourcePos pos; +}; + +/** @brief Structure that defines a compilation target + + This structure defines a compilation target for the ispc compiler. +*/ +struct Target { + Target(); + + /** Enumerant giving the instruction sets that the compiler can + target. */ + enum ISA { SSE2, SSE4, AVX }; + + /** Instruction set being compiled to. 
*/ + ISA isa; + + /** Target system architecture. (e.g. "x86-64", "x86"). */ + std::string arch; + + /** Target CPU. (e.g. "corei7", "corei7-avx", ..) */ + std::string cpu; + + /** Native vector width of the vector instruction set. Note that this + value is directly derived from the ISA Being used (e.g. it's 4 for + SSE, 8 for AVX, etc.) */ + int nativeVectorWidth; + + /** Actual vector width currently being compiled to. This may be an + integer multiple of the native vector width, for example if we're + "doubling up" and compiling 8-wide on a 4-wide SSE system. */ + int vectorWidth; +}; + +/** @brief Structure that collects optimization options + + This structure collects all of the options related to optimization of + generated code. +*/ +struct Opt { + Opt(); + + /** Optimization level. Currently, the only valid values are 0, + indicating essentially no optimization, and 1, indicating as much + optimization as possible. */ + int level; + + /** Indicates whether "fast and loose" numerically unsafe optimizations + should be performed. This is false by default. */ + bool fastMath; + + /** On targets that don't have a masked store instruction but do have a + blending instruction, by default, we simulate masked stores by + loading the old value, blending, and storing the result. This can + potentially be unsafe in multi-threaded code, in that it writes to + locations that aren't supposed to be written to. Setting this + value to true disables this work-around, and instead implements + masked stores by 'scalarizing' them, so that we iterate over the + ISIMD lanes and do a scalar write for the ones that are running. */ + bool disableBlendedMaskedStores; + + /** Disables the 'coherent control flow' constructs in the + language. (e.g. this causes "cif" statements to be demoted to "if" + statements.) This is likely only useful for measuring the impact + of coherent control flow. 
*/ + bool disableCoherentControlFlow; + + /** Disables uniform control flow optimizations (e.g. this changes an + "if" statement with a uniform condition to have a varying + condition). This is likely only useful for measuring the impact of + uniform control flow. */ + bool disableUniformControlFlow; + + /** Disables the backend optimizations related to gather/scatter + (e.g. transforming gather from sequential locations to an unaligned + load, etc.) This is likely only useful for measuring the impact of + these optimizations. */ + bool disableGatherScatterOptimizations; + + /** Disables the optimization that demotes masked stores to regular + stores when the store is happening at the same control flow level + where the variable was declared. This is likely only useful for + measuring the impact of this optimization. */ + bool disableMaskedStoreToStore; + + /** Disables the optimization that detects when the execution mask is + all on and emits code for gathers and scatters that doesn't loop + over the SIMD lanes but just does the scalar loads and stores + directly. */ + bool disableGatherScatterFlattening; + + /** Disables the optimizations that detect when arrays are being + indexed with 'uniform' values and issue scalar loads/stores rather + than gathers/scatters. This is likely only useful for measuring + the impact of this optimization. */ + bool disableUniformMemoryOptimizations; + + /** Disables optimizations for masked stores: masked stores with the + mask all on are transformed to regular stores, and masked stores + with the mask are all off are removed (which in turn can allow + eliminating additional dead code related to computing the value + stored). This is likely only useful for measuring the impact of + this optimization. */ + bool disableMaskedStoreOptimizations; +}; + +/** @brief This structure collects together a number of global variables. 
+ + This structure collects a number of global variables that mostly + represent parameter settings for this compilation run. In particular, + none of these values should change after compilation befins; their + values are all set during command-line argument processing or very + early during the compiler's execution, before any files are parsed. + */ +struct Globals { + Globals(); + + /** Optimization option settings */ + Opt opt; + /** Compilation target information */ + Target target; + + /** There are a number of math libraries that can be used for + transcendentals and the like during program compilation. */ + enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System }; + MathLib mathLib; + + /** Records whether the ispc standard library should be made available + to the program during compilations. (Default is true.) */ + bool includeStdlib; + + /** Indicates whether the C pre-processor should be run over the + program source before compiling it. (Default is true.) */ + bool runCPP; + + /** When \c true, voluminous debugging output will be printed during + ispc's execution. */ + bool debugPrint; + + /** Indicates whether all warning messages should be surpressed. */ + bool disableWarnings; + + /** Indicates whether additional warnings should be issued about + possible performance pitfalls. */ + bool emitPerfWarnings; + + /** Indicates whether calls should be emitted in the program to an + externally-defined program instrumentation function. (See the + "Instrumenting your ispc programs" section in the user's + manual.) */ + bool emitInstrumentation; + + /** Indicates whether ispc should generate debugging symbols for the + program in its output. */ + bool generateDebuggingSymbols; + + /** Global LLVMContext object */ + llvm::LLVMContext *ctx; + + /** Current working directory when the ispc compiler starts + execution. 
*/ + char currentDirectory[1024]; + + /** Arguments to pass along to the C pre-processor, if it is run on the + program before compilation. */ + std::vector cppArgs; +}; + +extern Globals *g; +extern Module *m; + +#endif // ISPC_H diff --git a/ispc.sln b/ispc.sln new file mode 100755 index 00000000..0f48203a --- /dev/null +++ b/ispc.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ispc", "ispc.vcxproj", "{9861F490-F516-480C-B63C-D62A77AFA9D5}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ispc_test", "ispc_test.vcxproj", "{92547BA8-BE86-4E78-8799-1D72A70E5831}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Release|Win32 = Release|Win32 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {9861F490-F516-480C-B63C-D62A77AFA9D5}.Debug|Win32.ActiveCfg = Debug|Win32 + {9861F490-F516-480C-B63C-D62A77AFA9D5}.Debug|Win32.Build.0 = Debug|Win32 + {9861F490-F516-480C-B63C-D62A77AFA9D5}.Release|Win32.ActiveCfg = Release|Win32 + {9861F490-F516-480C-B63C-D62A77AFA9D5}.Release|Win32.Build.0 = Release|Win32 + {92547BA8-BE86-4E78-8799-1D72A70E5831}.Debug|Win32.ActiveCfg = Debug|Win32 + {92547BA8-BE86-4E78-8799-1D72A70E5831}.Debug|Win32.Build.0 = Debug|Win32 + {92547BA8-BE86-4E78-8799-1D72A70E5831}.Release|Win32.ActiveCfg = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/ispc.vcxproj b/ispc.vcxproj new file mode 100755 index 00000000..e06e1eff --- /dev/null +++ b/ispc.vcxproj @@ -0,0 +1,216 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + + + + + + + + + + + + + + + + + + + + %LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c > gen-bitcode-c.cpp + clang stdlib-c.c + 
%LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c > gen-bitcode-c.cpp + clang stdlib-c.c + gen-bitcode-c.cpp + gen-bitcode-c.cpp + + + + + + + + + + + + + + + + + + + + + + + + Document + cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp + gen-stdlib.cpp + cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp + gen-stdlib.cpp + Building gen-stdlib.cpp + Building gen-stdlib.cpp + + + + + Document + m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + stdlib.m4;stdlib-sse.ll + m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + stdlib.m4;stdlib-sse.ll + Building gen-bitcode-sse4.cpp + Building gen-bitcode-sse4.cpp + + + + + Document + m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll > gen-bitcode-sse4x2.cpp + gen-bitcode-sse4x2.cpp + stdlib.m4;stdlib-sse.ll + m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll > gen-bitcode-sse4x2.cpp + gen-bitcode-sse4x2.cpp + stdlib.m4;stdlib-sse.ll + Building gen-bitcode-sse4x2.cpp + Building gen-bitcode-sse4x2.cpp + + + + + Document + m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll > gen-bitcode-sse2.cpp + gen-bitcode-sse2.cpp + stdlib.m4;stdlib-sse.ll + m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll > gen-bitcode-sse2.cpp + gen-bitcode-sse2.cpp + stdlib.m4;stdlib-sse.ll + Building gen-bitcode-sse2.cpp + Building gen-bitcode-sse2.cpp + + + + + Document + m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll > gen-bitcode-avx.cpp + gen-bitcode-avx.cpp + stdlib.m4;stdlib-sse.ll + m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll > gen-bitcode-avx.cpp + gen-bitcode-avx.cpp + stdlib.m4;stdlib-sse.ll + Building gen-bitcode-avx.cpp + Building 
gen-bitcode-avx.cpp + + + + + Document + flex -t lex.ll > lex.cc + lex.cc + flex -t lex.ll > lex.cc + lex.cc + ispc.h;decl.h;parse.hh;sym.h + ispc.h;decl.h;parse.hh;sym.h + + + Document + bison -d -v -t -o parse.cc parse.yy + parse.cc;parse.h + bison -d -v -t -o parse.cc parse.yy + parse.cc;parse.h + ispc.h;type.h;decl.h;expr.h;sym.h;stmt.h + ispc.h;type.h;decl.h;expr.h;sym.h;stmt.h + Running bison on parse.yy + Running bison on parse.yy + + + + {9861F490-F516-480C-B63C-D62A77AFA9D5} + Win32Proj + ispc + + + + Application + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + true + + + false + + + + NotUsing + Level3 + Disabled + NOMINMAX + $(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories) + 4146;4800;4996;4355;4624 + + + Console + true + $(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories) + LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies) + + + + + Level3 + NotUsing + MaxSpeed + true + true + NOMINMAX + $(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories) + 4146;4800;4996;4355;4624 + + + Console + true + true + true + $(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories) + 
LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies) + + + + + + \ No newline at end of file diff --git a/ispc_test.cpp b/ispc_test.cpp new file mode 100644 index 00000000..3665aa42 --- /dev/null +++ b/ispc_test.cpp @@ -0,0 +1,313 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define _CRT_SECURE_NO_WARNINGS + +#include +#include + +#ifdef ISPC_HAVE_SVML +#include +extern "C" { + extern __m128 __svml_sinf4(__m128); + extern __m128 __svml_cosf4(__m128); + extern __m128 __svml_sincosf4(__m128 *,__m128); + extern __m128 __svml_tanf4(__m128); + extern __m128 __svml_atanf4(__m128); + extern __m128 __svml_atan2f4(__m128, __m128); + extern __m128 __svml_expf4(__m128); + extern __m128 __svml_logf4(__m128); + extern __m128 __svml_powf4(__m128, __m128); +} +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef LLVM_2_8 +#include +#endif + +extern "C" { + void ISPCLaunch(void *, void *); + void ISPCSync(); +} + +void ISPCLaunch(void *func, void *data) { + typedef void (*TaskFuncType)(void *, int, int); + TaskFuncType tft = (TaskFuncType)(func); + tft(data, 0, 1); +} + + +void ISPCSync() { +} + +static void usage(int ret) { + fprintf(stderr, "usage: ispc_test\n"); + fprintf(stderr, "\t[-h/--help]\tprint help\n"); + fprintf(stderr, "\t\n"); + exit(ret); +} + +static void svml_missing() { + fprintf(stderr, "Program called unavailable SVML function!\n"); + exit(1); +} + +static bool lRunTest(const char *fn) { + llvm::LLVMContext *ctx = new llvm::LLVMContext; + +#ifdef LLVM_2_8 + std::string err; + llvm::MemoryBuffer *buf = llvm::MemoryBuffer::getFileOrSTDIN(fn, &err); + if 
(!buf) { + fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.c_str()); + delete ctx; + return false; + } + std::string bcErr; + llvm::Module *module = llvm::ParseBitcodeFile(buf, *ctx, &bcErr); +#else + llvm::OwningPtr buf; + llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf); + if (err) { + fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.message().c_str()); + delete ctx; + return false; + } + std::string bcErr; + llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr); +#endif + + if (!module) { + fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str()); + delete ctx; + return false; + } + + std::string eeError; + llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError); + if (!ee) { + fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str()); + return false; + } + + llvm::Function *func; + if ((func = module->getFunction("ISPCLaunch")) != NULL) + ee->addGlobalMapping(func, (void *)ISPCLaunch); + if ((func = module->getFunction("ISPCSync")) != NULL) + ee->addGlobalMapping(func, (void *)ISPCSync); + if ((func = module->getFunction("putchar")) != NULL) + ee->addGlobalMapping(func, (void *)putchar); + if ((func = module->getFunction("printf")) != NULL) + ee->addGlobalMapping(func, (void *)printf); + if ((func = module->getFunction("fflush")) != NULL) + ee->addGlobalMapping(func, (void *)fflush); + if ((func = module->getFunction("sinf")) != NULL) + ee->addGlobalMapping(func, (void *)sinf); + if ((func = module->getFunction("cosf")) != NULL) + ee->addGlobalMapping(func, (void *)cosf); + if ((func = module->getFunction("tanf")) != NULL) + ee->addGlobalMapping(func, (void *)tanf); + if ((func = module->getFunction("atanf")) != NULL) + ee->addGlobalMapping(func, (void *)atanf); + if ((func = module->getFunction("atan2f")) != NULL) + ee->addGlobalMapping(func, (void *)atan2f); + if ((func = module->getFunction("powf")) != NULL) + ee->addGlobalMapping(func, (void 
*)powf); + if ((func = module->getFunction("expf")) != NULL) + ee->addGlobalMapping(func, (void *)expf); + if ((func = module->getFunction("logf")) != NULL) + ee->addGlobalMapping(func, (void *)logf); + +#ifdef ISPC_HAVE_SVML +#define DO_SVML(FUNC ,FUNCNAME) \ + if ((func = module->getFunction(FUNCNAME)) != NULL) \ + ee->addGlobalMapping(func, (void *)FUNC) +#else +#define DO_SVML(FUNC, FUNCNAME) \ + if ((func = module->getFunction(FUNCNAME)) != NULL) \ + ee->addGlobalMapping(func, (void *)svml_missing) +#endif + + DO_SVML(__svml_sinf4, "__svml_sinf4"); + DO_SVML(__svml_cosf4, "__svml_cosf4"); + DO_SVML(__svml_sincosf4, "__svml_sincosf4"); + DO_SVML(__svml_tanf4, "__svml_tanf4"); + DO_SVML(__svml_atanf4, "__svml_atanf4"); + DO_SVML(__svml_atan2f4, "__svml_atan2f4"); + DO_SVML(__svml_expf4, "__svml_expf4"); + DO_SVML(__svml_logf4, "__svml_logf4"); + DO_SVML(__svml_powf4, "__svml_powf4"); + + // figure out the vector width in the compiled code + func = module->getFunction("width"); + if (!func) { + fprintf(stderr, "No width() function found!\n"); + return false; + } + int width; + { + typedef int (*PFN)(); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + width = pfn(); + assert(width == 4 || width == 8 || width == 12 || width == 16); + } + + // find the value that returns the desired result + func = module->getFunction("result"); + bool foundResult = (func != NULL); + float result[16]; + for (int i = 0; i < 16; ++i) + result[i] = 0; + bool ok = true; + if (foundResult) { + typedef void (*PFN)(float *); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(result); + } + else + fprintf(stderr, "Warning: no result() function found.\n"); + + // try to find a function to run + float returned[16]; + for (int i = 0; i < 16; ++i) + returned[i] = 0; + float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; + double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; + int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 }; + 
int vint2[16] = { 5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}; + + if ((func = module->getFunction("f_v")) != NULL) { + typedef void (*PFN)(float *); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(returned); + } + else if ((func = module->getFunction("f_f")) != NULL) { + typedef void (*PFN)(float *, float *); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + llvm::verifyFunction(*func); + pfn(returned, vfloat); + } + else if ((func = module->getFunction("f_fu")) != NULL) { + typedef void (*PFN)(float *, float *, float fu); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + llvm::verifyFunction(*func); + pfn(returned, vfloat, 5.); + } + else if ((func = module->getFunction("f_fi")) != NULL) { + typedef void (*PFN)(float *, float *, int *); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(returned, vfloat, vint); + } + else if ((func = module->getFunction("f_du")) != NULL) { + typedef void (*PFN)(float *, double *, double); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(returned, vdouble, 5.); + } + else if ((func = module->getFunction("f_duf")) != NULL) { + typedef void (*PFN)(float *, double *, float); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(returned, vdouble, 5.f); + } + else if ((func = module->getFunction("f_di")) != NULL) { + typedef void (*PFN)(float *, double *, int *); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(returned, vdouble, vint2); + } + else { + fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn); + ok = false; + } + + // see if we got the right result + if (ok) { + if (foundResult) { + for (int i = 0; i < width; ++i) + if (returned[i] != result[i]) { + ok = false; + fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n", + fn, i, returned[i], returned[i], result[i], result[i]); + } + } + else { + for (int i = 0; i < width; ++i) + fprintf(stderr, "Test \"%s\" returned %d: %g / 
%a\n", + fn, i, returned[i], returned[i]); + } + } + + delete ee; + delete ctx; + + return ok && foundResult; +} + +int main(int argc, char *argv[]) { + llvm::InitializeNativeTarget(); + + std::vector files; + for (int i = 1; i < argc; ++i) { + if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) + usage(0); + else + files.push_back(argv[i]); + } + + int passes = 0, fails = 0; + for (unsigned int i = 0; i < files.size(); ++i) { + if (lRunTest(files[i])) ++passes; + else ++fails; + } + + if (fails > 0) + fprintf(stderr, "%d/%d tests passed\n", passes, passes+fails); + return fails > 0; +} diff --git a/ispc_test.vcxproj b/ispc_test.vcxproj new file mode 100755 index 00000000..bd7a6407 --- /dev/null +++ b/ispc_test.vcxproj @@ -0,0 +1,88 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + + + + + {92547BA8-BE86-4E78-8799-1D72A70E5831} + Win32Proj + ispc_test + + + + Application + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + true + + + false + + + + + + Level3 + Disabled + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(LLVM_INSTALL_DIR)/include + + + Console + true + $(LLVM_INSTALL_DIR)/lib + LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(LLVM_INSTALL_DIR)/include + + + Console + true + true + true + $(LLVM_INSTALL_DIR)/lib + 
LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies) + + + + + + \ No newline at end of file diff --git a/lex.ll b/lex.ll new file mode 100644 index 00000000..327ac144 --- /dev/null +++ b/lex.ll @@ -0,0 +1,426 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +%{ + +#include "ispc.h" +#include "decl.h" +#include "parse.hh" +#include "sym.h" +#include "util.h" +#include "module.h" + +static uint32_t lParseBinary(const char *ptr, SourcePos pos); +static void lCComment(SourcePos *); +static void lCppComment(SourcePos *); +static void lHandleCppHash(SourcePos *); +static void lStringConst(YYSTYPE *, SourcePos *); + +#define YY_USER_ACTION \ + yylloc->first_line = yylloc->last_line; \ + yylloc->first_column = yylloc->last_column; \ + yylloc->last_column += yyleng; + +#ifdef ISPC_IS_WINDOWS +inline int isatty(int) { return 0; } +#endif // ISPC_IS_WINDOWS + +%} + +%option nounput +%option noyywrap +%option bison-bridge +%option bison-locations +%option nounistd + +WHITESPACE [ \t\r]+ +INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+)) +FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)|([-]?0x[01]\.?[0-9a-fA-F]+p[-+]?[0-9]+[fF]?) 
+ +IDENT [a-zA-Z_][a-zA-Z_0-9]* + +%% +"/*" { lCComment(yylloc); } +"//" { lCppComment(yylloc); } + +bool { return TOKEN_BOOL; } +break { return TOKEN_BREAK; } +case { return TOKEN_CASE; } +cbreak { return TOKEN_CBREAK; } +ccontinue { return TOKEN_CCONTINUE; } +cdo { return TOKEN_CDO; } +cfor { return TOKEN_CFOR; } +char { return TOKEN_CHAR; } +cif { return TOKEN_CIF; } +cwhile { return TOKEN_CWHILE; } +const { return TOKEN_CONST; } +continue { return TOKEN_CONTINUE; } +creturn { return TOKEN_CRETURN; } +default { return TOKEN_DEFAULT; } +do { return TOKEN_DO; } +double { return TOKEN_DOUBLE; } +else { return TOKEN_ELSE; } +enum { return TOKEN_ENUM; } +export { return TOKEN_EXPORT; } +extern { return TOKEN_EXTERN; } +false { return TOKEN_FALSE; } +float { return TOKEN_FLOAT; } +for { return TOKEN_FOR; } +goto { return TOKEN_GOTO; } +if { return TOKEN_IF; } +inline { return TOKEN_INLINE; } +int { return TOKEN_INT; } +int32 { return TOKEN_INT; } +int64 { return TOKEN_INT64; } +launch { return TOKEN_LAUNCH; } +print { return TOKEN_PRINT; } +reference { return TOKEN_REFERENCE; } +return { return TOKEN_RETURN; } +soa { return TOKEN_SOA; } +static { return TOKEN_STATIC; } +struct { return TOKEN_STRUCT; } +switch { return TOKEN_SWITCH; } +sync { return TOKEN_SYNC; } +task { return TOKEN_TASK; } +true { return TOKEN_TRUE; } +typedef { return TOKEN_TYPEDEF; } +uniform { return TOKEN_UNIFORM; } +unsigned { return TOKEN_UNSIGNED; } +varying { return TOKEN_VARYING; } +void { return TOKEN_VOID; } +while { return TOKEN_WHILE; } + +L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; } + +{IDENT} { + /* We have an identifier--is it a type name or an identifier? + The symbol table will straighten us out... 
*/ + yylval->stringVal = new std::string(yytext); + if (m->symbolTable->LookupType(yytext) != NULL) + return TOKEN_TYPE_NAME; + else + return TOKEN_IDENTIFIER; +} + +{INT_NUMBER} { + char *endPtr = NULL; +#ifdef ISPC_IS_WINDOWS + unsigned long val; +#else + unsigned long long val; +#endif + + if (yytext[0] == '0' && yytext[1] == 'b') + val = lParseBinary(yytext+2, *yylloc); + else { +#ifdef ISPC_IS_WINDOWS + val = strtoul(yytext, &endPtr, 0); +#else + val = strtoull(yytext, &endPtr, 0); +#endif + } + yylval->int32Val = (int32_t)val; + if (val != (unsigned int)yylval->int32Val) + Warning(*yylloc, "32-bit integer has insufficient bits to represent value %s (%x %llx)", + yytext, yylval->int32Val, (unsigned long long)val); + return TOKEN_INT_CONSTANT; +} + +{INT_NUMBER}[uU] { + char *endPtr = NULL; +#ifdef ISPC_IS_WINDOWS + unsigned long val; +#else + unsigned long long val; +#endif + + if (yytext[0] == '0' && yytext[1] == 'b') + val = lParseBinary(yytext+2, *yylloc); + else { +#ifdef ISPC_IS_WINDOWS + val = strtoul(yytext, &endPtr, 0); +#else + val = strtoull(yytext, &endPtr, 0); +#endif + } + + yylval->int32Val = (int32_t)val; + if (val != (unsigned int)yylval->int32Val) + Warning(*yylloc, "32-bit integer has insufficient bits to represent value %s (%x %llx)", + yytext, yylval->int32Val, (unsigned long long)val); + return TOKEN_UINT_CONSTANT; +} + +{FLOAT_NUMBER} { + /* FIXME: need to implement a hex float constant parser so that we can + support them on Windows (which doesn't handle them in its atof() + implementation... 
*/ + yylval->floatVal = atof(yytext); + return TOKEN_FLOAT_CONSTANT; +} + +"++" { return TOKEN_INC_OP; } +"--" { return TOKEN_DEC_OP; } +"<<" { return TOKEN_LEFT_OP; } +">>" { return TOKEN_RIGHT_OP; } +"<=" { return TOKEN_LE_OP; } +">=" { return TOKEN_GE_OP; } +"==" { return TOKEN_EQ_OP; } +"!=" { return TOKEN_NE_OP; } +"&&" { return TOKEN_AND_OP; } +"||" { return TOKEN_OR_OP; } +"*=" { return TOKEN_MUL_ASSIGN; } +"/=" { return TOKEN_DIV_ASSIGN; } +"%=" { return TOKEN_MOD_ASSIGN; } +"+=" { return TOKEN_ADD_ASSIGN; } +"-=" { return TOKEN_SUB_ASSIGN; } +"<<=" { return TOKEN_LEFT_ASSIGN; } +">>=" { return TOKEN_RIGHT_ASSIGN; } +"&=" { return TOKEN_AND_ASSIGN; } +"^=" { return TOKEN_XOR_ASSIGN; } +"|=" { return TOKEN_OR_ASSIGN; } +";" { return ';'; } +("{"|"<%") { return '{'; } +("}"|"%>") { return '}'; } +"," { return ','; } +":" { return ':'; } +"=" { return '='; } +"(" { return '('; } +")" { return ')'; } +("["|"<:") { return '['; } +("]"|":>") { return ']'; } +"." { return '.'; } +"&" { return '&'; } +"!" { return '!'; } +"~" { return '~'; } +"-" { return '-'; } +"+" { return '+'; } +"*" { return '*'; } +"/" { return '/'; } +"%" { return '%'; } +"<" { return '<'; } +">" { return '>'; } +"^" { return '^'; } +"|" { return '|'; } +"?" { return '?'; } + +{WHITESPACE} { } + +\n { + yylloc->last_line++; + yylloc->last_column = 1; +} + +#(line)?[ ][0-9]+[ ]\"(\\.|[^\\"])*\"[^\n]* { + lHandleCppHash(yylloc); +} + +. { + Error(*yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0])); + YY_USER_ACTION +} + +%% + +/*sizeof { return TOKEN_SIZEOF; }*/ +/*"->" { return TOKEN_PTR_OP; }*/ +/*short { return TOKEN_SHORT; }*/ +/*long { return TOKEN_LONG; }*/ +/*signed { return TOKEN_SIGNED; }*/ +/*volatile { return TOKEN_VOLATILE; }*/ +/*"long"[ \t\v\f\n]+"long" { return TOKEN_LONGLONG; }*/ +/*union { return TOKEN_UNION; }*/ +/*"..." { return TOKEN_ELLIPSIS; }*/ + +/** Return the integer version of a binary constant from a string. 
+ */ +static uint32_t +lParseBinary(const char *ptr, SourcePos pos) { + uint32_t val = 0; + bool warned = false; + + while (*ptr != '\0') { + /* if this hits, the regexp for 0b... constants is broken */ + assert(*ptr == '0' || *ptr == '1'); + + if ((val & (1<<31)) && warned == false) { + // We're about to shift out a set bit + // FIXME: 64-bit int constants... + Warning(pos, "Can't represent binary constant with 32-bit integer type"); + warned = true; + } + + val = (val << 1) | (*ptr == '0' ? 0 : 1); + ++ptr; + } + return val; +} + + +/** Handle a C-style comment in the source. + */ +static void +lCComment(SourcePos *pos) { + char c, prev = 0; + + while ((c = yyinput()) != 0) { + if (c == '\n') { + pos->last_line++; + pos->last_column = 1; + } + if (c == '/' && prev == '*') + return; + prev = c; + } + Error(*pos, "unterminated comment"); +} + +/** Handle a C++-style comment--eat everything up until the end of the line. + */ +static void +lCppComment(SourcePos *pos) { + char c; + do { + c = yyinput(); + } while (c != 0 && c != '\n'); + if (c == '\n') { + pos->last_line++; + pos->last_column = 1; + } +} + +/** Handle a line that starts with a # character; this should be something + left behind by the preprocessor indicating the source file/line + that our current position corresponds to. + */ +static void lHandleCppHash(SourcePos *pos) { + char *ptr, *src; + + // Advance past the opening stuff on the line. + assert(yytext[0] == '#'); + if (yytext[1] == ' ') + // On Linux/OSX, the preprocessor gives us lines like + // # 1234 "foo.c" + ptr = yytext + 2; + else { + // On windows, cl.exe's preprocessor gives us lines of the form: + // #line 1234 "foo.c" + assert(!strncmp(yytext+1, "line ", 5)); + ptr = yytext + 6; + } + + // Now we can set the line number based on the integer in the string + // that ptr is pointing at. 
+ pos->last_line = strtol(ptr, &src, 10) - 1; + pos->last_column = 1; + // Make sure that the character after the integer is a space and that + // then we have open quotes + assert(src != ptr && src[0] == ' ' && src[1] == '"'); + src += 2; + + // And the filename is everything up until the closing quotes + std::string filename; + while (*src != '"') { + assert(*src && *src != '\n'); + filename.push_back(*src); + ++src; + } + pos->name = strdup(filename.c_str()); +} + + +/** Given a pointer to a position in a string, return the character that it + represents, accounting for the escape characters supported in string + constants. (i.e. given the literal string "\\", return the character + '/'). The return value is the new position in the string and the + decoded character is returned in *pChar. +*/ +static char * +lEscapeChar(char *str, char *pChar, SourcePos *pos) +{ + if (*str != '\\') { + *pChar = *str; + } + else { + char *tail; + ++str; + switch (*str) { + case '\'': *pChar = '\''; break; + case '\"': *pChar = '\"'; break; + case '?': *pChar = '\?'; break; + case '\\': *pChar = '\\'; break; + case 'a': *pChar = '\a'; break; + case 'b': *pChar = '\b'; break; + case 'f': *pChar = '\f'; break; + case 'n': *pChar = '\n'; break; + case 'r': *pChar = '\r'; break; + case 't': *pChar = '\t'; break; + case 'v': *pChar = '\v'; break; + // octal constants \012 + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + *pChar = strtol(str, &tail, 8); + str = tail - 1; + break; + // hexidecimal constant \xff + case 'x': + *pChar = strtol(str, &tail, 16); + str = tail - 1; + break; + default: + Error(*pos, "Bad character escape sequence: '%s'\n.", str); + break; + } + } + ++str; + return str; +} + + +/** Parse a string constant in the source file. For each character in the + string, handle any escaped characters with lEscapeChar() and keep eating + characters until we come to the closing quote. 
+*/ +static void +lStringConst(YYSTYPE *yylval, SourcePos *pos) +{ + char *p; + std::string str; + p = strchr(yytext, '"') + 1; + while (*p != '\"') { + char cval; + p = lEscapeChar(p, &cval, pos); + str.push_back(cval); + } + yylval->stringVal = new std::string(str); +} diff --git a/llvmutil.cpp b/llvmutil.cpp new file mode 100644 index 00000000..e0fc4511 --- /dev/null +++ b/llvmutil.cpp @@ -0,0 +1,329 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/** @file llvmutil.cpp + @brief Implementations of various LLVM utility types and classes. +*/ + +#include "llvmutil.h" +#include "type.h" + +const llvm::Type *LLVMTypes::VoidType = NULL; +const llvm::PointerType *LLVMTypes::VoidPointerType = NULL; +const llvm::Type *LLVMTypes::BoolType = NULL; +const llvm::Type *LLVMTypes::Int8Type = NULL; +const llvm::Type *LLVMTypes::Int16Type = NULL; +const llvm::Type *LLVMTypes::Int32Type = NULL; +const llvm::Type *LLVMTypes::Int32PointerType = NULL; +const llvm::Type *LLVMTypes::Int64Type = NULL; +const llvm::Type *LLVMTypes::Int64PointerType = NULL; +const llvm::Type *LLVMTypes::FloatType = NULL; +const llvm::Type *LLVMTypes::FloatPointerType = NULL; +const llvm::Type *LLVMTypes::DoubleType = NULL; + +const llvm::VectorType *LLVMTypes::MaskType = NULL; +const llvm::VectorType *LLVMTypes::BoolVectorType = NULL; +const llvm::VectorType *LLVMTypes::Int1VectorType = NULL; +const llvm::VectorType *LLVMTypes::Int32VectorType = NULL; +const llvm::Type *LLVMTypes::Int32VectorPointerType = NULL; +const llvm::VectorType *LLVMTypes::Int64VectorType = NULL; +const llvm::Type *LLVMTypes::Int64VectorPointerType = NULL; +const llvm::VectorType *LLVMTypes::FloatVectorType = NULL; +const llvm::Type *LLVMTypes::FloatVectorPointerType = NULL; +const llvm::VectorType *LLVMTypes::DoubleVectorType = NULL; +const llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL; + +llvm::Constant *LLVMTrue = NULL; +llvm::Constant *LLVMFalse = NULL; +llvm::Constant *LLVMMaskAllOn = NULL; +llvm::Constant *LLVMMaskAllOff = NULL; + + +void +InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { + LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx); + LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0); + LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx); + LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx); + LLVMTypes::Int16Type = llvm::Type::getInt16Ty(*ctx); + LLVMTypes::Int32Type = llvm::Type::getInt32Ty(*ctx); + 
LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0); + LLVMTypes::Int64Type = llvm::Type::getInt64Ty(*ctx); + LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0); + LLVMTypes::FloatType = llvm::Type::getFloatTy(*ctx); + LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); + LLVMTypes::DoubleType = llvm::Type::getDoubleTy(*ctx); + + // Note that both the mask and bool vectors are vector of int32s + // (not i1s). LLVM ends up generating much better SSE code with + // this representation. + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth); + + LLVMTypes::Int1VectorType = + llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth); + LLVMTypes::Int32VectorType = + llvm::VectorType::get(LLVMTypes::Int32Type, target.vectorWidth); + LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0); + LLVMTypes::Int64VectorType = + llvm::VectorType::get(LLVMTypes::Int64Type, target.vectorWidth); + LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0); + LLVMTypes::FloatVectorType = + llvm::VectorType::get(LLVMTypes::FloatType, target.vectorWidth); + LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0); + LLVMTypes::DoubleVectorType = + llvm::VectorType::get(LLVMTypes::DoubleType, target.vectorWidth); + LLVMTypes::VoidPointerVectorType = + llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth); + + LLVMTrue = llvm::ConstantInt::getTrue(*ctx); + LLVMFalse = llvm::ConstantInt::getFalse(*ctx); + + std::vector maskOnes; + llvm::Constant *onMask = NULL; + onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, + true /*signed*/); // 0xffffffff + + for (int i = 0; i < target.vectorWidth; ++i) + maskOnes.push_back(onMask); + LLVMMaskAllOn = llvm::ConstantVector::get(LLVMTypes::MaskType, maskOnes); + 
+ std::vector maskZeros; + llvm::Constant *offMask = NULL; + offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, + true /*signed*/); + + for (int i = 0; i < target.vectorWidth; ++i) + maskZeros.push_back(offMask); + LLVMMaskAllOff = llvm::ConstantVector::get(LLVMTypes::MaskType, maskZeros); +} + + +llvm::ConstantInt *LLVMInt32(int32_t ival) { + return llvm::ConstantInt::get(llvm::Type::getInt32Ty(*g->ctx), ival, + true /*signed*/); +} + + +llvm::ConstantInt * +LLVMUInt32(uint32_t ival) { + return llvm::ConstantInt::get(llvm::Type::getInt32Ty(*g->ctx), ival, + false /*unsigned*/); +} + + +llvm::ConstantInt * +LLVMInt64(int64_t ival) { + return llvm::ConstantInt::get(llvm::Type::getInt64Ty(*g->ctx), ival, + true /*signed*/); +} + + +llvm::ConstantInt * +LLVMUInt64(uint64_t ival) { + return llvm::ConstantInt::get(llvm::Type::getInt64Ty(*g->ctx), ival, + false /*unsigned*/); +} + + +llvm::Constant * +LLVMFloat(float fval) { + return llvm::ConstantFP::get(llvm::Type::getFloatTy(*g->ctx), fval); +} + + +llvm::Constant * +LLVMDouble(double dval) { + return llvm::ConstantFP::get(llvm::Type::getDoubleTy(*g->ctx), dval); +} + + +llvm::Constant * +LLVMInt32Vector(int32_t ival) { + llvm::Constant *v = LLVMInt32(ival); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals); +} + + +llvm::Constant * +LLVMInt32Vector(const int32_t *ivec) { + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMInt32(ivec[i])); + return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals); +} + + +llvm::Constant * +LLVMUInt32Vector(uint32_t ival) { + llvm::Constant *v = LLVMUInt32(ival); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals); +} + + +llvm::Constant * +LLVMUInt32Vector(const uint32_t *ivec) { + std::vector vals; + for 
(int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMUInt32(ivec[i])); + return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals); +} + + +llvm::Constant * +LLVMFloatVector(float fval) { + llvm::Constant *v = LLVMFloat(fval); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::FloatVectorType, vals); +} + + +llvm::Constant * +LLVMFloatVector(const float *fvec) { + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMFloat(fvec[i])); + return llvm::ConstantVector::get(LLVMTypes::FloatVectorType, vals); +} + + +llvm::Constant * +LLVMDoubleVector(double dval) { + llvm::Constant *v = LLVMDouble(dval); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::DoubleVectorType, vals); +} + + +llvm::Constant * +LLVMDoubleVector(const double *dvec) { + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMDouble(dvec[i])); + return llvm::ConstantVector::get(LLVMTypes::DoubleVectorType, vals); +} + + +llvm::Constant * +LLVMInt64Vector(int64_t ival) { + llvm::Constant *v = LLVMInt64(ival); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals); +} + + +llvm::Constant * +LLVMInt64Vector(const int64_t *ivec) { + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMInt64(ivec[i])); + return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals); +} + + +llvm::Constant * +LLVMUInt64Vector(uint64_t ival) { + llvm::Constant *v = LLVMUInt64(ival); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals); +} + + +llvm::Constant * +LLVMUInt64Vector(const uint64_t *ivec) { + std::vector 
vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMUInt64(ivec[i])); + return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals); +} + + +llvm::Constant * +LLVMBoolVector(bool b) { + llvm::Constant *v; + if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, + false /*unsigned*/); + else { + assert(LLVMTypes::BoolVectorType->getElementType() == + llvm::Type::getInt1Ty(*g->ctx)); + v = b ? LLVMTrue : LLVMFalse; + } + + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::BoolVectorType, vals); +} + + +llvm::Constant * +LLVMBoolVector(const bool *bvec) { + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) { + llvm::Constant *v; + if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, + false /*unsigned*/); + else { + assert(LLVMTypes::BoolVectorType->getElementType() == + llvm::Type::getInt1Ty(*g->ctx)); + v = bvec[i] ? LLVMTrue : LLVMFalse; + } + + vals.push_back(v); + } + return llvm::ConstantVector::get(LLVMTypes::BoolVectorType, vals); +} + + +const llvm::ArrayType * +LLVMPointerVectorType(const llvm::Type *t) { + // NOTE: ArrayType, not VectorType + return llvm::ArrayType::get(llvm::PointerType::get(t, 0), + g->target.vectorWidth); +} diff --git a/llvmutil.h b/llvmutil.h new file mode 100644 index 00000000..3a5a4e4c --- /dev/null +++ b/llvmutil.h @@ -0,0 +1,157 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file llvmutil.h + @brief Header file with declarations for various LLVM utility stuff +*/ + +#ifndef ISPC_LLVMUTIL_H +#define ISPC_LLVMUTIL_H 1 + +#include "ispc.h" +#include +#include +#include +#include + +/** This structure holds pointers to a variety of LLVM types; code + elsewhere can use them from here, ratherthan needing to make more + verbose LLVM API calls. 
+ */ +struct LLVMTypes { + static const llvm::Type *VoidType; + static const llvm::PointerType *VoidPointerType; + static const llvm::Type *BoolType; + static const llvm::Type *Int8Type; + static const llvm::Type *Int16Type; + static const llvm::Type *Int32Type; + static const llvm::Type *Int32PointerType; + static const llvm::Type *Int64Type; + static const llvm::Type *Int64PointerType; + static const llvm::Type *FloatType; + static const llvm::Type *FloatPointerType; + static const llvm::Type *DoubleType; + + static const llvm::VectorType *MaskType; + static const llvm::VectorType *BoolVectorType; + static const llvm::VectorType *Int1VectorType; + static const llvm::VectorType *Int32VectorType; + static const llvm::Type *Int32VectorPointerType; + static const llvm::VectorType *Int64VectorType; + static const llvm::Type *Int64VectorPointerType; + static const llvm::VectorType *FloatVectorType; + static const llvm::Type *FloatVectorPointerType; + static const llvm::VectorType *DoubleVectorType; + static const llvm::ArrayType *VoidPointerVectorType; +}; + +/** These variables hold the corresponding LLVM constant values as a + convenience to code elsewhere in the system. + */ +extern llvm::Constant *LLVMTrue, *LLVMFalse; + +/** This should be called early in initialization to initialize the members + of LLVMTypes and the LLVMTrue/LLVMFalse constants. However, it can't + be called until the compilation target is known. 
+ */ +extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target); + +/** Returns an LLVM i32 constant of the given value */ +extern llvm::ConstantInt *LLVMInt32(int32_t i); +/** Returns an LLVM i32 constant of the given value */ +extern llvm::ConstantInt *LLVMUInt32(uint32_t i); +/** Returns an LLVM i64 constant of the given value */ +extern llvm::ConstantInt *LLVMInt64(int64_t i); +/** Returns an LLVM i64 constant of the given value */ +extern llvm::ConstantInt *LLVMUInt64(uint64_t i); +/** Returns an LLVM float constant of the given value */ +extern llvm::Constant *LLVMFloat(float f); +/** Returns an LLVM double constant of the given value */ +extern llvm::Constant *LLVMDouble(double f); + +/** Returns an LLVM boolean vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMBoolVector(bool v); +/** Returns an LLVM i32 vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMInt32Vector(int32_t i); +/** Returns an LLVM i32 vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMUInt32Vector(uint32_t i); +/** Returns an LLVM i64 vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMInt64Vector(int64_t i); +/** Returns an LLVM i64 vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMUInt64Vector(uint64_t i); +/** Returns an LLVM float vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMFloatVector(float f); +/** Returns an LLVM double vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMDoubleVector(double f); + +/** Returns an LLVM boolean vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMBoolVector(const bool *v); +/** Returns an LLVM i32 vector based on the given array of values. 
+ The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMInt32Vector(const int32_t *i); +/** Returns an LLVM i32 vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMUInt32Vector(const uint32_t *i); +/** Returns an LLVM i64 vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMInt64Vector(const int64_t *i); +/** Returns an LLVM i64 vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMUInt64Vector(const uint64_t *i); +/** Returns an LLVM float vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMFloatVector(const float *f); +/** Returns an LLVM double vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMDoubleVector(const double *f); + +/** LLVM constant value representing an 'all on' SIMD lane mask */ +extern llvm::Constant *LLVMMaskAllOn; +/** LLVM constant value representing an 'all off' SIMD lane mask */ +extern llvm::Constant *LLVMMaskAllOff; + +/** Given an LLVM type, returns the corresponding type for a vector of + pointers to that type. (In practice, an array of pointers, since LLVM + prohibits vectors of pointers. + */ +extern const llvm::ArrayType *LLVMPointerVectorType(const llvm::Type *t); + +#endif // ISPC_LLVMUTIL_H diff --git a/main.cpp b/main.cpp new file mode 100644 index 00000000..09034828 --- /dev/null +++ b/main.cpp @@ -0,0 +1,330 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/** @file main.cpp + @brief main() entrypoint implementation for ispc +*/ + +#include "ispc.h" +#include "module.h" +#include +#include +#ifdef LLVM_2_8 +#include +#else +#include +#endif + +#ifdef ISPC_IS_WINDOWS +#define strcasecmp stricmp +#define BUILD_DATE __DATE__ +#define BUILD_VERSION "" +#endif // ISPC_IS_WINDOWS + +static void usage(int ret) { + printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", BUILD_DATE, BUILD_VERSION); + printf("usage: ispc\n"); + printf(" [--arch={x86,x86-64}]\t\tSelect target architecture\n"); + printf(" [--cpu=]\t\t\tSelect target CPU type\n"); + printf(" (atom, barcelona, core2, corei7, corei7-avx, istanbul, nocona,\n"); + printf(" penryn, westmere)\n"); +#ifndef ISPC_IS_WINDOWS + printf(" [-D]\t\t\t\t#define value when running preprocessor\n"); +#endif + printf(" [--debug]\t\t\t\tPrint information useful for debugging ispc\n"); + printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n"); + printf(" [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n"); + printf(" [--emit-obj]\t\t\tGenerate object file file as output\n"); + printf(" [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n"); + printf(" [-g]\t\t\t\tGenerate debugging information\n"); + printf(" [--help]\t\t\t\tPrint help\n"); + printf(" [-h] \t\t\t\tOutput filename for header\n"); + printf(" [--instrument]\t\t\tEmit instrumentation to gather performance data\n"); + printf(" [--math-lib=