Initial commit.

2011-06-21 06:23:29 -07:00
commit 18af5226ba
587 changed files with 45117 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.pyc
+*~
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -0,0 +1,116 @@
+Copyright (c) 2010-2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+===========================================================================
+Copyrights and Licenses for Third Party Software Distributed with
+The Intel(r) SPMD Program Compiler
+===========================================================================
+
+ISPC incorporates code from the Syrah library, which is covered by the
+following license:
+
+Copyright (c) 2009, Stanford University, and authors listed below.
+All rights reserved.
+
+Original authors:
+  Solomon Boulos
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+Neither the name of Stanford University nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+---------------------------------------------------------------------------
+
+Binary distributions of ISPC are linked with the LLVM libraries, which are
+covered by the following license:
+
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
--- a/118
+++ b/118
@@ -0,0 +1,118 @@
+#
+# ispc Makefile
+#
+
+ARCH = $(shell uname)
+
+CLANG=clang
+LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl
+LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
+LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/)
+
+BUILD_DATE=$(shell date +%Y%m%d)
+BUILD_VERSION=$(shell git log | head -1)
+
+CXX=g++
+CPP=cpp
+CXXFLAGS=-g3 $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
+	-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
+
+LDFLAGS=
+ifeq ($(ARCH),Linux)
+  # try to link everything statically under Linux (including libstdc++) so
+  # that the binaries we generate will be portable across distributions...
+  LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
+endif
+
+LEX=flex
+YACC=bison -d -v -t
+
+###########################################################################
+
+CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
+	llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
+	util.cpp
+HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
+	opt.h stmt.h sym.h type.h util.h
+STDLIB_SRC=stdlib-avx.ll stdlib-sse2.ll stdlib-sse4.ll stdlib-sse4x2.ll
+BISON_SRC=parse.yy
+FLEX_SRC=lex.ll
+
+OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(STDLIB_SRC:.ll=.o) stdlib-c.o stdlib_ispc.o \
+	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
+
+default: ispc ispc_test
+
+.PHONY: dirs clean depend doxygen
+.PRECIOUS: objs/stdlib-%.cpp
+
+depend: $(CXX_SRC) $(HEADERS)
+	@echo Updating dependencies
+	@gcc -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend
+
+-include depend
+
+dirs:
+	@echo Creating objs/ directory
+	@/bin/mkdir -p objs
+
+clean:
+	/bin/rm -rf objs ispc ispc_test
+
+doxygen:
+	/bin/rm -rf docs/doxygen
+	doxygen doxygen.cfg
+
+ispc: dirs $(OBJS)
+	@echo Creating ispc executable
+	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(LLVM_LIBS)
+
+ispc_test: dirs ispc_test.cpp
+	@echo Creating ispc_test executable
+	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(LLVM_LIBS)
+
+objs/%.o: %.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+objs/parse.cc: parse.yy
+	@echo Running bison on $<
+	@$(YACC) -o $@ $<
+
+objs/parse.o: objs/parse.cc $(HEADERS)
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+objs/lex.cpp: lex.ll
+	@echo Running flex on $<
+	@$(LEX) -o $@ $<
+
+objs/lex.o: objs/lex.cpp $(HEADERS)
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+$(STDLIB_SRC): stdlib.m4
+
+objs/stdlib-%.cpp: stdlib-%.ll
+	@echo Creating C++ source from stdlib file $<
+	@m4 stdlib.m4 $< | ./bitcode2cpp.py $< > $@
+
+objs/stdlib-%.o: objs/stdlib-%.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+objs/stdlib-c.cpp: stdlib-c.c
+	@echo Creating C++ source from stdlib file $<
+	@$(CLANG) -I /opt/l1om/usr/include/ -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py $< > $@
+
+objs/stdlib-c.o: objs/stdlib-c.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+# Run stdlib.ispc through the C preprocessor (with ISPC and PI predefined)
+# and convert the result into C++ source via stdlib2cpp.py.
+objs/stdlib_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $<
+	@$(CPP) -DISPC=1 -DPI=3.1415926535 $< | ./stdlib2cpp.py > $@
+
+objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
--- a/READMErst.txt
+++ b/READMErst.txt
@@ -0,0 +1,22 @@
+==============================
+Intel(r) SPMD Program Compiler
+==============================
+
+Welcome to the Intel(r) SPMD Program Compiler (ispc)!  
+
+ispc is a new compiler for "single program, multiple data" (SPMD)
+programs. Under the SPMD model, the programmer writes a program that mostly
+appears to be a regular serial program, though the execution model is
+actually that a number of program instances execute in parallel on the
+hardware. ispc compiles a C-based SPMD programming language to run on the
+SIMD units of CPUs; it frequently provides a 3x or more speedup on CPUs
+with 4-wide SSE units, without any of the difficulty of writing intrinsics
+code.
+
+ispc is an open source compiler under the BSD license; see the file
+LICENSE.txt.  ispc supports Windows, Mac, and Linux, with both x86 and
+x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets,
+though support for AVX should be available soon.
+
+For more information and examples, as well as a wiki and the bug database,
+see the ispc distribution site, http://ispc.github.com.
--- a/bitcode2cpp.py
+++ b/bitcode2cpp.py
@@ -0,0 +1,34 @@
+#!/usr/bin/python
+
+import sys
+import string
+import re
+import subprocess
+
+length=0
+
+src=str(sys.argv[1])
+
+target = re.sub(".*stdlib-", "", src)
+target = re.sub("\.ll$", "", target)
+target = re.sub("\.c$", "", target)
+target = re.sub("-", "_", target)
+
+try:
+    as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE)
+except IOError:
+    print >> sys.stderr, "Couldn't open " + src
+    sys.exit(1)
+
+print "unsigned char stdlib_bitcode_" + target + "[] = {"
+for line in as_out.stdout.readlines():
+    length = length + len(line)
+    for c in line:
+        print ord(c)
+        print ", "
+print " 0 };\n\n"
+print "int stdlib_bitcode_" + target + "_length = " + str(length) + ";\n"
+
+as_out.wait()
+
+sys.exit(as_out.returncode)
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -0,0 +1,617 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file builtins.cpp
+    @brief Definitions of functions related to setting up the standard library 
+           and other builtins.
+*/
+
+#include "builtins.h"
+#include "type.h"
+#include "util.h"
+#include "sym.h"
+#include "expr.h"
+#include "llvmutil.h"
+#include "module.h"
+#include "ctx.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <llvm/LLVMContext.h>
+#include <llvm/Module.h>
+#include <llvm/Type.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/Instructions.h>
+#include <llvm/Linker.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+
+extern int yyparse();
+struct yy_buffer_state;
+extern yy_buffer_state *yy_scan_string(const char *);
+
+
+/** Map an LLVM type to an equivalent ispc type, returning NULL when no
+    mapping is known.  The mapping is inherently lossy, since LLVM types
+    carry less information than ispc types (for example, LLVM integer
+    types don't record signedness).
+
+    This function is only used to synthesize ispc declarations for
+    functions defined in the LLVM bitcode from the stdlib-*.ll files, so
+    it only needs to cover the types that show up there.
+ */
+static const Type *
+lLLVMTypeToISPCType(const llvm::Type *t) {
+    // Scalar LLVM types become uniform ispc types.
+    if (t == LLVMTypes::VoidType)
+        return AtomicType::Void;
+    if (t == LLVMTypes::BoolType)
+        return AtomicType::UniformBool;
+    if (t == LLVMTypes::Int32Type)
+        return AtomicType::UniformInt32;
+    if (t == LLVMTypes::FloatType)
+        return AtomicType::UniformFloat;
+    if (t == LLVMTypes::DoubleType)
+        return AtomicType::UniformDouble;
+    if (t == LLVMTypes::Int64Type)
+        return AtomicType::UniformInt64;
+
+    // Vector LLVM types become varying ispc types.
+    if (t == LLVMTypes::Int32VectorType)
+        return AtomicType::VaryingInt32;
+    if (t == LLVMTypes::FloatVectorType)
+        return AtomicType::VaryingFloat;
+    if (t == LLVMTypes::DoubleVectorType)
+        return AtomicType::VaryingDouble;
+    if (t == LLVMTypes::Int64VectorType)
+        return AtomicType::VaryingInt64;
+
+    // The known pointer types become ispc references to the pointee type.
+    if (t == LLVMTypes::Int32PointerType)
+        return new ReferenceType(AtomicType::UniformInt32, false);
+    if (t == LLVMTypes::FloatPointerType)
+        return new ReferenceType(AtomicType::UniformFloat, false);
+    if (t == LLVMTypes::Int32VectorPointerType)
+        return new ReferenceType(AtomicType::VaryingInt32, false);
+    if (t == LLVMTypes::FloatVectorPointerType)
+        return new ReferenceType(AtomicType::VaryingFloat, false);
+
+    // Anything that isn't some other kind of pointer has no ispc
+    // equivalent.
+    const llvm::PointerType *ptrType =
+        llvm::dyn_cast<const llvm::PointerType>(t);
+    if (ptrType == NULL)
+        return NULL;
+
+    // The one remaining supported case is a pointer to an unsized array.
+    // It maps to a reference to an ispc array, since ispc passes arrays
+    // to functions by reference.
+    //
+    // FIXME: generalize this to do more than uniform int32s (that's
+    // all that's necessary for the stdlib currently.)
+    const llvm::ArrayType *arrayType =
+        llvm::dyn_cast<const llvm::ArrayType>(ptrType->getElementType());
+    const bool isUnsizedInt32Array =
+        arrayType != NULL &&
+        arrayType->getNumElements() == 0 &&
+        arrayType->getElementType() == LLVMTypes::Int32Type;
+    if (!isUnsizedInt32Array)
+        return NULL;
+
+    return new ReferenceType(new ArrayType(AtomicType::UniformInt32, 0),
+                             false);
+}
+
+
+/** Try to build an ispc function symbol for the given LLVM function and
+    add it to the symbol table.
+
+    @param func         LLVM function to create a symbol for
+    @param symbolTable  Symbol table that receives the new function symbol
+    @return             true if the symbol was added; false if the return
+                        type or any parameter type has no ispc equivalent
+                        (in which case nothing is added).
+ */
+static bool
+lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
+    SourcePos stdlibPos;
+    stdlibPos.name = "__stdlib";
+
+    const llvm::FunctionType *llvmFuncType = func->getFunctionType();
+
+    // A return type that ispc can't represent means the function isn't
+    // callable from ispc at all.
+    const Type *ispcReturnType =
+        lLLVMTypeToISPCType(llvmFuncType->getReturnType());
+    if (ispcReturnType == NULL)
+        return false;
+
+    // Likewise, every parameter type needs an ispc equivalent.
+    std::vector<const Type *> ispcArgTypes;
+    const unsigned int paramCount = llvmFuncType->getNumParams();
+    for (unsigned int p = 0; p != paramCount; ++p) {
+        const Type *ispcArgType =
+            lLLVMTypeToISPCType(llvmFuncType->getParamType(p));
+        if (ispcArgType == NULL)
+            return false;
+        ispcArgTypes.push_back(ispcArgType);
+    }
+
+    // Everything translated; create the symbol under the LLVM function's
+    // own name and point it at the LLVM function.
+    std::string funcName = func->getName();
+    FunctionType *ispcFuncType =
+        new FunctionType(ispcReturnType, ispcArgTypes, stdlibPos);
+    Symbol *funcSym = new Symbol(funcName, stdlibPos, ispcFuncType);
+    funcSym->function = func;
+    symbolTable->AddFunction(funcSym);
+    return true;
+}
+
+
+/** Walk all of the functions in the given LLVM module and try to create
+    an ispc symbol for each one.  Functions that can't be expressed in
+    ispc are silently skipped (the result of lCreateISPCSymbol() is
+    ignored).
+ */
+static void
+lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
+#if 0
+    // FIXME: handle globals?
+    assert(module->global_empty());
+#endif
+
+    for (llvm::Module::iterator fi = module->begin(), fe = module->end();
+         fi != fe; ++fi) {
+        llvm::Function *llvmFunc = &*fi;
+        lCreateISPCSymbol(llvmFunc, symbolTable);
+    }
+}
+
+/** Declare the function 'bool __is_compile_time_constant_mask(mask type)'
+    in the given module.  This function will never be defined; it's just a
+    placeholder that will be handled during the optimization process.  See
+    the discussion of the implementation of CompileTimeConstantResolvePass
+    for more details.
+
+    @param module  LLVM module to add the declaration to
+ */
+static void
+lDeclareCompileTimeConstant(llvm::Module *module) {
+    // The single parameter is the mask.
+    std::vector<const llvm::Type *> argTypes;
+    argTypes.push_back(LLVMTypes::MaskType);
+
+    llvm::FunctionType *fType =
+        llvm::FunctionType::get(LLVMTypes::BoolType, argTypes, false);
+    llvm::Function *func =
+        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                               "__is_compile_time_constant_mask", module);
+    func->setOnlyReadsMemory(true);
+    func->setDoesNotThrow(true);
+}
+
+
+/** Declare the 'pseudo-gather' functions.  When the ispc front-end needs
+    to perform a gather, it generates a call to one of these functions,
+    which have signatures:
+
+    varying int32 __pseudo_gather_32(varying int32 *, mask)
+    varying int64 __pseudo_gather_64(varying int64 *, mask)
+
+    These functions are never actually implemented; the
+    GatherScatterFlattenOpt optimization pass finds them and then converts
+    them to make calls to the following functions, which represent gathers
+    from a common base pointer with offsets.  This approach allows the
+    front-end to be relatively simple in how it emits address calculation
+    for gathers.
+
+    varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base,
+                                                  varying int32 offsets, mask)
+    varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base,
+                                                  varying int32 offsets, mask)
+
+    Then, the GSImprovementsPass optimization finds these and either
+    converts them to native gather functions or converts them to vector
+    loads, if equivalent.
+
+    In the LLVM declarations below, the pointer parameters are typed as
+    LLVMTypes::VoidPointerVectorType (per-lane pointers) and
+    LLVMTypes::VoidPointerType (common base), and the offsets are
+    LLVMTypes::Int32VectorType for both the 32- and 64-bit variants.
+
+    @param module  LLVM module to add the declarations to
+ */
+static void
+lDeclarePseudoGathers(llvm::Module *module) {
+    // Gathers through a vector of pointers.  The 32- and 64-bit variants
+    // share the same parameter list and differ only in return type.
+    {
+        std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
+        argTypes.push_back(LLVMTypes::MaskType);
+
+        llvm::FunctionType *fType =
+            llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
+        llvm::Function *func =
+            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                                   "__pseudo_gather_32", module);
+        func->setOnlyReadsMemory(true);
+        func->setDoesNotThrow(true);
+
+        fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
+        func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                                      "__pseudo_gather_64", module);
+        func->setOnlyReadsMemory(true);
+        func->setDoesNotThrow(true);
+    }
+
+    // Gathers from a common base pointer plus a vector of offsets.  Again
+    // the two variants differ only in return type.
+    {
+        std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(LLVMTypes::VoidPointerType);
+        argTypes.push_back(LLVMTypes::Int32VectorType);
+        argTypes.push_back(LLVMTypes::MaskType);
+
+        llvm::FunctionType *fType =
+            llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
+        llvm::Function *func =
+            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                                   "__pseudo_gather_base_offsets_32", module);
+        func->setOnlyReadsMemory(true);
+        func->setDoesNotThrow(true);
+
+        fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
+        func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                                      "__pseudo_gather_base_offsets_64", module);
+        func->setOnlyReadsMemory(true);
+        func->setDoesNotThrow(true);
+    }
+}
+
+
+/** Similarly to the 'pseudo-gathers' defined by lDeclarePseudoGathers(),
+    we also declare (but never define) pseudo-scatter instructions with
+    signatures:
+
+    void __pseudo_scatter_32(varying int32 *, varying int32 values, mask)
+    void __pseudo_scatter_64(varying int64 *, varying int64 values, mask)
+
+    The GatherScatterFlattenOpt optimization pass also finds these and
+    transforms them to scatters like:
+
+    void __pseudo_scatter_base_offsets_32(uniform int32 *base,
+                    varying int32 offsets, varying int32 values, mask)
+    void __pseudo_scatter_base_offsets_64(uniform int64 *base,
+                    varying int32 offsets, varying int64 values, mask)
+
+    And the GSImprovementsPass in turn converts these to actual native
+    scatters or masked stores.
+
+    In the LLVM declarations below, the pointer parameters are typed as
+    LLVMTypes::VoidPointerVectorType (per-lane pointers) and
+    LLVMTypes::VoidPointerType (common base), and the offsets are
+    LLVMTypes::Int32VectorType for both the 32- and 64-bit variants.
+
+    @param module  LLVM module to add the declarations to
+*/
+static void
+lDeclarePseudoScatters(llvm::Module *module) {
+    // Scatter of 32-bit values through a vector of pointers.
+    {
+        std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
+        argTypes.push_back(LLVMTypes::Int32VectorType);
+        argTypes.push_back(LLVMTypes::MaskType);
+
+        llvm::FunctionType *fType =
+            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
+        llvm::Function *func =
+            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                                   "__pseudo_scatter_32", module);
+        func->setDoesNotThrow(true);
+    }
+    // Scatter of 64-bit values through a vector of pointers.
+    {
+        std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
+        argTypes.push_back(LLVMTypes::Int64VectorType);
+        argTypes.push_back(LLVMTypes::MaskType);
+
+        llvm::FunctionType *fType =
+            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
+        llvm::Function *func =
+            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                                   "__pseudo_scatter_64", module);
+        func->setDoesNotThrow(true);
+    }
+
+    // Scatter of 32-bit values to a common base pointer plus offsets.
+    {
+        std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(LLVMTypes::VoidPointerType);
+        argTypes.push_back(LLVMTypes::Int32VectorType);   // offsets
+        argTypes.push_back(LLVMTypes::Int32VectorType);   // values
+        argTypes.push_back(LLVMTypes::MaskType);
+
+        llvm::FunctionType *fType =
+            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
+        llvm::Function *func =
+            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                                   "__pseudo_scatter_base_offsets_32", module);
+        func->setDoesNotThrow(true);
+    }
+    // Scatter of 64-bit values to a common base pointer plus offsets.
+    {
+        std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(LLVMTypes::VoidPointerType);
+        argTypes.push_back(LLVMTypes::Int32VectorType);   // offsets
+        argTypes.push_back(LLVMTypes::Int64VectorType);   // values
+        argTypes.push_back(LLVMTypes::MaskType);
+
+        llvm::FunctionType *fType =
+            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
+        llvm::Function *func =
+            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                                   "__pseudo_scatter_base_offsets_64", module);
+        func->setDoesNotThrow(true);
+    }
+}
+
+
+/** This function declares placeholder masked store functions for the
+    front-end to use.
+
+    void __pseudo_masked_store_32(varying int32 *ptr, varying int32 values, mask)
+    void __pseudo_masked_store_64(varying int64 *ptr, varying int64 values, mask)
+
+    (The pointer parameters are declared with
+    LLVMTypes::Int32VectorPointerType and LLVMTypes::Int64VectorPointerType,
+    respectively.)
+
+    These in turn are converted to native masked stores or to regular
+    stores (if the mask is all on) by the MaskedStoreOptPass optimization
+    pass.
+
+    @param module  LLVM module to add the declarations to
+ */
+static void
+lDeclarePseudoMaskedStore(llvm::Module *module) {
+    // 32-bit variant
+    {
+        std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(LLVMTypes::Int32VectorPointerType);
+        argTypes.push_back(LLVMTypes::Int32VectorType);
+        argTypes.push_back(LLVMTypes::MaskType);
+
+        llvm::FunctionType *fType =
+            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
+        llvm::Function *func =
+            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                                   "__pseudo_masked_store_32", module);
+        func->setDoesNotThrow(true);
+        func->addFnAttr(llvm::Attribute::AlwaysInline);
+        // NOTE(review): index 1 presumably designates the first parameter
+        // (the destination pointer) under LLVM's attribute numbering --
+        // confirm against the LLVM version in use.
+        func->setDoesNotCapture(1, true);
+    }
+
+    // 64-bit variant
+    {
+        std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(LLVMTypes::Int64VectorPointerType);
+        argTypes.push_back(LLVMTypes::Int64VectorType);
+        argTypes.push_back(LLVMTypes::MaskType);
+
+        llvm::FunctionType *fType =
+            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
+        llvm::Function *func =
+            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                                   "__pseudo_masked_store_64", module);
+        func->setDoesNotThrow(true);
+        func->addFnAttr(llvm::Attribute::AlwaysInline);
+        // Same parameter index as the 32-bit variant above.
+        func->setDoesNotCapture(1, true);
+    }
+}
+
+
+/** This utility function takes serialized binary LLVM bitcode and adds its
+    definitions to the given module.  Functions in the bitcode that can be
+    mapped to ispc functions are also added to the symbol table.
+
+    Parse and link failures are reported through Error(); on a parse
+    failure nothing is linked or added to the symbol table.
+
+    @param bitcode     Binary LLVM bitcode (e.g. the contents of a *.bc file)
+    @param length      Length of the bitcode buffer
+    @param module      Module to link the bitcode into
+    @param symbolTable Symbol table to add definitions to
+ */
+static void
+lAddBitcode(const unsigned char *bitcode, int length,
+            llvm::Module *module, SymbolTable *symbolTable) {
+    // Wrap the raw bytes in a StringRef, keeping them const.
+    llvm::StringRef sb(reinterpret_cast<const char *>(bitcode), length);
+
+    // NOTE(review): bcBuf is never released in this function; confirm
+    // whether ParseBitcodeFile() takes ownership of the buffer in the LLVM
+    // versions this file is built against.
+    llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
+
+    std::string bcErr;
+    llvm::Module *bcModule = llvm::ParseBitcodeFile(bcBuf, *g->ctx, &bcErr);
+    if (!bcModule) {
+        Error(SourcePos(), "Error parsing stdlib bitcode: %s", bcErr.c_str());
+        return;
+    }
+
+    std::string linkError;
+    if (llvm::Linker::LinkModules(module, bcModule, &linkError))
+        Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
+
+    // Symbols are created from the destination module (not just from the
+    // newly-parsed bitcode), and this happens even if linking reported an
+    // error.
+    lAddModuleSymbols(module, symbolTable);
+}
+
+
+/** Create a static, constant uniform int32 variable with the given name
+    and value.  The variable is registered in the ispc symbol table and is
+    backed by a constant global with internal linkage in the LLVM module.
+
+    @param name         Name of the variable
+    @param val          Value of the constant
+    @param module       LLVM module that receives the global variable
+    @param symbolTable  Symbol table that receives the symbol
+ */
+static void
+lDefineConstantInt(const char *name, int val, llvm::Module *module,
+                   SymbolTable *symbolTable) {
+    Symbol *sym = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
+    sym->isStatic = true;
+    sym->constValue = new ConstExpr(sym->type, val, SourcePos());
+
+    // Backing storage: a constant LLVM global initialized to the value.
+    llvm::Constant *initializer = LLVMInt32(val);
+    const bool isConstant = true;
+    sym->storagePtr =
+        new llvm::GlobalVariable(*module, LLVMTypes::Int32Type, isConstant,
+                                 llvm::GlobalValue::InternalLinkage,
+                                 initializer, sym->name.c_str());
+
+    symbolTable->AddVariable(sym);
+}
+
+
+/** Define the static, constant varying int32 variable 'programIndex',
+    whose value in lane i is i, for each lane up to the target's vector
+    width.  The variable is registered in the ispc symbol table and is
+    backed by a constant global with internal linkage in the LLVM module.
+
+    @param module       LLVM module that receives the global variable
+    @param symbolTable  Symbol table that receives the symbol
+ */
+static void
+lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
+    // Only the first g->target.vectorWidth entries are filled in.
+    int laneIds[ISPC_MAX_NVEC];
+    for (int lane = 0; lane < g->target.vectorWidth; ++lane)
+        laneIds[lane] = lane;
+
+    Symbol *sym = new Symbol("programIndex", SourcePos(),
+                             AtomicType::VaryingConstInt32);
+    sym->isStatic = true;
+    sym->constValue = new ConstExpr(sym->type, laneIds, SourcePos());
+
+    // Backing storage: a constant LLVM global holding the lane indices.
+    llvm::Constant *initializer = LLVMInt32Vector(laneIds);
+    const bool isConstant = true;
+    sym->storagePtr =
+        new llvm::GlobalVariable(*module, LLVMTypes::Int32VectorType,
+                                 isConstant,
+                                 llvm::GlobalValue::InternalLinkage,
+                                 initializer, sym->name.c_str());
+
+    symbolTable->AddVariable(sym);
+}
+
+
+void
+DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
+             bool includeStdlibISPC) {
+    // Add the definitions from the compiled stdlib-c.c file
+    extern unsigned char stdlib_bitcode_c[];
+    extern int stdlib_bitcode_c_length;
+    lAddBitcode(stdlib_bitcode_c, stdlib_bitcode_c_length, module, symbolTable);
+
+    // Next, add the target's custom implementations of the various needed
+    // builtin functions (e.g. __masked_store_32(), etc).
+    switch (g->target.isa) {
+    case Target::SSE2:
+        extern unsigned char stdlib_bitcode_sse2[];
+        extern int stdlib_bitcode_sse2_length;
+        lAddBitcode(stdlib_bitcode_sse2, stdlib_bitcode_sse2_length, module,
+                    symbolTable);
+        break;
+    case Target::SSE4:
+        extern unsigned char stdlib_bitcode_sse4[];
+        extern int stdlib_bitcode_sse4_length;
+        extern unsigned char stdlib_bitcode_sse4x2[];
+        extern int stdlib_bitcode_sse4x2_length;
+        switch (g->target.vectorWidth) {
+        case 4: 
+            lAddBitcode(stdlib_bitcode_sse4, stdlib_bitcode_sse4_length, 
+                        module, symbolTable);
+            break;
+        case 8:
+            lAddBitcode(stdlib_bitcode_sse4x2, stdlib_bitcode_sse4x2_length, 
+                        module, symbolTable);
+            break;
+        default:
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    case Target::AVX:
+        extern unsigned char stdlib_bitcode_avx[];
+        extern int stdlib_bitcode_avx_length;
+        lAddBitcode(stdlib_bitcode_avx, stdlib_bitcode_avx_length, module, 
+                    symbolTable);
+        break;
+    default:
+        FATAL("logic error");
+    }
+
+    // Add a declaration of void *ISPCMalloc(int64_t).  The user is
+    // responsible for linking in a definition of this if it's needed by
+    // the compiled program.
+    { std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(llvm::Type::getInt64Ty(*ctx));
+        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, 
+                                                            argTypes, false);
+        llvm::Function *func = 
+            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
+                                   "ISPCMalloc", module);
+        func->setDoesNotThrow(true);
+    }
+
+    // Add a declaration of void ISPCFree(void *).  The user is
+    // responsible for linking in a definition of this if it's needed by
+    // the compiled program.
+    { std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(LLVMTypes::VoidPointerType);
+        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, 
+                                                            argTypes, false);
+        llvm::Function *func = 
+            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
+                                   "ISPCFree", module);
+        func->setDoesNotThrow(true);
+    }
+
+    // Add a declaration of void ISPCLaunch(void *funcPtr, void *data).
+    // The user is responsible for linking in a definition of this if it's
+    // needed by the compiled program.
+    { std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(LLVMTypes::VoidPointerType);
+        argTypes.push_back(LLVMTypes::VoidPointerType);
+        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, 
+                                                            argTypes, false);
+        llvm::Function *func = 
+            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
+                                   "ISPCLaunch", module);
+        func->setDoesNotThrow(true);
+    }
+
+    // Add a declaration of void ISPCSync().  The user is responsible for
+    // linking in a definition of this if it's needed by the compiled
+    // program.
+    { 
+        std::vector<const llvm::Type *> argTypes;
+        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, 
+                                                            argTypes, false);
+        llvm::Function *func = 
+            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
+                                   "ISPCSync", module);
+        func->setDoesNotThrow(true);
+    }
+
+    // Add a declaration of void ISPCInstrument(void *, void *, int, int).
+    // The user is responsible for linking in a definition of this if it's
+    // needed by the compiled program.
+    { 
+        std::vector<const llvm::Type *> argTypes;
+        argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
+        argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
+        argTypes.push_back(LLVMTypes::Int32Type);
+        argTypes.push_back(LLVMTypes::Int32Type);
+        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, 
+                                                            argTypes, false);
+        llvm::Function *func = 
+            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
+                                   "ISPCInstrument", module);
+        func->setDoesNotThrow(true);
+    }
+
+    // Declare various placeholder functions that the optimizer will later
+    // find and replace with something more useful.
+    lDeclareCompileTimeConstant(module);
+    lDeclarePseudoGathers(module);
+    lDeclarePseudoScatters(module);
+    lDeclarePseudoMaskedStore(module);
+
+    // define the 'programCount' builtin variable
+    lDefineConstantInt("programCount", g->target.vectorWidth, module, symbolTable);
+
+    // define the 'programIndex' builtin
+    lDefineProgramIndex(module, symbolTable);
+
+    // Define __math_lib stuff.  This is used by stdlib.ispc, for example, to
+    // figure out which math routines to end up calling...
+    lDefineConstantInt("__math_lib", (int)g->mathLib, module, symbolTable);
+    lDefineConstantInt("__math_lib_ispc", (int)Globals::Math_ISPC, module,
+                       symbolTable);
+    lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, 
+                       module, symbolTable);
+    lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module,
+                       symbolTable);
+    lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
+                       symbolTable);
+
+    if (includeStdlibISPC) {
+        // If the user wants the standard library to be included, parse the
+        // serialized version of the stdlib.ispc file to get its definitions
+        // added.
+        extern const char *stdlib_code;
+        yy_scan_string(stdlib_code);
+        yyparse();
+    }
+}
--- a/builtins.h
+++ b/builtins.h
@@ -0,0 +1,58 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file builtins.h
+    @brief Declarations of functions related to builtins and the 
+           standard library
+*/
+
+#ifndef ISPC_STDLIB_H
+#define ISPC_STDLIB_H 1
+
+#include "ispc.h"
+
+/** Adds declarations and definitions of ispc standard library functions
+    and types to the given module.
+
+    The definition adds the target-independent builtins bitcode followed
+    by the variant selected by g->target.isa (and, for SSE4, by
+    g->target.vectorWidth); an unhandled target is reported via FATAL().
+    It then declares the runtime entry points that the user is expected
+    to link in (ISPCMalloc, ISPCFree, ISPCLaunch, ISPCSync and
+    ISPCInstrument), declares the placeholder functions that the
+    optimizer later replaces, and defines the builtin 'programCount',
+    'programIndex' and '__math_lib*' symbols.
+
+    @param symbolTable     SymbolTable in which to add symbol definitions for
+                           stdlib stuff
+    @param ctx             llvm::LLVMContext to use for getting types and the
+                           like for standard library definitions
+    @param module          Module in which to add the declarations/definitions
+    @param includeStdlib   Indicates whether the definitions from the stdlib.ispc
+                           file should be added to the module.  When true, the
+                           serialized text of stdlib.ispc is handed to the
+                           lexer (yy_scan_string()) and parsed with yyparse()
+                           as the final step.
+ */
+void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
+                  bool includeStdlib);
+
+#endif // ISPC_STDLIB_H
--- a/ctx.cpp
+++ b/ctx.cpp
--- a/ctx.h
+++ b/ctx.h
@@ -0,0 +1,507 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file ctx.h
+    @brief Declaration of the FunctionEmitContext class
+*/
+
+#ifndef ISPC_CTX_H
+#define ISPC_CTX_H 1
+
+#include "ispc.h"
+#include <llvm/InstrTypes.h>
+#include <llvm/Instructions.h>
+#ifndef LLVM_2_8
+#include <llvm/Analysis/DIBuilder.h>
+#endif
+#include <llvm/Analysis/DebugInfo.h>
+
+struct CFInfo;
+
+/** FunctionEmitContext is one of the key classes in ispc; it is used to
+    help with emitting the intermediate representation of a function during
+    compilation.  It carries information about the current program context
+    during IR emission (e.g. the basic block into which instructions should
+    be added; or, the current source file and line number, so debugging
+    symbols can be correctly generated).  This class also provides a number
+    of helper routines that are useful for code that emits IR.
+ */
+class FunctionEmitContext {
+public:
+    /** Create a new FunctionEmitContext.
+        @param returnType   The return type of the function
+        @param function     LLVM function in the current module that corresponds
+                            to the function
+        @param funSym       Symbol that corresponds to the function
+        @param firstStmtPos Source file position of the first statement in the
+                            function
+     */
+    FunctionEmitContext(const Type *returnType, llvm::Function *function, Symbol *funSym,
+                        SourcePos firstStmtPos);
+    ~FunctionEmitContext();
+
+    /** @name Current basic block management
+        @{
+     */
+    /** Returns the current basic block pointer */ 
+    llvm::BasicBlock *GetCurrentBasicBlock();
+    
+    /** Set the given llvm::BasicBlock to be the basic block to emit
+        forthcoming instructions into. */
+    void SetCurrentBasicBlock(llvm::BasicBlock *bblock);
+    /** @} */
+    /** @name Mask management
+        @{
+     */
+    /** Returns the current mask value */ 
+    llvm::Value *GetMask();
+
+    /** Provides the value of the mask at function entry */
+    void SetEntryMask(llvm::Value *val);
+
+    /** Sets the mask to a new value */
+    void SetMask(llvm::Value *val);
+
+    /** Sets the mask to (oldMask & val) */
+    void MaskAnd(llvm::Value *oldMask, llvm::Value *val);
+
+    /** Sets the mask to (oldMask & ~test) */
+    void MaskAndNot(llvm::Value *oldMask, llvm::Value *test);
+
+    /** Emits a branch instruction to the basic block btrue if any of the
+        lanes of current mask are on and bfalse if none are on. */
+    void BranchIfMaskAny(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse);
+
+    /** Emits a branch instruction to the basic block btrue if all of the
+        lanes of current mask are on and bfalse if they are not all on. */
+    void BranchIfMaskAll(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse);
+
+    /** Emits a branch instruction to the basic block btrue if none of the
+        lanes of current mask are on and bfalse if any are on. */
+    void BranchIfMaskNone(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse);
+    /** @} */
+
+    /** @name Control flow management
+        @{
+    */
+    /** Notifies the FunctionEmitContext that we're starting emission of an
+        'if' statement with a uniform test.  The value of the mask going
+        into the 'if' statement is provided in the oldMask parameter. */
+    void StartUniformIf(llvm::Value *oldMask);
+
+    /** Notifies the FunctionEmitContext that we're starting emission of an
+        'if' statement with a varying test.  The value of the mask going
+        into the 'if' statement is provided in the oldMask parameter. */
+    void StartVaryingIf(llvm::Value *oldMask);
+
+    /** Notifies the FunctionEmitContext that we're done emitting the IR
+        for an 'if' statement. */
+    void EndIf();
+
+    /** Notifies the FunctionEmitContext that we're starting to emit IR
+        for a loop.  Basic blocks are provided for where 'break' and
+        'continue' statements should jump to (if all running lanes want to
+        break or continue), uniformControlFlow indicates whether the loop
+        condition is 'uniform', and oldMask provides the current mask going
+        into the loop. */
+    void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget, 
+                   bool uniformControlFlow, llvm::Value *oldMask);
+
+    /** Informs FunctionEmitContext of the value of the mask at the start
+        of a loop body. */
+    void SetLoopMask(llvm::Value *mask);
+
+    /** Informs FunctionEmitContext that code generation for a loop is
+        finished. */
+    void EndLoop();
+
+    /** Emit code for a 'break' statement in a loop.  If doCoherenceCheck
+        is true, then if we're in a 'varying' loop, code will be emitted to
+        see if all of the lanes want to break, in which case a jump to the
+        break target will be taken.  (For 'uniform' loops, the jump is
+        always done). */
+    void Break(bool doCoherenceCheck);
+
+    /** Emit code for a 'continue' statement in a loop.  If
+        doCoherenceCheck is true, then if we're in a 'varying' loop, code
+        will be emitted to see if all of the lanes want to continue, in
+        which case a jump to the continue target will be taken.  (For
+        'uniform' loops, the jump is always done). */
+    void Continue(bool doCoherenceCheck);
+
+    /** This method is called by code emitting IR for a loop at the end of
+        the loop body; it restores the lanes of the mask that executed a
+        'continue' statement when going through the loop body in the
+        previous iteration. */
+    void RestoreContinuedLanes();
+
+    /** Returns the current number of nested levels of 'varying' control
+        flow */
+    int VaryingCFDepth() const;
+
+    /** Called to generate code for 'return' statement; value is the
+        expression in the return statement (if non-NULL), and
+        doCoherenceCheck indicates whether instructions should be generated
+        to see if all of the currently-running lanes have returned (if
+        we're under varying control flow).  */
+    void CurrentLanesReturned(Expr *value, bool doCoherenceCheck);
+    /** @} */
+
+    /** @name Small helper/utility routines
+        @{ 
+    */
+    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
+        i1 value that indicates if any of the mask lanes are on. */
+    llvm::Value *Any(llvm::Value *mask);
+
+    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
+        i1 value that indicates if all of the mask lanes are on. */
+    llvm::Value *All(llvm::Value *mask);
+
+    /** Given a boolean mask value of type LLVMTypes::MaskType, return an
+        i32 value wherein the i'th bit is on if and only if the i'th lane
+        of the mask is on. */
+    llvm::Value *LaneMask(llvm::Value *mask);
+
+    /** Given two masks of type LLVMTypes::MaskType, return an i1 value
+        that indicates whether the two masks are equal. */
+    llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
+
+    /** Given a string, create an anonymous global variable to hold its
+        value and return the pointer to the string. */
+    llvm::Value *GetStringPtr(const std::string &str);
+
+    /** Create a new basic block with given name */
+    llvm::BasicBlock *CreateBasicBlock(const char *name);
+
+    /** Given a vector with element type i1, return a vector of type
+        LLVMTypes::BoolVectorType.  This method handles the conversion for
+        the targets where the bool vector element type is, for example,
+        i32. */
+    llvm::Value *I1VecToBoolVec(llvm::Value *b);
+
+    /** Emit code to call the user-supplied ISPCMalloc function to
+        allocate space for an object of the given type.  Returns the
+        pointer value returned by the ISPCMalloc call. */
+    llvm::Value *EmitMalloc(const llvm::Type *ty);
+
+    /** Emit code to call the user-supplied ISPCFree function, passing it
+        the given pointer to storage previously allocated by an
+        EmitMalloc() call. */
+    void EmitFree(llvm::Value *ptr);
+
+    /** If the user has asked to compile the program with instrumentation,
+        this inserts a callback to the user-supplied instrumentation
+        function at the current point in the code. */
+    void AddInstrumentationPoint(const char *note);
+    /** @} */
+
+    /** @name Debugging support
+        @{
+    */
+    /** Set the current source file position; subsequent emitted
+        instructions will have this position associated with them if
+        debugging information is being generated. */
+    void SetDebugPos(SourcePos pos);
+
+    SourcePos GetDebugPos() const;
+
+    /** Adds debugging metadata to the given instruction.  If pos == NULL,
+        use FunctionEmitContext::currentPos as the source file position for
+        the instruction.  Similarly, if a DIScope is provided, it's used
+        and otherwise the scope is found from a GetDIScope() call.  This
+        takes a llvm::Value for the instruction rather than an
+        llvm::Instruction for convenience; in calling code we often have
+        Instructions stored using Value pointers; the code here returns
+        silently if it's not actually given an instruction. */
+    void AddDebugPos(llvm::Value *instruction, const SourcePos *pos = NULL, 
+                     llvm::DIScope *scope = NULL);
+
+    /** Inform the debugging information generation code that a new scope
+        is starting in the source program. */
+    void StartScope();
+
+    /** Inform the debugging information generation code that the current
+        scope is ending in the source program. */
+    void EndScope();
+
+    /** Returns the llvm::DIScope corresponding to the current program
+        scope. */
+    llvm::DIScope GetDIScope() const;
+
+    /** Emits debugging information for the variable represented by
+        sym.  */
+    void EmitVariableDebugInfo(Symbol *sym);
+
+    /** Emits debugging information for the function parameter represented
+        by sym.  */
+    void EmitFunctionParameterDebugInfo(Symbol *sym);
+    /** @} */
+
+    /** @name IR instruction emission
+        @brief These methods generally closely correspond to LLVM IR
+        instructions.  See the LLVM assembly language reference manual
+        (http://llvm.org/docs/LangRef.html) and the LLVM doxygen documentation
+        (http://llvm.org/doxygen) for more information.  Here we will only
+        document significant generalizations to the functionality of the
+        corresponding basic LLVM instructions.
+
+        Beyond actually emitting the instruction, the implementations of
+        these methods in FunctionEmitContext also handle adding debugging
+        metadata if debugging symbols are enabled, adding the instructions
+        to the current basic block, and handling generalizations like
+        'varying' lvalues, arithmetic operations with VectorType operands,
+        etc.
+        @{
+    */
+    /** Emit the binary operator given by the inst parameter.  If
+        llvm::Values corresponding to VectorTypes are given as operands,
+        this also handles applying the given operation to the vector
+        elements. */
+    llvm::Value *BinaryOperator(llvm::Instruction::BinaryOps inst,
+                                llvm::Value *v0, llvm::Value *v1, 
+                                const char *name = NULL);
+
+    /** Emit the "not" operator.  Like BinaryOperator(), this also handles
+        a VectorType-based operand. */
+    llvm::Value *NotOperator(llvm::Value *v, const char *name = NULL);
+
+    /** Emit a comparison instruction.  If the operands are VectorTypes,
+        then a value for the corresponding boolean VectorType is
+        returned. */
+    llvm::Value *CmpInst(llvm::Instruction::OtherOps inst, 
+                         llvm::CmpInst::Predicate pred,
+                         llvm::Value *v0, llvm::Value *v1, const char *name = NULL);
+
+    llvm::Value *BitCastInst(llvm::Value *value, const llvm::Type *type,
+                             const char *name = NULL);
+    llvm::Instruction *PtrToIntInst(llvm::Value *value, const llvm::Type *type,
+                                    const char *name = NULL);
+    llvm::Instruction *IntToPtrInst(llvm::Value *value, const llvm::Type *type,
+                                    const char *name = NULL);
+    llvm::Instruction *TruncInst(llvm::Value *value, const llvm::Type *type,
+                                 const char *name = NULL);
+    llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
+                                const llvm::Type *type, const char *name = NULL);
+    llvm::Instruction *FPCastInst(llvm::Value *value, const llvm::Type *type, 
+                                  const char *name = NULL);
+    llvm::Instruction *SExtInst(llvm::Value *value, const llvm::Type *type, 
+                                const char *name = NULL);
+    llvm::Instruction *ZExtInst(llvm::Value *value, const llvm::Type *type, 
+                                const char *name = NULL);
+
+    /** This GEP method is a generalization of the standard one in LLVM; it
+        supports both uniform and varying basePtr values (an array of
+        pointers) as well as uniform and varying index values (arrays of
+        indices). */
+    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
+                                   llvm::Value *index1, const char *name = NULL);
+
+    /** This is a convenience method to generate a GEP instruction with
+        indices whose constant values are known as the ispc program is
+        being compiled. */
+    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, int v0, int v1,
+                                   const char *name = NULL);
+
+    /** Load from the memory location(s) given by lvalue.  The lvalue may
+        be varying, in which case this corresponds to a gather from the
+        multiple memory locations given by the array of pointer values
+        given by the lvalue.  If the lvalue is not varying, then the type
+        parameter may be NULL. */
+    llvm::Value *LoadInst(llvm::Value *lvalue, const Type *type,
+                          const char *name = NULL);
+
+    /** Emits an alloca instruction to allocate stack storage for the given
+        type.  If a non-zero alignment is specified, the object is also
+        allocated at the given alignment.  By default, the alloca
+        instruction is added at the start of the function in the entry
+        basic block; if it should be added to the current basic block, then
+        the atEntryBlock parameter should be false. */ 
+    llvm::Value *AllocaInst(const llvm::Type *llvmType, const char *name = NULL,
+                            int align = 0, bool atEntryBlock = true);
+
+    /** Standard store instruction; for this variant, the lvalue must be a
+        single pointer, not a varying lvalue. */
+    void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, 
+                   const char *name = NULL);
+
+    /** In this variant of StoreInst(), the lvalue may be varying.  If so,
+        this corresponds to a scatter.  Whether the lvalue is uniform or
+        varying, the given storeMask is used to mask the stores so that
+        they only execute for the active program instances. */
+    void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
+                   llvm::Value *storeMask, const Type *rvalueType,
+                   const char *name = NULL);
+
+    void BranchInst(llvm::BasicBlock *block);
+    void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
+                    llvm::Value *test);
+
+    /** This convenience method maps to an llvm::ExtractElementInst if the
+        given value is a llvm::VectorType, and to an llvm::ExtractValueInst
+        otherwise. */
+    llvm::Value *ExtractInst(llvm::Value *v, int elt, const char *name = NULL);
+
+    /** This convenience method maps to an llvm::InsertElementInst if the
+        given value is a llvm::VectorType, and to an llvm::InsertValueInst
+        otherwise. */
+    llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, 
+                            const char *name = NULL);
+
+    llvm::PHINode *PhiNode(const llvm::Type *type, int count, const char *name = NULL);
+    llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
+                                  llvm::Value *val1, const char *name = NULL);
+
+    llvm::Instruction *CallInst(llvm::Function *func, 
+                                const std::vector<llvm::Value *> &args,
+                                const char *name = NULL);
+    /** This is a convenience method that issues a call instruction to a
+        function that takes just a single argument. */
+    llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg,
+                                const char *name = NULL);
+
+    /** This is a convenience method that issues a call instruction to a
+        function that takes two arguments. */
+    llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg0,
+                                llvm::Value *arg1, const char *name = NULL);
+
+    /** Launch an asynchronous task to run the given function, passing it
+        the given argument values. */
+    llvm::Instruction *LaunchInst(llvm::Function *callee, 
+                                  std::vector<llvm::Value *> &argVals);
+
+    llvm::Instruction *ReturnInst();
+    /** @} */
+
+private:
+    /** The basic block into which we add any alloca instructions that need
+        to go at the very start of the function. */
+    llvm::BasicBlock *allocaBlock;
+
+    /** The current basic block into which we're emitting new
+        instructions */
+    llvm::BasicBlock *bblock;
+
+    /** Pointer to stack-allocated memory that stores the current value of
+        the program mask. */
+    llvm::Value *maskPtr;
+
+    /** Current source file position; if debugging information is being
+        generated, this position is used to set file/line information for
+        instructions. */
+    SourcePos currentPos;
+
+    /** Source file position where the function definition started.  Used
+        for error messages and debugging symbols. */
+    SourcePos funcStartPos;
+
+    /** Type of result that the current function returns. */
+    const Type *returnType;
+
+    /** Value of the program mask when the function starts execution.  */
+    llvm::Value *entryMask;
+
+    /** If currently in a loop body, the value of the mask at the start of
+        the loop. */
+    llvm::Value *loopMask;
+
+    /** If currently in a loop body, this is a pointer to memory to store a
+        mask value that represents which of the lanes have executed a
+        'break' statement.  If we're not in a loop body, this should be
+        NULL. */
+    llvm::Value *breakLanesPtr;
+
+    /** Similar to breakLanesPtr, if we're inside a loop, this is a pointer
+        to memory to record which of the program instances have executed a
+        'continue' statement. */
+    llvm::Value *continueLanesPtr;
+
+    /** If we're inside a loop, this gives the basic block immediately
+        after the current loop, which we will jump to if all of the lanes
+        have executed a break statement or are otherwise done with the
+        loop. */
+    llvm::BasicBlock *breakTarget;
+
+    /** If we're inside a loop, this gives the block to jump to if all of
+        the running lanes have executed a 'continue' statement. */
+    llvm::BasicBlock *continueTarget;
+
+    /** A pointer to memory that records which of the program instances
+        have executed a 'return' statement (and are thus really truly done
+        running any more instructions in this function). */
+    llvm::Value *returnedLanesPtr;
+
+    /** A pointer to memory to store the return value for the function.
+        Since different program instances may execute 'return' statements
+        at different times, we need to accumulate the return values as they
+        come in until we return for real. */
+    llvm::Value *returnValuePtr;
+
+    /** The CFInfo structure records information about a nesting level of
+        control flow.  This vector lets us see what control flow is going
+        around outside the current position in the function being
+        emitted. */
+    std::vector<CFInfo *> controlFlowInfo;
+
+    /** DIFile object corresponding to the source file where the current
+        function was defined (used for debugging info). */
+    llvm::DIFile diFile;
+
+    /** DISubprogram corresponding to this function (used for debugging
+        info). */
+    llvm::DISubprogram diFunction;
+
+    /** These correspond to the current set of nested scopes in the
+        function. */
+    std::vector<llvm::DILexicalBlock> debugScopes;
+
+    /** True if a 'launch' statement has been encountered in the function. */
+    bool launchedTasks;
+
+    llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
+    static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
+    bool ifsInLoopAllUniform() const;
+    void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
+    llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
+
+    void restoreMaskGivenReturns(llvm::Value *oldMask);
+
+    void scatter(llvm::Value *rvalue, llvm::Value *lvalue, 
+                 llvm::Value *maskPtr, const Type *rvalueType);
+    llvm::Value *gather(llvm::Value *lvalue, const Type *type,
+                        const char *name);
+    void maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
+                     const Type *rvalueType, llvm::Value *maskPtr);
+};
+
+#endif // ISPC_CTX_H
--- a/decl.cpp
+++ b/decl.cpp
@@ -0,0 +1,348 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file decl.cpp
+    @brief Implementations of classes related to turning declarations into 
+           symbols and types.
+*/
+
+#include "decl.h"
+#include "util.h"
+#include "sym.h"
+#include "type.h"
+#include "expr.h"
+#include <stdio.h>
+
+///////////////////////////////////////////////////////////////////////////
+// DeclSpecs
+
+/** Records the base type, storage class and TYPEQUAL_* bits supplied by
+    the caller; the vector size and SOA width both start out at zero. */
+DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq)
+    : storageClass(sc), typeQualifier(tq), baseType(t), vectorSize(0),
+      soaWidth(0) {
+}
+
+
+/** Prints the specifiers to standard output: the storage class keyword
+    first, then any soa<n> width, the type qualifiers, the name of the
+    base type and finally the <n> vector size if one was given. */
+void
+DeclSpecs::Print() const {
+    switch (storageClass) {
+    case SC_EXTERN:   printf("extern ");       break;
+    case SC_EXTERN_C: printf("extern \"C\" "); break;
+    case SC_EXPORT:   printf("export ");       break;
+    case SC_STATIC:   printf("static ");       break;
+    case SC_TYPEDEF:  printf("typedef ");      break;
+    case SC_NONE:                              break;
+    }
+
+    if (soaWidth > 0) printf("soa<%d> ", soaWidth);
+
+    if (typeQualifier & TYPEQUAL_INLINE)    printf("inline ");
+    if (typeQualifier & TYPEQUAL_CONST)     printf("const ");
+    if (typeQualifier & TYPEQUAL_UNIFORM)   printf("uniform ");
+    if (typeQualifier & TYPEQUAL_VARYING)   printf("varying ");
+    if (typeQualifier & TYPEQUAL_TASK)      printf("task ");
+    if (typeQualifier & TYPEQUAL_REFERENCE) printf("reference ");
+    if (typeQualifier & TYPEQUAL_UNSIGNED)  printf("unsigned ");
+
+    // baseType is NULL when the declaration didn't name a type (NULL is
+    // the constructor's default), so it must not be dereferenced blindly.
+    if (baseType != NULL)
+        printf("%s", baseType->GetString().c_str());
+    else
+        printf("<no type>");
+
+    if (vectorSize > 0) printf("<%d>", vectorSize);
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Declarator
+
+/** Creates a declarator for the given symbol at the given source
+    position.  It starts out as a non-function declarator with no array
+    dimensions, no argument list and no initializer expression. */
+Declarator::Declarator(Symbol *s, SourcePos p)
+    : pos(p), sym(s), initExpr(NULL), isFunction(false), functionArgs(NULL) {
+}
+
+
+/** Appends one more array dimension to the declarator.  A size of -1
+    stands for a dimension declared without a size; any other value must
+    be positive. */
+void
+Declarator::AddArrayDimension(int size) {
+    assert(size == -1 || size > 0);
+    arraySize.push_back(size);
+}
+
+
+/** Finishes setting up the declarator's symbol from the declaration
+    specifiers: stores the resolved type in it and flags it as static when
+    the declaration uses the "static" storage class. */
+void
+Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
+    const Type *resolved = GetType(ds);
+    sym->type = resolved;
+
+    // The flag is only ever set here, never cleared.
+    const bool declaredStatic = (ds->storageClass == SC_STATIC);
+    if (declaredStatic)
+        sym->isStatic = true;
+}
+
+
+/** Prints the symbol's name, then the parenthesized initializer
+    expression (when there is one), then the source position. */
+void
+Declarator::Print() const {
+    fputs(sym->name.c_str(), stdout);
+    if (initExpr) {
+        fputs(" = (", stdout);
+        initExpr->Print();
+        fputs(")", stdout);
+    }
+    pos.Print();
+}
+
+
+/** Computes the type that a declarator gets from the given declaration
+    specifiers, peeling off one array dimension per recursive call.
+
+    @param decl       Declarator whose type is being computed.
+    @param ds         Declaration specifiers shared by the declaration.
+    @param arrayIter  Position in decl->arraySize of the next dimension to
+                      process; decl->arraySize.end() once all are done.
+    @return The resulting type.  NULL is returned (after reporting an
+            error) if no base type was given, if a vector of a non-atomic
+            type was requested, if the soa<n> width isn't a power of two
+            or doesn't evenly divide the array size, or if the type of an
+            inner dimension couldn't be determined.
+ */
+static const Type *
+lGetType(const Declarator *decl, DeclSpecs *ds, 
+         std::vector<int>::const_iterator arrayIter) {
+    if (arrayIter == decl->arraySize.end()) {
+        // If we don't have an array (or have processed all of the array
+        // dimensions in previous recursive calls), we can go ahead and
+        // figure out the final non-array type we have here.
+        const Type *type = ds->baseType;
+        if (type == NULL) {
+            Error(decl->pos, "Type not provided in variable declaration for variable \"%s\".",
+                  decl->sym->name.c_str());
+            return NULL;
+        }
+
+        // Account for 'unsigned' and 'const' qualifiers in the type
+        if ((ds->typeQualifier & TYPEQUAL_UNSIGNED) != 0) {
+            const Type *unsignedType = type->GetAsUnsignedType();
+            if (unsignedType != NULL)
+                type = unsignedType;
+            else
+                // Report the problem but carry on with the original type.
+                Error(decl->pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
+                      type->GetString().c_str());
+        }
+        if ((ds->typeQualifier & TYPEQUAL_CONST) != 0)
+            type = type->GetAsConstType();
+
+        if (ds->vectorSize > 0) {
+            const AtomicType *atomicType = dynamic_cast<const AtomicType *>(type);
+            if (atomicType == NULL) {
+                Error(decl->pos, "Only atomic types (int, float, ...) are legal for vector "
+                      "types.");
+                return NULL;
+            }
+            type = new VectorType(atomicType, ds->vectorSize);
+        }
+
+        // if uniform/varying is specified explicitly, then go with that
+        if ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0)
+            return type->GetAsUniformType();
+        else if ((ds->typeQualifier & TYPEQUAL_VARYING) != 0)
+            return type->GetAsVaryingType();
+        else {
+            // otherwise, structs are uniform by default and everything
+            // else is varying by default
+            if (dynamic_cast<const StructType *>(type) != NULL)
+                return type->GetAsUniformType();
+            else
+                return type->GetAsVaryingType();
+        }
+    }
+    else {
+        // Peel off one dimension of the array
+        int arraySize = *arrayIter;
+        ++arrayIter;
+
+        // Get the type, not including the arraySize dimension peeled off
+        // above.
+        const Type *childType = lGetType(decl, ds, arrayIter);
+        if (childType == NULL)
+            // The error has already been reported by the recursive call;
+            // don't build an array of (or dereference) a NULL type.
+            return NULL;
+
+        // -1 marks an unsized dimension; the array types are given an
+        // element count of zero for those.
+        int elementCount = (arraySize == -1) ? 0 : arraySize;
+
+        int soaWidth = ds->soaWidth;
+        if (soaWidth == 0)
+            // If there's no "soa<n>" stuff going on, just return a regular
+            // array with the appropriate size 
+            return new ArrayType(childType, elementCount);
+        else {
+            // Make sure we actually have an array of structs ..
+            const StructType *childStructType = 
+                dynamic_cast<const StructType *>(childType);
+            if (childStructType == NULL) {
+                Error(decl->pos, "Illegal to provide soa<%d> qualifier with non-struct "
+                      "type \"%s\".", soaWidth, childType->GetString().c_str());
+                // Unlike the errors below, this one still hands back a
+                // regular array type.
+                return new ArrayType(childType, elementCount);
+            }
+            else if ((soaWidth & (soaWidth - 1)) != 0) {
+                Error(decl->pos, "soa<%d> width illegal.  Value must be power of two.",
+                      soaWidth);
+                return NULL;
+            }
+            else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
+                Error(decl->pos, "soa<%d> width must evenly divide array size %d.",
+                      soaWidth, arraySize);
+                return NULL;
+            }
+            return new SOAArrayType(childStructType, elementCount, soaWidth);
+        }
+    }
+}
+
+
+/** Returns the type that results from combining this declarator with the
+    given declaration specifiers.
+
+    For a function declarator the result is a FunctionType built from the
+    return type described by `ds` and the types of the declarations in
+    functionArgs.  As a side effect, unnamed parameters are given a
+    synthesized symbol and declarator, array-typed parameters have their
+    symbol's type changed to a reference to the array, and a missing
+    return type is replaced by void in `ds` (with a warning).
+
+    For any other declarator the result is the variable's type, wrapped in
+    a ReferenceType if the "reference" qualifier is present.
+
+    @param ds  Declaration specifiers of the declaration this declarator
+               belongs to.
+    @return The type, or NULL if an error that prevents computing it was
+            reported.
+ */
+const Type *
+Declarator::GetType(DeclSpecs *ds) const {
+    bool hasUniformQual = ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0);
+    bool hasVaryingQual = ((ds->typeQualifier & TYPEQUAL_VARYING) != 0);
+    bool isTask =         ((ds->typeQualifier & TYPEQUAL_TASK) != 0);
+    bool isReference =    ((ds->typeQualifier & TYPEQUAL_REFERENCE) != 0);
+
+    if (hasUniformQual && hasVaryingQual) {
+        Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
+        return NULL;
+    }
+
+    if (isFunction) {
+        std::vector<const Type *> args;
+        std::vector<std::string> argNames;
+        if (functionArgs) {
+            // Loop over the function arguments and get names and types for
+            // each one in the args and argNames arrays
+            for (unsigned int i = 0; i < functionArgs->size(); ++i) {
+                Declaration *d = (*functionArgs)[i];
+                Symbol *argSym;
+                if (d->declarators.size() == 0) {
+                    // function declaration like foo(float), w/o a name for
+                    // the parameter
+                    char buf[32];
+                    sprintf(buf, "__anon_parameter_%u", i);
+                    argSym = new Symbol(buf, pos);
+                    Declarator *declarator = new Declarator(argSym, argSym->pos);
+                    // The parameter's type is described by the specifiers
+                    // of its own declaration; `ds` holds the specifiers
+                    // of the enclosing function (i.e. its return type).
+                    argSym->type = declarator->GetType(d->declSpecs);
+                    d->declarators.push_back(declarator);
+                }
+                else {
+                    assert(d->declarators.size() == 1);
+                    argSym = d->declarators[0]->sym;
+                }
+
+                // Arrays are passed by reference, so convert array
+                // parameters to be references here.
+                if (dynamic_cast<const ArrayType *>(argSym->type) != NULL)
+                    argSym->type = new ReferenceType(argSym->type, 
+                                                     argSym->type->IsConstType());
+
+                args.push_back(argSym->type);
+                argNames.push_back(argSym->name);
+            }
+        }
+
+        if (ds->baseType == NULL) {
+            Warning(pos, "No return type provided in declaration of function \"%s\". "
+                    "Treating as \"void\".", sym->name.c_str());
+            ds->baseType = AtomicType::Void;
+        }
+
+        if (isReference) {
+            Error(pos, "Function return types can't be reference types.");
+            return NULL;
+        }
+
+        const Type *returnType = lGetType(this, ds, arraySize.begin());
+        if (returnType == NULL)
+            return NULL;
+
+        bool isExported = (ds->storageClass == SC_EXPORT);
+        bool isExternC =  (ds->storageClass == SC_EXTERN_C);
+        return new FunctionType(returnType, args, pos, &argNames, isTask, 
+                                isExported, isExternC);
+    }
+    else {
+        // The error is reported, but the variable's type is still computed.
+        if (isTask)
+            Error(pos, "\"task\" qualifier illegal in variable declaration \"%s\".",
+                  sym->name.c_str());
+
+        const Type *type = lGetType(this, ds, arraySize.begin());
+
+        if (type != NULL && isReference) {
+            bool hasConstQual = ((ds->typeQualifier & TYPEQUAL_CONST) != 0);
+            type = new ReferenceType(type, hasConstQual);
+        }
+
+        return type;
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Declaration
+
+/** Registers the symbol of every non-NULL declarator in this declaration
+    with the given symbol table as a variable.  Must not be called for
+    typedef declarations. */
+void
+Declaration::AddSymbols(SymbolTable *st) const {
+    assert(declSpecs->storageClass != SC_TYPEDEF);
+
+    for (unsigned int i = 0; i < declarators.size(); ++i) {
+        Declarator *declarator = declarators[i];
+        if (declarator == NULL)
+            continue;
+        st->AddVariable(declarator->sym);
+    }
+}
+
+
+/** Prints the declaration to standard output in the form
+    "Declaration: specs [...], declarators [a, b, ...]". */
+void
+Declaration::Print() const {
+    printf("Declaration: specs [");
+    declSpecs->Print();
+    printf("], declarators [");
+    for (unsigned int i = 0 ; i < declarators.size(); ++i) {
+        if (i > 0)
+            printf(", ");
+        // NULL entries are tolerated by the constructors and by
+        // AddSymbols(), so they have to be tolerated here as well.
+        if (declarators[i] != NULL)
+            declarators[i]->Print();
+    }
+    // Close the list unconditionally so that a declaration without any
+    // declarators doesn't leave the bracket open.
+    printf("]");
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+/** Walks the struct member declarations in order and appends the type and
+    the name of each declared member to the two output vectors.  Each
+    declarator's symbol gets its type initialized along the way; members
+    that are unsized arrays end up as references to the array type. */
+void
+GetStructTypesAndNames(const std::vector<StructDeclaration *> &sd,
+                       std::vector<const Type *> *elementTypes,
+                       std::vector<std::string> *elementNames) {
+    for (unsigned int i = 0; i < sd.size(); ++i) {
+        const StructDeclaration *structDecl = sd[i];
+        const Type *type = structDecl->type;
+
+        // FIXME: making this fake little DeclSpecs here is really
+        // disgusting
+        DeclSpecs ds(type);
+        ds.typeQualifier |= type->IsUniformType() ? TYPEQUAL_UNIFORM :
+                                                    TYPEQUAL_VARYING;
+
+        const std::vector<Declarator *> &members = *structDecl->declarators;
+        for (unsigned int j = 0; j < members.size(); ++j) {
+            Declarator *member = members[j];
+            member->InitFromDeclSpecs(&ds);
+            Symbol *memberSym = member->sym;
+
+            // An array member with an element count of zero (i.e. an
+            // unsized one) is turned into a reference to the unsized
+            // array, so the caller can pass a pointer...
+            const ArrayType *asArray = 
+                dynamic_cast<const ArrayType *>(memberSym->type);
+            if (asArray != NULL && asArray->GetElementCount() == 0)
+                memberSym->type = new ReferenceType(memberSym->type, 
+                                                    type->IsConstType());
+
+            elementTypes->push_back(memberSym->type);
+            elementNames->push_back(memberSym->name);
+        }
+    }
+}
--- a/decl.h
+++ b/decl.h
@@ -0,0 +1,203 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file decl.h
+    @brief Declarations related to type declarations; the parser basically
+    creates instances of these classes, which are then turned into actual
+    Types.
+
+    Three classes work together to represent declarations.  As an example,
+    consider a declaration like:
+
+    static uniform int foo, bar[10];
+
+    An instance of the Declaration class represents this entire declaration
+    of two variables, 'foo' and 'bar'.  It holds a single instance of the
+    DeclSpecs class, which represents the common specifiers for all of the
+    variables--here, that the declaration has the 'static' and 'uniform'
+    qualifiers, and that its basic type is 'int'.  Then for each variable
+    declaration, the Declaration class holds an instance of a Declarator,
+    which in turn records the per-variable information like the symbol
+    name, array size (if any), initializer expression, etc.
+*/
+
+#ifndef ISPC_DECL_H
+#define ISPC_DECL_H
+
+#include "ispc.h"
+
+/** Storage class specified in a declaration.  A DeclSpecs holds exactly
+    one of these values. */
+enum StorageClass {
+    SC_NONE,      // no storage class given (the DeclSpecs default)
+    SC_EXTERN,    // printed as "extern"
+    SC_EXPORT,    // printed as "export"; function types are marked exported
+    SC_STATIC,    // printed as "static"; the symbol is marked static
+    SC_TYPEDEF,   // printed as "typedef"; adds no symbols via AddSymbols()
+    SC_EXTERN_C   // printed as extern "C"; function types are marked extern "C"
+};
+
+
+/* Multiple qualifiers can be provided with types in declarations;
+   therefore, each one is a distinct bit so that they can be ORed together
+   into an int and tested individually with a bitwise AND. */
+#define TYPEQUAL_NONE           0
+#define TYPEQUAL_CONST      (1<<0)
+#define TYPEQUAL_UNIFORM    (1<<1)
+#define TYPEQUAL_VARYING    (1<<2)
+#define TYPEQUAL_TASK       (1<<3)
+#define TYPEQUAL_REFERENCE  (1<<4)
+#define TYPEQUAL_UNSIGNED   (1<<5)
+#define TYPEQUAL_INLINE     (1<<6)
+
+/** @brief Representation of the declaration specifiers in a declaration.
+
+    In other words, this represents all of the stuff that applies to all of
+    the (possibly multiple) variables in a declaration.
+ */
+class DeclSpecs {
+public:
+    /** Creates the specifiers from a base type, a storage class and a set
+        of TYPEQUAL_* bits; all three are optional.  The vector size and
+        SOA width start out as zero. */
+    DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE, int tq = TYPEQUAL_NONE);
+
+    /** Prints the specifiers to standard output. */
+    void Print() const;
+
+    /** Storage class of the declaration; SC_NONE if none was given. */
+    StorageClass storageClass;
+
+    /** Zero or more of the TYPEQUAL_* values, ORed together. */
+    int typeQualifier;
+
+    /** The basic type provided in the declaration; this should be an
+        AtomicType, a StructType, or a VectorType; other types (like
+        ArrayTypes) will end up being created if a particular declaration
+        has an array size, etc.  May be NULL if no type was provided.
+    */
+    const Type *baseType;
+
+    /** If this is a declaration with a vector type, this gives the vector
+        width.  For non-vector types, this is zero.
+     */
+    int vectorSize;
+
+    /** If this is a declaration with an "soa<n>" qualifier, this gives the
+        SOA width specified.  Otherwise this is zero.
+     */
+    int soaWidth;
+};
+
+
+/** @brief Representation of the declaration of a single variable.  
+
+    In conjunction with an instance of the DeclSpecs, this gives us
+    everything we need for a full variable declaration.
+ */
+class Declarator {
+public:
+    /** Creates a declarator for the given symbol, recorded as appearing
+        at the given source position.  It initially has no array
+        dimensions, no initializer and isn't a function declarator. */
+    Declarator(Symbol *s, SourcePos p);
+
+    /** As the parser peels off array dimension declarations after the
+        symbol name, it calls this method to provide them to the
+        Declarator.  The size must be positive, or -1 for an unsized
+        dimension.
+     */
+    void AddArrayDimension(int size);
+
+    /** Once a DeclSpecs instance is available, this method completes the
+        initialization of the Symbol, setting its Type accordingly.
+     */
+    void InitFromDeclSpecs(DeclSpecs *ds);
+
+    /** Get the actual type of the combination of Declarator and the given
+        DeclSpecs.  Returns NULL if an error prevented the type from being
+        determined. */
+    const Type *GetType(DeclSpecs *ds) const;
+
+    /** Prints the symbol name, the initializer expression (if any) and
+        the source position. */
+    void Print() const;
+
+    /** Source position associated with the declarator; used when
+        reporting errors and warnings about it. */
+    const SourcePos pos;
+    /** The symbol being declared. */
+    Symbol *sym;
+    /** If this declarator includes an array specification, the sizes of
+        the array dimensions are represented here.
+     */
+    std::vector<int> arraySize;
+    /** Initialization expression for the variable.  May be NULL. */
+    Expr *initExpr;
+    /** True if this declarator declares a function rather than a
+        variable. */
+    bool isFunction;
+    /** For function declarators, the declarations of the function's
+        parameters.  May be NULL. */
+    std::vector<Declaration *> *functionArgs;
+};
+
+
+/** @brief Representation of a full declaration of one or more variables,
+    including the shared DeclSpecs as well as the per-variable Declarators.
+ */
+class Declaration {
+public:
+    /** Builds a declaration from its specifiers and an optional list of
+        declarators.  The list is copied, and every non-NULL declarator in
+        it has its symbol initialized from the specifiers. */
+    Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL)
+        : declSpecs(ds) {
+        if (dlist != NULL)
+            declarators = *dlist;
+        for (unsigned int i = 0; i < declarators.size(); ++i) {
+            Declarator *declarator = declarators[i];
+            if (declarator == NULL)
+                continue;
+            declarator->InitFromDeclSpecs(declSpecs);
+        }
+    }
+
+    /** Builds a declaration with at most one declarator.  A NULL
+        declarator gives a declaration that has specifiers only. */
+    Declaration(DeclSpecs *ds, Declarator *d)
+        : declSpecs(ds) {
+        if (d == NULL)
+            return;
+        d->InitFromDeclSpecs(ds);
+        declarators.push_back(d);
+    }
+
+    /** Adds the symbols for the variables in the declaration to the symbol
+        table. */
+    void AddSymbols(SymbolTable *st) const;
+
+    /** Prints the specifiers and all of the declarators. */
+    void Print() const;
+
+    /** Specifiers shared by all of the declarators below. */
+    DeclSpecs *declSpecs;
+    /** One entry per declared name; entries may be NULL. */
+    std::vector<Declarator *> declarators;
+};
+
+
+/** The parser creates instances of StructDeclaration for the members of
+    structs as it's parsing their declarations. */
+struct StructDeclaration {
+    /** Stores the two pointers as given; nothing is copied. */
+    StructDeclaration(const Type *t, std::vector<Declarator *> *d)
+        : type(t), declarators(d) { }
+
+    /** Base type shared by the members declared in this declaration. */
+    const Type *type;
+    /** Declarators of the members declared with that type. */
+    std::vector<Declarator *> *declarators;
+};
+
+
+/** Given a set of StructDeclaration instances, this returns the types of
+    the elements of the corresponding struct and their names.  The results
+    are appended to elementTypes and elementNames, one entry per member,
+    in declaration order.  Members declared as unsized arrays are returned
+    as references to the array type. */
+extern void GetStructTypesAndNames(const std::vector<StructDeclaration *> &sd,
+                                   std::vector<const Type *> *elementTypes,
+                                   std::vector<std::string> *elementNames);
+
+#endif // ISPC_DECL_H
--- a/docs/build.sh
+++ b/docs/build.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Renders the reStructuredText documentation in ispc.txt to ispc.html.
+# Both paths are relative to the current directory, and rst2html is looked
+# up on the PATH.
+rst2html ispc.txt > ispc.html
+
+# Disabled: LaTeX/PDF build of the same document, followed by removal of
+# the intermediate files it leaves behind.
+#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
+#pdflatex ispc.tex
+#/bin/rm -f ispc.aux ispc.log ispc.out ispc.tex
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
--- a/doxygen.cfg
+++ b/doxygen.cfg
--- a/examples/README.txt
+++ b/examples/README.txt
@@ -0,0 +1,88 @@
+====================
+ISPC Examples README
+====================
+
+This directory has a number of sample ispc programs.  Before building them
+(on any system), install the appropriate ispc compiler binary into a
+directory in your path.  Then, if you're running Windows, open the
+"examples.sln" file and build from there.  For building under Linux/OSX,
+there are makefiles in each directory that build the examples individually.
+
+Almost all of them benchmark ispc implementations of the given computation
+against regular serial C++ implementations, printing out a comparison of
+the runtimes and the speedup delivered by ispc.  It may be instructive to
+do a side-by-side diff of the C++ and ispc implementations of these
+algorithms to learn more about writing ispc code.
+ 
+AOBench
+=======
+
+This is an ISPC implementation of the "AO bench" benchmark
+(http://syoyo.wordpress.com/2009/01/26/ao-bench-is-evolving/).  The command
+line arguments are:
+
+ao (num iterations) (xres) (yres)
+
+It executes the program for the given number of iterations, rendering an
+(xres x yres) image each time and measuring the computation time with both
+serial and ispc implementations.
+
+AOBench_Instrumented
+====================
+
+This version of AO Bench is compiled with the --instrument ispc compiler
+flag.  This causes the compiler to emit calls to a (user-supplied)
+ISPCInstrument() function at interesting places in the compiled code.  An
+example implementation of this function that counts the number of times the
+callback is made and records some statistics about control flow coherence
+is provided in the instrument.cpp file.
+
+*** Note: on Linux, this example currently hits an assertion in LLVM during
+*** compilation
+
+Mandelbrot
+==========
+
+Mandelbrot set generation.  This example is extensively documented at the
+http://ispc.github.com/example.html page.
+
+Mandelbrot_tasks
+================
+
+Implementation of Mandelbrot set generation that also parallelizes across
+cores using tasks.  Under Windows, a simple task system built on
+Microsoft's Concurrency Runtime is used (see tasks_concrt.cpp).  On OSX, a
+task system based on Grand Central Dispatch is used (tasks_gcd.cpp), and on
+Linux, a pthreads-based task system is used (tasks_pthreads.cpp).  When
+using tasks with ispc, no task system is mandated; the user is free to plug
+in any task system they want, for ease of interoperating with existing task
+systems.
+ 
+Options
+=======
+
+This program implements both the Black-Scholes and Binomial options pricing
+models in both ispc and regular serial C++ code.
+
+RT
+==
+
+This is a simple ray tracer; it reads in camera parameters and a bounding
+volume hierarchy and renders the scene from the given viewpoint.  The
+command line arguments are:
+
+rt <scene base name>
+
+Where <scene base name> is one of "cornell", "teapot", or "sponza".
+
+The implementation originally derives from the bounding volume hierarchy
+and triangle intersection code from pbrt; see the pbrt source code and/or
+the "Physically Based Rendering" book for more about the basic algorithmic
+details.
+
+Simple
+======
+
+This is a simple "hello world" type program that shows a ~10 line
+application program calling out to a ~5 line ispc program to do a simple
+computation.
--- a/examples/aobench/Makefile
+++ b/examples/aobench/Makefile
@@ -0,0 +1,26 @@
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall
+ISPC=ispc
+ISPCFLAGS=-O2 --fast-math
+
+default: ao
+
+.PHONY: dirs clean
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ ao
+
+# "dirs" is phony, so listing it as a normal prerequisite would force a
+# relink on every run.  It is instead an order-only prerequisite (after
+# the "|") of the rules that write into objs/, which also guarantees that
+# the directory exists before anything is compiled in a parallel build.
+ao: objs/ao.o objs/ao_serial.o objs/ao_ispc.o
+	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread
+
+objs/%.o: %.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# ao.cpp includes the header that ispc generates.
+objs/ao.o: objs/ao_ispc.h 
+
+objs/%_ispc.h objs/%_ispc.o: %.ispc | dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/aobench/ao.cpp
+++ b/examples/aobench/ao.cpp
@@ -0,0 +1,182 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#ifdef __linux__
+#include <malloc.h>
+#endif
+#include <math.h>
+#include <map>
+#include <string>
+#include <algorithm>
+#include <sys/types.h>
+
+#include "ao_ispc.h"
+using namespace ispc;
+
+#include "../timing.h"
+
+// Subsample count handed to both ao_ispc() and ao_serial(); main()
+// asserts that it is 2.
+#define NSUBSAMPLES        2
+
+// Serial implementation that the ispc version is timed against; it is
+// only declared here.
+extern void ao_serial(int w, int h, int nsubsamples, float image[]);
+
+// Run parameters, filled in from the command line by main().
+static unsigned int test_iterations;
+static unsigned int width, height;
+// 8-bit output image (3 bytes per pixel), filled and written by savePPM().
+static unsigned char *img;
+// Float image (3 floats per pixel) that the renderers write into.
+static float *fimg;
+
+
+// Converts a float color component to a byte: the value is scaled by
+// 255.5, truncated toward zero and limited to the range [0, 255].
+// The range check is done before the conversion to an integer, since
+// converting a NaN or a value that doesn't fit in an int is undefined
+// behavior.  NaN maps to 0.
+static unsigned char
+clamp(float f)
+{
+    const double scaled = f * 255.5;
+
+    // Everything below 1 truncates to zero or less; the negated test also
+    // catches NaN, for which every comparison is false.
+    if (!(scaled >= 1.0))
+        return 0;
+    if (scaled >= 255.0)
+        return 255;
+
+    return (unsigned char)(int)scaled;
+}
+
+
+// Converts the w x h float image in fimg to bytes in img and writes the
+// result to the named file as a binary ("P6") PPM.  Prints a message and
+// exits the program if the file can't be opened.
+static void
+savePPM(const char *fname, int w, int h)
+{
+    for (int row = 0; row < h; ++row) {
+        for (int col = 0; col < w; ++col)  {
+            // Offset of the first of the pixel's three components
+            const int base = 3 * (row * w + col);
+            for (int channel = 0; channel < 3; ++channel)
+                img[base + channel] = clamp(fimg[base + channel]);
+        }
+    }
+
+    FILE *out = fopen(fname, "wb");
+    if (out == NULL) {
+        perror(fname);
+        exit(1);
+    }
+
+    // Header: magic number, image dimensions, maximum component value
+    fprintf(out, "P6\n");
+    fprintf(out, "%d %d\n", w, h);
+    fprintf(out, "255\n");
+    fwrite(img, w * h * 3, 1, out);
+    fclose(out);
+}
+
+
+// Allocate memory with 64-byte alignment.  Returns NULL if the
+// allocation fails.
+//
+// In the __APPLE__ branch the pointer returned by malloc() is stored just
+// below the aligned address that is handed out, so the block must not be
+// passed to free() directly there.
+float *
+AllocAligned(int size) {
+#if defined(_WIN32) || defined(_WIN64)
+    return (float *)_aligned_malloc(size, 64);
+#elif defined (__APPLE__)
+    // Allocate excess memory to ensure an aligned pointer can be returned
+    void *mem = malloc(size + (64-1) + sizeof(void*));
+    if (mem == NULL)
+        return NULL;
+    char *amem = ((char*)mem) + sizeof(void*);
+    // Round up to the next 64-byte boundary.  An address that is already
+    // aligned has to stay where it is: moving it a full 64 bytes ahead
+    // would push the end of the block one byte past the allocation.
+    size_t misalignment = reinterpret_cast<size_t>(amem) & (64 - 1);
+    if (misalignment != 0)
+        amem += 64 - misalignment;
+    ((void**)amem)[-1] = mem;
+    return (float *)amem;
+#else
+    return (float *)memalign(64, size);
+#endif
+}
+
+
+// Renders the image test_iterations times with the ispc implementation
+// and then with the serial one, reports the minimum time of each in
+// millions of cycles along with the resulting speedup, and saves the
+// last image of each as ao-ispc.ppm / ao-serial.ppm.
+//
+// Usage: ao [num test iterations] [width] [height]
+int main(int argc, char **argv)
+{
+    if (argc != 4) {
+        printf ("%s\n", argv[0]);
+        printf ("Usage: ao [num test iterations] [width] [height]\n");
+        getchar();
+        exit(-1);
+    }
+
+    // atoi() gives 0 for text that isn't a number and may give negative
+    // values; neither makes sense here, and a negative value would turn
+    // into a huge count once stored in the unsigned globals.
+    const int iterationsArg = atoi(argv[1]);
+    const int widthArg = atoi(argv[2]);
+    const int heightArg = atoi(argv[3]);
+    if (iterationsArg <= 0 || widthArg <= 0 || heightArg <= 0) {
+        fprintf(stderr, "ao: iteration count, width and height must all be "
+                "positive integers\n");
+        exit(-1);
+    }
+    test_iterations = iterationsArg;
+    width = widthArg;
+    height = heightArg;
+
+    // Allocate space for output images
+    img = (unsigned char *)AllocAligned(width * height * 3);
+    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    if (img == NULL || fimg == NULL) {
+        fprintf(stderr, "ao: unable to allocate memory for a %u x %u image\n",
+                width, height);
+        exit(1);
+    }
+
+    //
+    // Run the ispc path, test_iterations times, and report the minimum
+    // time for any of them.
+    //
+    double minTimeISPC = 1e30;
+    for (unsigned int i = 0; i < test_iterations; i++) {
+        memset((void *)fimg, 0, sizeof(float) * width * height * 3);
+        assert(NSUBSAMPLES == 2);
+
+        reset_and_start_timer();
+        ao_ispc(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_mcycles();
+        minTimeISPC = std::min(minTimeISPC, t);
+    }
+
+    // Report results and save image
+    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%u x %u image)\n", minTimeISPC, 
+           width, height);
+    savePPM("ao-ispc.ppm", width, height); 
+
+    //
+    // Run the serial path, again test_iteration times, and report the
+    // minimum time.
+    //
+    double minTimeSerial = 1e30;
+    for (unsigned int i = 0; i < test_iterations; i++) {
+        memset((void *)fimg, 0, sizeof(float) * width * height * 3);
+        reset_and_start_timer();
+        ao_serial(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_mcycles();
+        minTimeSerial = std::min(minTimeSerial, t);
+    }
+
+    // Report more results, save another image...
+    printf("[aobench serial]:\t\t[%.3f] M cycles (%u x %u image)\n", minTimeSerial, 
+           width, height);
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
+    savePPM("ao-serial.ppm", width, height); 
+        
+    return 0;
+}
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -0,0 +1,317 @@
+// -*- mode: c++ -*-
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+/*
+  Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
+*/
+
+#define NAO_SAMPLES		8
+#define M_PI 3.1415926535f
+
+typedef float<3> vec;
+
+struct Isect {       // Nearest intersection found so far along a ray
+    float      t;    // ray parameter of the hit; a new hit must have a smaller t to replace it
+    vec        p;    // hit position, ray.org + ray.dir * t
+    vec        n;    // surface normal recorded for the hit
+    int        hit;  // set to 1 once an intersection has been recorded
+};
+
+struct Sphere {          // Sphere primitive tested by ray_sphere_intersect
+    vec        center;   // center position
+    float      radius;   // radius (squared inside ray_sphere_intersect)
+
+};
+
+struct Plane {   // Unbounded plane tested by ray_plane_intersect
+    vec    p;    // a point on the plane
+    vec    n;    // plane normal
+};
+
+struct Ray {     // Ray; points along it are org + t * dir
+    vec org;     // origin
+    vec dir;     // direction (the sphere test's formula takes dot(dir, dir) == 1)
+};
+
+static inline float dot(vec a, vec b) {   // dot product of two 3-vectors
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+static inline vec vcross(vec v0, vec v1) {   // cross product v0 x v1
+    vec ret;
+    ret.x = v0.y * v1.z - v0.z * v1.y;
+    ret.y = v0.z * v1.x - v0.x * v1.z;
+    ret.z = v0.x * v1.y - v0.y * v1.x;
+    return ret;
+}
+
+static inline void vnormalize(reference vec v) {   // scale v in place by 1/|v|
+    float len2 = dot(v, v);       // squared length
+    float invlen = rsqrt(len2);   // no guard here: a zero-length v is not special-cased
+    v *= invlen;
+}
+
+
+static inline void   // Intersect ray with plane; record the hit in isect if it is the nearest so far
+ray_plane_intersect(reference Isect isect, reference Ray ray, 
+                    reference Plane plane) {
+    float d = -dot(plane.p, plane.n);   // plane equation: dot(n, x) + d = 0
+    float v = dot(ray.dir, plane.n);    // denominator of the ray/plane solve
+    // |v| this small means the ray is (nearly) parallel to the plane: report no hit.
+    cif (abs(v) < 1.0e-17) 
+        return;
+    else {
+        float t = -(dot(ray.org, plane.n) + d) / v;   // ray parameter at the plane
+        // Accept only hits in front of the origin and nearer than the current best.
+        cif ((t > 0.0) && (t < isect.t)) {
+            isect.t = t;
+            isect.hit = 1;
+            isect.p = ray.org + ray.dir * t;
+            isect.n = plane.n;
+        }
+    }
+}
+
+
+static inline void   // Intersect ray with sphere; record the hit in isect if it is the nearest so far
+ray_sphere_intersect(reference Isect isect, reference Ray ray, 
+                     reference Sphere sphere) {
+    vec rs = ray.org - sphere.center;   // vector from sphere center to ray origin
+    // Solve |rs + t*dir|^2 = radius^2 taking dot(dir, dir) == 1: t = -B +/- sqrt(B*B - C).
+    float B = dot(rs, ray.dir);
+    float C = dot(rs, rs) - sphere.radius * sphere.radius;
+    float D = B * B - C;
+    // D <= 0: the ray misses the sphere or only grazes it.
+    cif (D > 0.) {
+        float t = -B - sqrt(D);   // nearer of the two roots; the farther one is never tested
+        // Accept only hits in front of the origin and nearer than the current best.
+        cif ((t > 0.0) && (t < isect.t)) {
+            isect.t = t;
+            isect.hit = 1;
+            isect.p = ray.org + t * ray.dir;
+            isect.n = isect.p - sphere.center;
+            vnormalize(isect.n);
+        }
+    }
+}
+
+
+static inline void   // Build three mutually perpendicular axes with basis[2] = n
+orthoBasis(reference vec basis[3], vec n) {
+    basis[2] = n;
+    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
+    // Helper axis: the first of x, y, z whose component of n is below 0.6 in magnitude (x if none is).
+    if ((n.x < 0.6) && (n.x > -0.6)) {
+        basis[1].x = 1.0;
+    } else if ((n.y < 0.6) && (n.y > -0.6)) {
+        basis[1].y = 1.0;
+    } else if ((n.z < 0.6) && (n.z > -0.6)) {
+        basis[1].z = 1.0;
+    } else {
+        basis[1].x = 1.0;
+    }
+    // basis[0] is perpendicular to both the helper axis and n.
+    basis[0] = vcross(basis[1], basis[2]);
+    vnormalize(basis[0]);
+    // Replace the helper axis with the true third axis, n x basis[0].
+    basis[1] = vcross(basis[2], basis[0]);
+    vnormalize(basis[1]);
+}
+
+
+static inline float   // Fraction of NAO_SAMPLES^2 sample rays from the hit point that hit nothing
+ambient_occlusion(reference Isect isect, reference Plane plane, 
+                  reference Sphere spheres[3], reference RNGState rngstate) {
+    float eps = 0.0001f;
+    vec p, n;   // n is declared but never used
+    vec basis[3];
+    float occlusion = 0.0;
+    // Start the sample rays slightly off the surface, along the normal.
+    p = isect.p + eps * isect.n;
+    // Local frame whose third axis (basis[2]) is the surface normal.
+    orthoBasis(basis, isect.n);
+    // ntheta * nphi rays are traced; rngstate is advanced twice per ray.
+    static const uniform int ntheta = NAO_SAMPLES;
+    static const uniform int nphi   = NAO_SAMPLES;
+    for (uniform int j = 0; j < ntheta; j++) {
+        for (uniform int i = 0; i < nphi; i++) {
+            Ray ray;
+            Isect occIsect;
+            // (x, y) = theta * (cos phi, sin phi); z = sqrt(1 - theta^2) makes (x, y, z) unit length.
+            float theta = sqrt(frandom(rngstate));
+            float phi   = 2.0f * M_PI * frandom(rngstate);
+            float x = cos(phi) * theta;
+            float y = sin(phi) * theta;
+            float z = sqrt(1.0 - theta * theta);
+
+            // local -> global
+            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
+            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;
+            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;
+
+            ray.org = p;
+            ray.dir.x = rx;
+            ray.dir.y = ry;
+            ray.dir.z = rz;
+            // Reset the nearest-hit record before testing the scene.
+            occIsect.t   = 1.0e+17;
+            occIsect.hit = 0;
+            // Any hit counts as occlusion; the hit distance itself is not used.
+            for (uniform int snum = 0; snum < 3; ++snum)
+                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
+            ray_plane_intersect (occIsect, ray, plane); 
+
+            if (occIsect.hit) occlusion += 1.0;
+        }
+    }
+    // Turn the hit count into the unoccluded fraction: 1 = open, 0 = fully blocked.
+    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);
+    return occlusion;
+}
+
+
+/* Compute the image for the scanlines from [y0,y1), for an overall image
+   of width w and height h.  image holds 3 floats per pixel (the same gray
+   value three times), row-major; four samples (2x2) are taken per pixel. */
+void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, 
+                  uniform int nsubsamples, reference uniform float image[]) {
+    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
+    static Sphere spheres[3] = {
+        { { -2.0f, 0.0f, -3.5f }, 0.5f },
+        { { -0.5f, 0.0f, -3.0f }, 0.5f },
+        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
+    RNGState rngstate;
+
+    seed_rng(rngstate, y0);
+
+    // Compute the mapping between the 'programCount'-wide program
+    // instances running in parallel and samples in the image.  
+    //
+    // For now, we'll always take four samples per pixel, so start by
+    // initializing du and dv with offsets into subpixel samples.  We'll
+    // take care of further updating du and dv for the case where we're
+    // doing more than 4 program instances in parallel shortly.
+    uniform float uSteps[4] = { 0, 1, 0, 1 };
+    uniform float vSteps[4] = { 0, 0, 1, 1 };
+    float du = uSteps[programIndex % 4] / nsubsamples;
+    float dv = vSteps[programIndex % 4] / nsubsamples;
+
+    // Now handle the case where we are able to do more than one pixel's
+    // worth of work at once.  nx records the number of pixels in the x
+    // direction we do per iteration and ny the number in y.
+    uniform int nx = 1, ny = 1;
+
+    if (programCount == 8) {
+        // Do two pixels at once in the x direction
+        nx = 2;
+        if (programIndex >= 4) 
+            // And shift the offsets for the second pixel's worth of work
+            ++du;
+    }
+    else if (programCount == 16) {
+        // Two at once in both x and y
+        nx = ny = 2;
+        if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
+            ++du;
+        if (programIndex >= 8)  
+            ++dv;
+    }
+
+    // Now loop over all of the pixels, stepping in x and y as calculated
+    // above.  (Assumes that ny divides y and nx divides x...)
+    for (uniform int y = y0; y < y1; y += ny) {
+        for (uniform int x = 0; x < w; x += nx)  {
+            // Figure out x,y pixel in NDC
+            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
+            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
+            float ret = 0.f;
+            Ray ray;
+            Isect isect;
+
+            ray.org = 0.f;
+
+            // Poor man's perspective projection
+            ray.dir.x = px;
+            ray.dir.y = py;
+            ray.dir.z = -1.0;
+            vnormalize(ray.dir);
+
+            isect.t   = 1.0e+17;
+            isect.hit = 0;
+
+            for (uniform int snum = 0; snum < 3; ++snum)
+                ray_sphere_intersect(isect, ray, spheres[snum]);
+            ray_plane_intersect(isect, ray, plane);
+
+            // Note use of 'coherent' if statement; the set of rays we
+            // trace will often all hit or all miss the scene
+            cif (isect.hit)
+                ret = ambient_occlusion(isect, plane, spheres, rngstate);
+
+            // This is a little grungy; we have results for
+            // programCount-worth of values.  Because we're doing 2x2
+            // subsamples, we need to peel them off in groups of four,
+            // average the four values for each pixel, and update the
+            // output image.
+            //
+            // Store the varying value to a uniform array of the same size.
+            // See the discussion about communication among program
+            // instances in the ispc user's manual for more discussion on
+            // this idiom.
+            uniform float retArray[programCount];
+            retArray[programIndex] = ret;
+
+            // One pixel per group of four program instances (its 2x2 samples)
+            for (uniform int p = 0; p < programCount; p += 4) {
+                // Get the four sample values for this pixel
+                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
+                    retArray[p+3];
+                // Normalize by number of samples taken
+                sumret /= nsubsamples * nsubsamples; 
+                // Group g is pixel (x + g % nx, y + g / nx), matching the du/dv
+                // shifts above, and each pixel owns 3 consecutive floats.  (A
+                // shared offset bumped by 1 per group overlapped the pixels.)
+                uniform int g = p / 4;
+                uniform int offset = 3 * ((y + g / nx) * w + (x + g % nx));
+                image[offset+0] = sumret;
+                image[offset+1] = sumret;
+                image[offset+2] = sumret;
+            }
+        }
+    }
+}
+
+
+export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, 
+                    uniform float image[]) {   // exported entry point; renders the whole w x h image
+    ao_scanlines(0, h, w, h, nsubsamples, image);   // all scanlines [0, h) in a single call
+}
--- a/examples/aobench/ao_serial.cpp
+++ b/examples/aobench/ao_serial.cpp
@@ -0,0 +1,314 @@
+// -*- mode: c++ -*-
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+/*
+  Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdlib.h>
+#include <math.h>
+
+#ifdef _MSC_VER
+static unsigned long long drand48_x = 0x1234ABCD330EULL;   // 48-bit generator state
+// Stand-ins for srand48()/drand48() in MSVC builds.  Unsigned math throughout.
+static inline void srand48(int x) {
+    drand48_x = (unsigned int)x ^ ((unsigned int)x << 16);  // no signed-shift UB
+}
+// Advance the linear congruential generator; returns a value in [0, 1).
+static inline double drand48() {
+    drand48_x = (drand48_x * 0x5DEECE66DULL + 0xBULL) & 0xFFFFFFFFFFFFULL;
+    return drand48_x * (1.0 / 281474976710656.0);   // divide by 2^48
+}
+#endif // _MSC_VER
+
+#ifdef _MSC_VER
+__declspec(align(16)) 
+#endif
+struct vec {   // 3-component float vector, padded to 4 floats and 16-byte aligned
+    vec() : x(0.f), y(0.f), z(0.f), pad(0.f) { }
+    vec(float xx, float yy, float zz) : x(xx), y(yy), z(zz), pad(0.f) { }
+    // All operators below build their result with the 3-float constructor.
+    vec operator*(float f) const { return vec(x*f, y*f, z*f); }
+    vec operator+(const vec &f2) const { 
+        return vec(x+f2.x, y+f2.y, z+f2.z); 
+    }
+    vec operator-(const vec &f2) const { 
+        return vec(x-f2.x, y-f2.y, z-f2.z); 
+    }
+    vec operator*(const vec &f2) const {   // component-wise product
+        return vec(x*f2.x, y*f2.y, z*f2.z); 
+    }
+    float x, y, z;
+    float pad;   // unused 4th lane; always set to 0 so copies never read an indeterminate value
+}
+#ifndef _MSC_VER
+__attribute__ ((aligned(16)))
+#endif
+;
+inline vec operator*(float f, const vec &v) { return vec(f*v.x, f*v.y, f*v.z); }
+
+
+#define NAO_SAMPLES		8
+
+#ifdef M_PI
+#undef M_PI
+#endif
+#define M_PI 3.1415926535f
+
+struct Isect {       // Nearest intersection found so far along a ray
+    float      t;    // ray parameter of the hit; a new hit must have a smaller t to replace it
+    vec        p;    // hit position, ray.org + ray.dir * t
+    vec        n;    // surface normal recorded for the hit
+    int        hit;  // set to 1 once an intersection has been recorded
+};
+
+struct Sphere {          // Sphere primitive tested by ray_sphere_intersect
+    vec        center;   // center position
+    float      radius;   // radius (squared inside ray_sphere_intersect)
+
+};
+
+struct Plane {   // Unbounded plane tested by ray_plane_intersect
+    vec    p;    // a point on the plane
+    vec    n;    // plane normal
+};
+
+struct Ray {     // Ray; points along it are org + t * dir
+    vec org;     // origin
+    vec dir;     // direction (the sphere test's formula takes dot(dir, dir) == 1)
+};
+
+static inline float dot(const vec &a, const vec &b) {
+    return (a.x * b.x + a.y * b.y) + a.z * b.z;   // x, y, z only; pad is ignored
+}
+
+static inline vec vcross(const vec &v0, const vec &v1) {
+    vec cross;   // filled in below with the components of v0 x v1
+    cross.x = v0.y * v1.z - v0.z * v1.y;
+    cross.y = v0.z * v1.x - v0.x * v1.z;
+    cross.z = v0.x * v1.y - v0.y * v1.x;
+    return cross;
+}
+
+static inline void vnormalize(vec &v) {
+    // Scale v in place by 1/|v|; a zero-length v is not special-cased.
+    const float scale = 1.f / sqrtf(dot(v, v));
+    v = v * scale;
+}
+
+
+static inline void
+ray_plane_intersect(Isect &isect, Ray &ray, 
+                    Plane &plane) {
+    // Plane equation: dot(n, x) + planeD = 0, with planeD taken from plane.p.
+    const float planeD = -dot(plane.p, plane.n);
+    const float denom = dot(ray.dir, plane.n);
+    // A ray (nearly) parallel to the plane never reaches it.
+    if (fabsf(denom) < 1.0e-17) 
+        return;
+
+    const float dist = -(dot(ray.org, plane.n) + planeD) / denom;
+    // Ignore hits behind the origin or no nearer than the current best.
+    if (!(dist > 0.0) || !(dist < isect.t))
+        return;
+    isect.t = dist;
+    isect.hit = 1;
+    isect.p = ray.org + ray.dir * dist;
+    isect.n = plane.n;
+}
+
+
+static inline void
+ray_sphere_intersect(Isect &isect, Ray &ray, 
+                     Sphere &sphere) {
+    // Solve |toOrigin + t*dir|^2 = radius^2 for t, taking dot(dir, dir) == 1.
+    const vec toOrigin = ray.org - sphere.center;
+    const float halfB = dot(toOrigin, ray.dir);
+    const float c = dot(toOrigin, toOrigin) - sphere.radius * sphere.radius;
+    const float disc = halfB * halfB - c;
+    // Non-positive discriminant: the ray misses the sphere or only grazes it.
+    if (!(disc > 0.))
+        return;
+    // Only the nearer root is tested; the farther one is never considered.
+    const float tNear = -halfB - sqrtf(disc);
+    if ((tNear > 0.0) && (tNear < isect.t)) {
+        isect.t = tNear;
+        isect.hit = 1;
+        isect.p = ray.org + tNear * ray.dir;
+        isect.n = isect.p - sphere.center;
+        vnormalize(isect.n);
+    }
+}
+
+
+static inline void
+orthoBasis(vec basis[3], const vec &n) {
+    basis[2] = n;
+    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
+    // Helper axis: the first of x, y, z whose component of n is below 0.6
+    // in magnitude; x again when none of them is.
+    const bool smallX = fabsf(n.x) < 0.6, smallY = fabsf(n.y) < 0.6;
+    if (!smallX && smallY) {
+        basis[1].y = 1.0;
+    } else if (!smallX && fabsf(n.z) < 0.6) {
+        basis[1].z = 1.0;
+    } else {
+        basis[1].x = 1.0;
+    }
+
+    basis[0] = vcross(basis[1], basis[2]);
+    vnormalize(basis[0]);
+
+    basis[1] = vcross(basis[2], basis[0]);
+    vnormalize(basis[1]);
+}
+
+
+static float
+ambient_occlusion(Isect &isect, Plane &plane, 
+                  Sphere spheres[3]) {
+    // Returns the fraction of NAO_SAMPLES^2 random sample rays leaving the
+    // hit point that reach nothing: 1 = fully open, 0 = fully occluded.
+    // isect supplies the hit position p and normal n; plane and spheres are
+    // the scene.  Draws 2 * NAO_SAMPLES^2 values from drand48().
+    static const int nsamples = NAO_SAMPLES * NAO_SAMPLES;
+    const float eps = 0.0001f;
+
+    // Start the sample rays slightly off the surface, along the normal.
+    const vec origin = isect.p + eps * isect.n;
+
+    vec basis[3];
+    orthoBasis(basis, isect.n);
+
+    int blocked = 0;
+    for (int s = 0; s < nsamples; s++) {
+        // (x, y) = theta * (cos phi, sin phi); z = sqrt(1 - theta^2) makes
+        // (x, y, z) unit length, with z along basis[2], the surface normal.
+        const float theta = sqrtf(drand48());
+        const float phi   = 2.0f * M_PI * drand48();
+        const float x = cosf(phi) * theta;
+        const float y = sinf(phi) * theta;
+        const float z = sqrtf(1.0 - theta * theta);
+
+        // local -> global
+        Ray ray;
+        ray.org = origin;
+        ray.dir.x = x * basis[0].x + y * basis[1].x + z * basis[2].x;
+        ray.dir.y = x * basis[0].y + y * basis[1].y + z * basis[2].y;
+        ray.dir.z = x * basis[0].z + y * basis[1].z + z * basis[2].z;
+
+        Isect occIsect;
+        occIsect.t   = 1.0e+17;
+        occIsect.hit = 0;
+
+        // Any hit counts as occlusion; the hit distance itself is not used.
+        for (int snum = 0; snum < 3; ++snum)
+            ray_sphere_intersect(occIsect, ray, spheres[snum]); 
+        ray_plane_intersect (occIsect, ray, plane); 
+
+        if (occIsect.hit)
+            ++blocked;
+    }
+
+    // Turn the blocked count into the unoccluded fraction.
+    return (nsamples - blocked) / (float)nsamples;
+}
+
+
+/* Render scanlines [y0,y1) of a w x h image into image (3 floats per pixel,
+   row-major).  Samples are accumulated with +=, so image must be zeroed by
+   the caller.  nsubsamples^2 primary rays are traced for every pixel. */
+static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples,
+                         float image[]) {
+    static Plane plane = { vec(0.0f, -0.5f, 0.0f), vec(0.f, 1.f, 0.f) };
+    static Sphere spheres[3] = {
+        { vec(-2.0f, 0.0f, -3.5f), 0.5f },
+        { vec(-0.5f, 0.0f, -3.0f), 0.5f },
+        { vec(1.0f, 0.0f, -2.2f), 0.5f } };
+    const float halfW = w / 2.0f, halfH = h / 2.0f;
+    const int samplesPerPixel = nsubsamples * nsubsamples;
+
+    srand48(y0);   // the random sequence is seeded from the first scanline
+    for (int y = y0; y < y1; ++y) {
+        for (int x = 0; x < w; ++x)  {
+            float *pixel = image + 3 * (y * w + x);
+            for (int u = 0; u < nsubsamples; ++u) {
+                for (int v = 0; v < nsubsamples; ++v) {
+                    // Sub-pixel sample position scaled to [-1, 1), y flipped
+                    const float px = (x + (u / (float)nsubsamples) - halfW) / halfW;
+                    const float py = -(y + (v / (float)nsubsamples) - halfH) / halfH;
+
+                    // Primary ray from the origin through (px, py, -1)
+                    Ray ray;
+                    ray.org = vec(0.f, 0.f, 0.f);
+                    ray.dir.x = px;
+                    ray.dir.y = py;
+                    ray.dir.z = -1.0;
+                    vnormalize(ray.dir);
+
+                    Isect isect;
+                    isect.t   = 1.0e+17;
+                    isect.hit = 0;
+                    for (int snum = 0; snum < 3; ++snum)
+                        ray_sphere_intersect(isect, ray, spheres[snum]);
+                    ray_plane_intersect(isect, ray, plane);
+
+                    // Rays that miss the whole scene contribute 0
+                    const float ret = isect.hit ? ambient_occlusion(isect, plane, spheres) : 0.f;
+
+                    // Update image for AO for this ray
+                    pixel[0] += ret;
+                    pixel[1] += ret;
+                    pixel[2] += ret;
+                }
+            }
+            // Normalize image pixels by number of samples taken per pixel
+            pixel[0] /= samplesPerPixel;
+            pixel[1] /= samplesPerPixel;
+            pixel[2] /= samplesPerPixel;
+        }
+    }
+}
+
+
+// Serial entry point: renders every scanline [0, h) of the w x h image.
+void ao_serial(int w, int h, int nsubsamples, float image[]) {
+    ao_scanlines(/*y0=*/0, /*y1=*/h, w, h, nsubsamples, image);
+}
--- a/examples/aobench/aobench.vcxproj
+++ b/examples/aobench/aobench.vcxproj
@@ -0,0 +1,161 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="ao.cpp" />
+    <ClCompile Include="ao_serial.cpp" />
+  </ItemGroup>
+  <ItemGroup> <!-- Custom build step that compiles ao.ispc with ispc, one Command per configuration -->
+    <CustomBuild Include="ao.ispc">
+      <FileType>Document</FileType> <!-- Each command pipes "cl /E /TP" output into ispc, which writes the .obj and the _ispc.h header; Win32 adds the arch=x86 option -->
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>aobench</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -0,0 +1,26 @@
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -g3 -Wall
+ISPC=ispc
+ISPCFLAGS=-O2 --fast-math --instrument
+
+default: ao
+
+.PHONY: default dirs clean
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ ao
+# 'dirs' is phony; as a normal prerequisite of 'ao' it forced a relink on every run.
+ao: objs/ao.o objs/instrument.o objs/ao_ispc.o
+	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread
+# Order-only prerequisite (after '|'): objs/ is created first but never causes a rebuild.
+objs/%.o: %.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+# ao.cpp includes the header that ispc generates, so build that first.
+objs/ao.o: objs/ao_ispc.h 
+
+objs/%_ispc.h objs/%_ispc.o: %.ispc | dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/aobench_instrumented/ao.cpp
+++ b/examples/aobench_instrumented/ao.cpp
@@ -0,0 +1,148 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#ifdef __linux__
+#include <malloc.h>
+#endif
+#include <math.h>
+#include <map>
+#include <string>
+#include <algorithm>
+#include <sys/types.h>
+
+#include "ao_ispc.h"
+using namespace ispc;
+
+#include "instrument.h"
+#include "../timing.h"
+
+#define NSUBSAMPLES        2
+
+static unsigned int test_iterations;
+static unsigned int width, height;
+static unsigned char *img;
+static float *fimg;
+
+
+static unsigned char
+clamp(float f)
+{
+    // Map f to 0..255 (f * 255.5, truncated).  Saturate while still in floating
+    // point: casting an out-of-range or NaN value to int is undefined behavior.
+    const double scaled = f * 255.5;
+    if (!(scaled > 0.0)) return 0;      // negatives, zero and NaN
+    if (scaled >= 255.0) return 255;
+    return (unsigned char)(int)scaled;
+}
+
+
+static void
+savePPM(const char *fname, int w, int h)
+{
+    for (int row = 0; row < h; row++) {
+        for (int col = 0; col < w; col++)  {
+            const int base = 3 * (row * w + col);
+            for (int c = 0; c < 3; c++)
+                img[base + c] = clamp(fimg[base + c]);
+        }
+    }
+
+    FILE *out = fopen(fname, "wb");
+    if (out == NULL) {
+        perror(fname);
+        exit(1);
+    }
+    // Header (magic, size, max value) followed by the raw 8-bit RGB bytes.
+    fprintf(out, "P6\n");
+    fprintf(out, "%d %d\n", w, h);
+    fprintf(out, "255\n");
+    fwrite(img, w * h * 3, 1, out);
+    fclose(out);
+}
+
+
+// Allocate memory with 64-byte alignment; returns NULL on failure.
+float *
+AllocAligned(int size) {
+#if defined(_WIN32) || defined(_WIN64)
+    return (float *)_aligned_malloc(size, 64);   // release with _aligned_free()
+#else
+    // posix_memalign() comes from <stdlib.h> on both Linux and OS X, and its
+    // result can be passed to free().  It replaces an OS X-only scheme that
+    // over-allocated and rounded the pointer up by hand: that code used
+    // uint64_t without <stdint.h> and could run one byte past its allocation
+    // whenever the pre-rounding pointer was already 64-byte aligned.
+    void *mem = NULL;
+    if (posix_memalign(&mem, 64, size) != 0) return NULL;
+    return (float *)mem;
+#endif
+}
+
+
+int main(int argc, char **argv)
+{
+    if (argc != 4) {
+        printf ("%s\n", argv[0]);
+        printf ("Usage: ao [num test iterations] [width] [height]\n");
+        getchar();
+        exit(-1);
+    }
+    // test_iterations is parsed but not otherwise used in this file: the
+    // image is rendered exactly once below.
+    test_iterations = atoi(argv[1]);
+    const int w = atoi(argv[2]), h = atoi(argv[3]);
+    if (w <= 0 || h <= 0) {   // also catches non-numeric input (atoi gives 0)
+        fprintf(stderr, "%s: width and height must be positive\n", argv[0]);
+        exit(-1);
+    }
+    width = w; height = h;
+    // Allocate space for output images
+    img = (unsigned char *)AllocAligned(width * height * 3);
+    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    if (!img || !fimg) { fprintf(stderr, "%s: out of memory\n", argv[0]); exit(1); }
+
+    ao_ispc(width, height, NSUBSAMPLES, fimg);
+    savePPM("ao-ispc.ppm", width, height); 
+    ISPCPrintInstrument();   // print the statistics gathered by the instrumented build
+    return 0;
+}
--- a/examples/aobench_instrumented/ao.ispc
+++ b/examples/aobench_instrumented/ao.ispc
@@ -0,0 +1,317 @@
+// -*- mode: c++ -*-
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+/*
+  Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench
+*/
+
+#define NAO_SAMPLES		8
+#define M_PI 3.1415926535f
+
+typedef float<3> vec;
+
+// Record of the closest ray/scene intersection found so far.  The
+// ray_*_intersect() routines overwrite it when they find a nearer hit.
+struct Isect {
+    float      t;     // ray parameter of the recorded hit; only hits with a smaller t replace it
+    vec        p;     // hit position, computed as ray.org + t * ray.dir
+    vec        n;     // surface normal at the hit
+    int        hit;   // set to 1 once an intersection has been recorded
+};
+
+// Sphere primitive tested by ray_sphere_intersect().
+struct Sphere {
+    vec        center;   // position of the sphere's center
+    float      radius;   // sphere radius (squared inside the intersection test)
+
+};
+
+// Plane primitive tested by ray_plane_intersect().
+struct Plane {
+    vec    p;   // a point on the plane; combined with n to form the plane equation
+    vec    n;   // plane normal; copied into Isect.n on a hit
+};
+
+// Ray described by an origin and a direction; points along it are
+// org + t * dir.
+struct Ray {
+    vec org;   // ray origin
+    vec dir;   // ray direction
+};
+
+// Three-component dot product of a and b.
+static inline float dot(vec a, vec b) {
+    float xx = a.x * b.x;
+    float yy = a.y * b.y;
+    float zz = a.z * b.z;
+    return xx + yy + zz;
+}
+
+// Cross product v0 x v1.
+static inline vec vcross(vec v0, vec v1) {
+    vec product;
+    product.z = v0.x * v1.y - v0.y * v1.x;
+    product.y = v0.z * v1.x - v0.x * v1.z;
+    product.x = v0.y * v1.z - v0.z * v1.y;
+    return product;
+}
+
+// Normalize v in place by scaling it with the reciprocal square root of
+// its squared length.
+static inline void vnormalize(reference vec v) {
+    float invLength = rsqrt(dot(v, v));
+    v = v * invLength;
+}
+
+
+// Intersect ray with plane.  If the hit lies in front of the ray origin
+// and is closer than the one already stored in isect, isect is overwritten
+// with it; otherwise isect is left untouched.  Rays (nearly) parallel to
+// the plane are ignored.
+static inline void
+ray_plane_intersect(reference Isect isect, reference Ray ray, 
+                    reference Plane plane) {
+    // Plane equation: dot(x, n) + planeD = 0
+    float planeD = -dot(plane.p, plane.n);
+    float denom = dot(ray.dir, plane.n);
+
+    // A vanishing denominator means the ray runs parallel to the plane.
+    cif (abs(denom) < 1.0e-17) 
+        return;
+
+    float dist = -(dot(ray.org, plane.n) + planeD) / denom;
+
+    cif ((dist > 0.0) && (dist < isect.t)) {
+        isect.hit = 1;
+        isect.t = dist;
+        isect.n = plane.n;
+        isect.p = ray.org + ray.dir * dist;
+    }
+}
+
+
+// Intersect ray with sphere.  Only the nearer root of the quadratic is
+// considered; if it lies in front of the ray origin and is closer than the
+// hit already stored in isect, isect is overwritten with it.
+static inline void
+ray_sphere_intersect(reference Isect isect, reference Ray ray, 
+                     reference Sphere sphere) {
+    vec toOrigin = ray.org - sphere.center;
+
+    float b = dot(toOrigin, ray.dir);
+    float c = dot(toOrigin, toOrigin) - sphere.radius * sphere.radius;
+    float discriminant = b * b - c;
+
+    // No real roots (or a grazing ray) means no intersection to record.
+    cif (discriminant > 0.) {
+        float dist = -b - sqrt(discriminant);
+
+        cif ((dist > 0.0) && (dist < isect.t)) {
+            isect.hit = 1;
+            isect.t = dist;
+            // The normal is derived from the hit point, so p comes first.
+            isect.p = ray.org + dist * ray.dir;
+            isect.n = isect.p - sphere.center;
+            vnormalize(isect.n);
+        }
+    }
+}
+
+
+// Build a coordinate frame around n: basis[2] is n itself, while basis[0]
+// and basis[1] are normalized cross products perpendicular to it.
+static inline void
+orthoBasis(reference vec basis[3], vec n) {
+    // Pick a helper axis along the first coordinate direction in which n
+    // has a component of magnitude below 0.6, falling back to x.
+    vec helper;
+    helper.x = 0.0; helper.y = 0.0; helper.z = 0.0;
+
+    if ((n.x < 0.6) && (n.x > -0.6)) {
+        helper.x = 1.0;
+    } else if ((n.y < 0.6) && (n.y > -0.6)) {
+        helper.y = 1.0;
+    } else if ((n.z < 0.6) && (n.z > -0.6)) {
+        helper.z = 1.0;
+    } else {
+        helper.x = 1.0;
+    }
+
+    basis[2] = n;
+
+    basis[0] = vcross(helper, n);
+    vnormalize(basis[0]);
+
+    basis[1] = vcross(n, basis[0]);
+    vnormalize(basis[1]);
+}
+
+
+// Estimate how unoccluded the hit point recorded in isect is.  Shoots
+// ntheta * nphi random rays over the hemisphere around the surface normal,
+// counts those that hit a sphere or the plane, and returns the fraction of
+// rays that hit nothing.
+static inline float
+ambient_occlusion(reference Isect isect, reference Plane plane, 
+                  reference Sphere spheres[3], reference RNGState rngstate) {
+    static const uniform int ntheta = NAO_SAMPLES;
+    static const uniform int nphi   = NAO_SAMPLES;
+    float eps = 0.0001f;
+    vec basis[3];
+    float hits = 0.0;
+
+    // Nudge the ray origin off the surface along the normal.
+    vec origin = isect.p + eps * isect.n;
+
+    orthoBasis(basis, isect.n);
+
+    for (uniform int j = 0; j < ntheta; j++) {
+        for (uniform int i = 0; i < nphi; i++) {
+            // Two random numbers per ray, drawn in this order: the radial
+            // term first, then the angle.
+            float theta = sqrt(frandom(rngstate));
+            float phi   = 2.0f * M_PI * frandom(rngstate);
+            float x = cos(phi) * theta;
+            float y = sin(phi) * theta;
+            float z = sqrt(1.0 - theta * theta);
+
+            // Transform the direction from the local frame to global space.
+            Ray ray;
+            ray.org = origin;
+            ray.dir.x = x * basis[0].x + y * basis[1].x + z * basis[2].x;
+            ray.dir.y = x * basis[0].y + y * basis[1].y + z * basis[2].y;
+            ray.dir.z = x * basis[0].z + y * basis[1].z + z * basis[2].z;
+
+            Isect occIsect;
+            occIsect.t   = 1.0e+17;
+            occIsect.hit = 0;
+
+            for (uniform int snum = 0; snum < 3; ++snum)
+                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
+            ray_plane_intersect (occIsect, ray, plane); 
+
+            if (occIsect.hit) hits += 1.0;
+        }
+    }
+
+    uniform int nrays = ntheta * nphi;
+    return (nrays - hits) / (float)nrays;
+}
+
+
+/* Compute the image for the scanlines from [y0,y1), for an overall image
+   of width w and height h.
+
+   y0, y1      - first scanline to render and one past the last; y0 is also
+                 used to seed the random number generator
+   w, h        - dimensions of the full image in pixels
+   nsubsamples - subsamples per pixel along each axis.  NOTE(review): the
+                 sample layout below is fixed at 2x2 per pixel, so callers
+                 presumably always pass 2 -- confirm.
+   image       - output buffer with three floats per pixel, stored row by
+                 row; all three channels of a pixel get the same value
+ */
+void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, 
+                  uniform int nsubsamples, reference uniform float image[]) {
+    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
+    static Sphere spheres[3] = {
+        { { -2.0f, 0.0f, -3.5f }, 0.5f },
+        { { -0.5f, 0.0f, -3.0f }, 0.5f },
+        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
+    RNGState rngstate;
+
+    seed_rng(rngstate, y0);
+
+    // Compute the mapping between the 'programCount'-wide program
+    // instances running in parallel and samples in the image.  
+    //
+    // For now, we'll always take four samples per pixel, so start by
+    // initializing du and dv with offsets into subpixel samples.  We'll
+    // take care of further updating du and dv for the case where we're
+    // doing more than 4 program instances in parallel shortly.
+    uniform float uSteps[4] = { 0, 1, 0, 1 };
+    uniform float vSteps[4] = { 0, 0, 1, 1 };
+    float du = uSteps[programIndex % 4] / nsubsamples;
+    float dv = vSteps[programIndex % 4] / nsubsamples;
+
+    // Now handle the case where we are able to do more than one pixel's
+    // worth of work at once.  nx records the number of pixels in the x
+    // direction we do per iteration and ny the number in y.
+    uniform int nx = 1, ny = 1;
+
+    if (programCount == 8) {
+        // Do two pixels at once in the x direction
+        nx = 2;
+        if (programIndex >= 4) 
+            // And shift the offsets for the second pixel's worth of work
+            ++du;
+    }
+    else if (programCount == 16) {
+        // Two at once in both x and y
+        nx = ny = 2;
+        if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
+            ++du;
+        if (programIndex >= 8)  
+            ++dv;
+    }
+
+    // Now loop over all of the pixels, stepping in x and y as calculated
+    // above.  (Assumes that ny divides y and nx divides x...)
+    for (uniform int y = y0; y < y1; y += ny) {
+        for (uniform int x = 0; x < w; x += nx)  {
+            // Figure out x,y pixel in NDC
+            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
+            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
+            float ret = 0.f;
+            Ray ray;
+            Isect isect;
+
+            ray.org = 0.f;
+
+            // Poor man's perspective projection
+            ray.dir.x = px;
+            ray.dir.y = py;
+            ray.dir.z = -1.0;
+            vnormalize(ray.dir);
+
+            isect.t   = 1.0e+17;
+            isect.hit = 0;
+
+            for (uniform int snum = 0; snum < 3; ++snum)
+                ray_sphere_intersect(isect, ray, spheres[snum]);
+            ray_plane_intersect(isect, ray, plane);
+
+            // Note use of 'coherent' if statement; the set of rays we
+            // trace will often all hit or all miss the scene
+            cif (isect.hit)
+                ret = ambient_occlusion(isect, plane, spheres, rngstate);
+
+            // This is a little grungy; we have results for
+            // programCount-worth of values.  Because we're doing 2x2
+            // subsamples, we need to peel them off in groups of four,
+            // average the four values for each pixel, and update the
+            // output image.
+            //
+            // Store the varying value to a uniform array of the same size.
+            // See the discussion about communication among program
+            // instances in the ispc user's manual for more discussion on
+            // this idiom.
+            uniform float retArray[programCount];
+            retArray[programIndex] = ret;
+
+            // Each group of four program instances is one pixel.  Groups
+            // are laid out nx across and then ny down (matching the du/dv
+            // shifts above), and every pixel owns three consecutive floats
+            // in the image, so the offset is computed from the group's own
+            // pixel coordinates.  Stepping a single running offset by one
+            // float per group would overlap the previous pixel and ignore
+            // groups that belong to the next scanline.
+            for (uniform int p = 0; p < programCount; p += 4) {
+                uniform int group = p / 4;
+                uniform int offset = 3 * ((y + group / nx) * w + (x + group % nx));
+
+                // Get the four sample values for this pixel
+                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
+                    retArray[p+3];
+
+                // Normalize by number of samples taken
+                sumret /= nsubsamples * nsubsamples; 
+                
+                // Store result in the image
+                image[offset+0] = sumret;
+                image[offset+1] = sumret;
+                image[offset+2] = sumret;
+            }
+        }
+    }
+}
+
+
+// Exported entry point: renders the whole w x h image into image by
+// handing every scanline, [0, h), to ao_scanlines().
+export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, 
+                    uniform float image[]) {
+    uniform int firstRow = 0;
+    uniform int endRow = h;
+    ao_scanlines(firstRow, endRow, w, h, nsubsamples, image);
+}
--- a/examples/aobench_instrumented/aobench_instrumented.vcxproj
+++ b/examples/aobench_instrumented/aobench_instrumented.vcxproj
@@ -0,0 +1,161 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="ao.cpp" />
+    <ClCompile Include="instrument.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="ao.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --instrument
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --instrument
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>aobench_instrumented</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/aobench_instrumented/instrument.cpp
+++ b/examples/aobench_instrumented/instrument.cpp
@@ -0,0 +1,94 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "instrument.h"
+#include <stdio.h>
+#include <assert.h>
+#include <string>
+#include <map>
+
+// Statistics accumulated for one instrumented callsite; every counter
+// starts at zero.
+struct CallInfo {
+    CallInfo() : count(0), laneCount(0), allOff(0) {}
+    int count;      // number of times the callsite was reached
+    int laneCount;  // running total of the bits set in the masks seen
+    int allOff;     // number of times the callsite was reached with mask == 0
+};
+
+static std::map<std::string, CallInfo> callInfo;
+
+// Return the number of 1 bits in i.
+//
+// The value is examined as an unsigned bit pattern: right-shifting a
+// negative int typically sign-extends, so shifting the signed value
+// directly would never reach zero for an input with the top bit set.
+int countbits(int i) {
+    unsigned int bits = static_cast<unsigned int>(i);
+    int ret = 0;
+    while (bits) {
+        ret += static_cast<int>(bits & 0x1u);
+        bits >>= 1;
+    }
+    return ret;
+}
+
+
+// Callback that the ispc compiler emits calls to when the --instrument
+// command-line flag is given while compiling.  Accumulates statistics for
+// the callsite identified by fn, line and note.  mask is presumably the
+// execution mask of the running program instances at that point (its set
+// bits are counted as active lanes) -- confirm against the ispc
+// documentation.
+void
+ISPCInstrument(const char *fn, const char *note, int line, int mask) {
+    // Build the map key "<fn>(<line>) - <note>", with the line number
+    // zero-padded to at least four digits.
+    char lineText[16];
+    sprintf(lineText, "%04d", line);
+    std::string key(fn);
+    key += "(";
+    key += lineText;
+    key += ") - ";
+    key += note;
+
+    // operator[] creates a zeroed entry the first time a callsite is seen.
+    CallInfo &stats = callInfo[key];
+
+    stats.count += 1;
+    stats.laneCount += countbits(mask);
+    if (mask == 0)
+        stats.allOff += 1;
+}
+
+
+// Print one summary line per instrumented callsite, in the map's key
+// order: call count, how often the mask was all off, and the percentage of
+// active lanes.  Called by ao.cpp once program execution is done.
+void
+ISPCPrintInstrument() {
+    typedef std::map<std::string, CallInfo>::iterator CallIter;
+    for (CallIter it = callInfo.begin(); it != callInfo.end(); ++it) {
+        const CallInfo &stats = it->second;
+        // NOTE(review): the 4.f divisor assumes four lanes per call --
+        // confirm it matches the target the ispc code was compiled for.
+        const float lanesPerCall = 4.f;
+        const float activePct = 100.f * stats.laneCount / (lanesPerCall * stats.count);
+        const float allOffPct = 100.f * stats.allOff / stats.count;
+        printf("%s: %d calls (%d / %.2f%% all off!), %.2f%% active lanes\n",
+               it->first.c_str(), stats.count, stats.allOff, allOffPct,
+               activePct);
+    }
+}
--- a/examples/aobench_instrumented/instrument.h
+++ b/examples/aobench_instrumented/instrument.h
@@ -0,0 +1,45 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifndef INSTRUMENT_H
+#define INSTRUMENT_H 1
+
+#include <stdint.h>
+
+// Instrumentation callback; declared with C linkage so that its symbol
+// name is not C++-mangled.
+//   fn, note - strings identifying the instrumented location
+//   line     - source line number of the location
+//   mask     - mask value recorded for this call
+extern "C" { 
+    void ISPCInstrument(const char *fn, const char *note, int line, int mask);
+}
+
+// Print the statistics gathered through ISPCInstrument().
+void ISPCPrintInstrument();
+
+#endif // INSTRUMENT_H
--- a/examples/examples.sln
+++ b/examples/examples.sln
@@ -0,0 +1,86 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simple", "simple\simple.vcxproj", "{947C5311-8B78-4D05-BEE4-BCF342D4B367}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rt", "rt\rt.vcxproj", "{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench", "aobench\aobench.vcxproj", "{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot", "mandelbrot\mandelbrot.vcxproj", "{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "options", "options\options.vcxproj", "{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot_tasks", "mandelbrot_tasks\mandelbrot_tasks.vcxproj", "{E80DA7D4-AB22-4648-A068-327307156BE6}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench_instrumented", "aobench_instrumented\aobench_instrumented.vcxproj", "{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Debug|x64 = Debug|x64
+		Release|Win32 = Release|Win32
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|Win32.ActiveCfg = Debug|Win32
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|Win32.Build.0 = Debug|Win32
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|x64.ActiveCfg = Debug|x64
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|x64.Build.0 = Debug|x64
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|Win32.ActiveCfg = Release|Win32
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|Win32.Build.0 = Release|Win32
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|x64.ActiveCfg = Release|x64
+		{947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|x64.Build.0 = Release|x64
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|Win32.Build.0 = Debug|Win32
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|x64.ActiveCfg = Debug|x64
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|x64.Build.0 = Debug|x64
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|Win32.ActiveCfg = Release|Win32
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|Win32.Build.0 = Release|Win32
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|x64.ActiveCfg = Release|x64
+		{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|x64.Build.0 = Release|x64
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|Win32.ActiveCfg = Debug|Win32
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|Win32.Build.0 = Debug|Win32
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|x64.ActiveCfg = Debug|x64
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|x64.Build.0 = Debug|x64
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|Win32.ActiveCfg = Release|Win32
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|Win32.Build.0 = Release|Win32
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|x64.ActiveCfg = Release|x64
+		{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|x64.Build.0 = Release|x64
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|Win32.ActiveCfg = Debug|Win32
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|Win32.Build.0 = Debug|Win32
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|x64.ActiveCfg = Debug|x64
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|x64.Build.0 = Debug|x64
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|Win32.ActiveCfg = Release|Win32
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|Win32.Build.0 = Release|Win32
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|x64.ActiveCfg = Release|x64
+		{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|x64.Build.0 = Release|x64
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|Win32.ActiveCfg = Debug|Win32
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|Win32.Build.0 = Debug|Win32
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|x64.ActiveCfg = Debug|x64
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|x64.Build.0 = Debug|x64
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|Win32.ActiveCfg = Release|Win32
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|Win32.Build.0 = Release|Win32
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|x64.ActiveCfg = Release|x64
+		{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|x64.Build.0 = Release|x64
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|Win32.Build.0 = Debug|Win32
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|x64.ActiveCfg = Debug|x64
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|x64.Build.0 = Debug|x64
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Release|Win32.ActiveCfg = Release|Win32
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Release|Win32.Build.0 = Release|Win32
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Release|x64.ActiveCfg = Release|x64
+		{E80DA7D4-AB22-4648-A068-327307156BE6}.Release|x64.Build.0 = Release|x64
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|Win32.ActiveCfg = Debug|Win32
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|Win32.Build.0 = Debug|Win32
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|x64.ActiveCfg = Debug|x64
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|x64.Build.0 = Debug|x64
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.ActiveCfg = Release|Win32
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.Build.0 = Release|Win32
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.ActiveCfg = Release|x64
+		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/examples/mandelbrot/Makefile
+++ b/examples/mandelbrot/Makefile
@@ -0,0 +1,26 @@
+
+# Build configuration for the mandelbrot example: g++ for the C++ sources,
+# ispc for the .ispc kernel.  All intermediate files go into objs/.
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse4x2
+
+default: mandelbrot
+
+.PHONY: default dirs clean
+
+# Create the output directory for objects and generated headers.
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ mandelbrot
+
+# Link the final executable.  The objs/ directory is requested by the
+# compile rules below rather than here: a phony normal prerequisite would
+# force a relink on every invocation.
+mandelbrot: objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o
+	$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o -lm
+
+# "| dirs" is an order-only prerequisite: objs/ is created before anything
+# is written into it (also under "make -j"), without making the objects
+# look out of date.
+objs/%.o: %.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# mandelbrot.cpp includes the header that ispc generates.
+objs/mandelbrot.o: objs/mandelbrot_ispc.h 
+
+# One ispc invocation produces both the object file and its C++ header.
+objs/%_ispc.h objs/%_ispc.o: %.ispc | dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/mandelbrot/mandelbrot.cpp
+++ b/examples/mandelbrot/mandelbrot.cpp
@@ -0,0 +1,117 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <algorithm>
+#include "../timing.h"
+#include "mandelbrot_ispc.h"
+using namespace ispc;
+
+extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
+                              int width, int height, int maxIterations,
+                              int output[]);
+
+/* Write a PPM image file with the image of the Mandelbrot set.  If 'fn'
+   cannot be opened, the error is reported on stderr and nothing is written. */
+static void
+writePPM(int *buf, int width, int height, const char *fn) {
+    FILE *fp = fopen(fn, "wb");
+    if (fp == NULL) { perror(fn); return; }  // never write through a NULL FILE*
+    fprintf(fp, "P6\n%d %d\n255\n", width, height);
+    for (int i = 0; i < width*height; ++i) {
+        // Map the iteration count to colors by just alternating between
+        // two greys.
+        unsigned char c = (buf[i] & 0x1) ? 240 : 20;
+        for (int j = 0; j < 3; ++j)
+            fputc(c, fp);
+    }
+    fclose(fp);
+}
+
+/* Render the set with the ispc and the serial code, timing and saving both. */
+int main() {
+    unsigned int width = 768;
+    unsigned int height = 512;
+    float x0 = -2;
+    float x1 = 1;
+    float y0 = -1;
+    float y1 = 1;
+
+    int maxIterations = 256;
+    int *buf = new int[width*height];
+
+    //
+    // Compute the image using the ispc implementation; report the minimum
+    // time of three runs.
+    //
+    double minISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
+        double dt = get_elapsed_mcycles();
+        minISPC = std::min(minISPC, dt);
+    }
+
+    printf("[mandelbrot ispc]:\t\t[%.3f] million cycles\n", minISPC);
+    writePPM(buf, width, height, "mandelbrot-ispc.ppm");
+
+    // Clear out the buffer so the serial image cannot reuse ispc results
+    for (unsigned int i = 0; i < width * height; ++i)
+        buf[i] = 0;
+
+    //
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    //
+    double minSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
+        double dt = get_elapsed_mcycles();
+        minSerial = std::min(minSerial, dt);
+    }
+
+    printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
+    writePPM(buf, width, height, "mandelbrot-serial.ppm");
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
+    delete[] buf;  // release the image buffer allocated above
+    return 0;
+}
--- a/examples/mandelbrot/mandelbrot.ispc
+++ b/examples/mandelbrot/mandelbrot.ispc
@@ -0,0 +1,76 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+// Iteration count (at most 'count') of z -> z*z + c before |z|^2 exceeds 4.
+static inline int mandel(float c_re, float c_im, int count) {
+    float z_re = c_re, z_im = c_im;
+    int i;
+    for (i = 0; i < count; ++i) {
+        if (z_re * z_re + z_im * z_im > 4.)
+            break;
+        // Real and imaginary parts of z*z, both computed from the old z.
+        float new_re = z_re*z_re - z_im*z_im;
+        float new_im = 2.f * z_re * z_im;
+        z_re = c_re + new_re;
+        z_im = c_im + new_im;
+    }
+    // i equals count here when the loop was never left through the break.
+    return i;
+}
+/* Compute per-pixel iteration counts into output[j * width + i]. */
+export void mandelbrot_ispc(uniform float x0, uniform float y0,
+                            uniform float x1, uniform float y1,
+                            uniform int width, uniform int height,
+                            uniform int maxIterations,
+                            reference uniform int output[])
+{
+    // dx and dy are identical in every program instance, hence uniform.
+    uniform float dx = (x1 - x0) / width;
+    uniform float dy = (y1 - y0) / height;
+
+    for (uniform int j = 0; j < height; j++) {
+        // Note that we'll be doing programCount computations in parallel,
+        // so increment i by that much.  This assumes that programCount
+        // evenly divides width.
+        for (uniform int i = 0; i < width; i += programCount) {
+            // Figure out the position on the complex plane to compute the
+            // number of iterations at.  The x values differ across program
+            // instances, since the initializer incorporates the value of
+            // the programIndex variable.
+            float x = x0 + (programIndex + i) * dx;
+            float y = y0 + j * dy;
+
+            int index = j * width + i + programIndex;
+            output[index] = mandel(x, y, maxIterations);
+        }
+    }
+}
--- a/examples/mandelbrot/mandelbrot.vcxproj
+++ b/examples/mandelbrot/mandelbrot.vcxproj
@@ -0,0 +1,161 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>mandelbrot</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="mandelbrot.cpp" />
+    <ClCompile Include="mandelbrot_serial.cpp" />
+  </ItemGroup>
+  <ItemGroup> <!-- Custom build step for the ispc source: every Command pipes the output of "cl /E /TP" on the file into ispc, which writes the .obj and _ispc.h files named in Outputs. -->
+    <CustomBuild Include="mandelbrot.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/mandelbrot/mandelbrot_serial.cpp
+++ b/examples/mandelbrot/mandelbrot_serial.cpp
@@ -0,0 +1,68 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+// Count the z -> z*z + c iterations (at most 'count') made before the
+// point z, which starts at c = (c_re, c_im), has |z|^2 greater than 4.
+static int mandel(float c_re, float c_im, int count) {
+    float re = c_re, im = c_im;
+    int iter = 0;
+    while (iter < count && !(re * re + im * im > 4.)) {
+        // Square z, then add c; both parts use the pre-update z.
+        const float sq_re = re*re - im*im;
+        const float sq_im = 2.f * re * im;
+        re = c_re + sq_re;
+        im = c_im + sq_im;
+        ++iter;
+    }
+    return iter;
+}
+
+// Serial reference renderer: stores in output[j*width + i] the iteration
+// count for the sample at (x0 + i*dx, y0 + j*dy) of a width x height grid.
+void mandelbrot_serial(float x0, float y0, float x1, float y1,
+                       int width, int height, int maxIterations,
+                       int output[])
+{
+    const float dx = (x1 - x0) / width;
+    const float dy = (y1 - y0) / height;
+    int *pixel = output;  // walks the image in row-major order
+    for (int row = 0; row < height; ++row) {
+        const float y = y0 + row * dy;  // constant along a scanline
+        for (int col = 0; col < width; ++col, ++pixel) {
+            const float x = x0 + col * dx;
+            *pixel = mandel(x, y, maxIterations);
+        }
+    }
+}
+
--- a/examples/mandelbrot_tasks/Makefile
+++ b/examples/mandelbrot_tasks/Makefile
@@ -0,0 +1,38 @@
+# Builds the task-parallel 'mandelbrot' example (ispc code plus a task system).
+ARCH = $(shell uname)
+# Task system source and link flags; the Darwin build swaps in tasks_gcd.cpp.
+TASK_CXX=tasks_pthreads.cpp
+TASK_LIB=-lpthread
+
+ifeq ($(ARCH), Darwin)
+  TASK_CXX=tasks_gcd.cpp
+  TASK_LIB=
+endif
+
+TASK_OBJ=$(addprefix objs/, $(TASK_CXX:.cpp=.o))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse4x2
+
+default: mandelbrot
+
+.PHONY: default dirs clean
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ mandelbrot
+
+mandelbrot: objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
+
+objs/%.o: %.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/mandelbrot.o: objs/mandelbrot_ispc.h
+# 'dirs' is order-only (after '|'): objs/ is made first but never forces a rebuild.
+objs/%_ispc.h objs/%_ispc.o: %.ispc | dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/mandelbrot_tasks/mandelbrot.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot.cpp
@@ -0,0 +1,120 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <algorithm>
+#include "../timing.h"
+#include "mandelbrot_ispc.h"
+using namespace ispc;
+
+extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
+                              int width, int height, int maxIterations,
+                              int output[]);
+
+/* Write a PPM image file with the image of the Mandelbrot set.  If 'fn'
+   cannot be opened, the error is reported on stderr and nothing is written. */
+static void
+writePPM(int *buf, int width, int height, const char *fn) {
+    FILE *fp = fopen(fn, "wb");
+    if (fp == NULL) { perror(fn); return; }  // never write through a NULL FILE*
+    fprintf(fp, "P6\n%d %d\n255\n", width, height);
+    for (int i = 0; i < width*height; ++i) {
+        // Map the iteration count to colors by just alternating between
+        // two greys.
+        unsigned char c = (buf[i] & 0x1) ? 240 : 20;
+        for (int j = 0; j < 3; ++j)
+            fputc(c, fp);
+    }
+    fclose(fp);
+}
+
+/* Render the set with the ispc+tasks and serial code, timing and saving both. */
+int main() {
+    unsigned int width = 1536;
+    unsigned int height = 1024;
+    float x0 = -2;
+    float x1 = 1;
+    float y0 = -1;
+    float y1 = 1;
+
+    // TasksInit() is defined outside this file; call it before any ispc code.
+    extern void TasksInit();
+    TasksInit();
+
+    int maxIterations = 512;
+    int *buf = new int[width*height];
+
+    //
+    // Compute the image using the ispc implementation; report the minimum
+    // time of three runs.
+    //
+    double minISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
+        double dt = get_elapsed_mcycles();
+        minISPC = std::min(minISPC, dt);
+    }
+
+    printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
+    writePPM(buf, width, height, "mandelbrot-ispc.ppm");
+
+    // Clear out the buffer so the serial image cannot reuse ispc results
+    for (unsigned int i = 0; i < width * height; ++i)
+        buf[i] = 0;
+
+    //
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    //
+    double minSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
+        double dt = get_elapsed_mcycles();
+        minSerial = std::min(minSerial, dt);
+    }
+
+    printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
+    writePPM(buf, width, height, "mandelbrot-serial.ppm");
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
+    delete[] buf;  // release the image buffer allocated above
+    return 0;
+}
--- a/examples/mandelbrot_tasks/mandelbrot.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot.ispc
@@ -0,0 +1,86 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+// Iteration count (at most 'count') of z -> z*z + c before |z|^2 exceeds 4.
+static inline int
+mandel(float c_re, float c_im, int count) {
+    float z_re = c_re, z_im = c_im;
+    int i;
+    for (i = 0; i < count; ++i) {
+        if (z_re * z_re + z_im * z_im > 4.)
+            break;
+        // Real and imaginary parts of z*z, both computed from the old z.
+        float new_re = z_re*z_re - z_im*z_im;
+        float new_im = 2.f * z_re * z_im;
+        z_re = c_re + new_re;
+        z_im = c_im + new_im;
+    }
+    // i equals count here when the loop was never left through the break.
+    return i;
+}
+
+
+/* Task to compute the Mandelbrot iterations for a span of scanlines from
+   [ystart,yend).  Each inner-loop step stores programCount pixels, so a
+   row is covered exactly only if width is a multiple of programCount. */
+task void
+mandelbrot_scanlines(uniform int ystart, uniform int yend,
+                     uniform float x0, uniform float dx, 
+                     uniform float y0, uniform float dy,
+                     uniform int width, uniform int maxIterations,
+                     reference uniform int output[]) {
+    for (uniform int j = ystart; j < yend; ++j) {
+        for (uniform int i = 0; i < width; i += programCount) {
+            float x = x0 + (programIndex + i) * dx;
+            float y = y0 + j * dy;
+            // programIndex gives each program instance its own output slot.
+            int index = j * width + i + programIndex;
+            output[index] = mandel(x, y, maxIterations);
+        }
+    }
+}
+/* Launch one mandelbrot_scanlines task per span of 'span' scanlines; the
+   last span is clamped so that no task is asked for rows >= height. */
+export void
+mandelbrot_ispc(uniform float x0, uniform float y0,
+                uniform float x1, uniform float y1,
+                uniform int width, uniform int height,
+                uniform int maxIterations, reference uniform int output[]) {
+    uniform float dx = (x1 - x0) / width;
+    uniform float dy = (y1 - y0) / height;
+    uniform int span = 2;
+    for (uniform int j = 0; j < height; j += span) {
+        uniform int jend = (j + span < height) ? j + span : height;
+        launch < mandelbrot_scanlines(j, jend, x0, dx, y0, dy, width,
+                                      maxIterations, output) >;
+    }
+}
--- a/examples/mandelbrot_tasks/mandelbrot_serial.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot_serial.cpp
@@ -0,0 +1,68 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+// Count the z -> z*z + c iterations (at most 'count') made before the
+// point z, which starts at c = (c_re, c_im), has |z|^2 greater than 4.
+static int mandel(float c_re, float c_im, int count) {
+    float re = c_re, im = c_im;
+    int iter = 0;
+    while (iter < count && !(re * re + im * im > 4.)) {
+        // Square z, then add c; both parts use the pre-update z.
+        const float sq_re = re*re - im*im;
+        const float sq_im = 2.f * re * im;
+        re = c_re + sq_re;
+        im = c_im + sq_im;
+        ++iter;
+    }
+    return iter;
+}
+
+// Serial reference renderer: stores in output[j*width + i] the iteration
+// count for the sample at (x0 + i*dx, y0 + j*dy) of a width x height grid.
+void mandelbrot_serial(float x0, float y0, float x1, float y1,
+                       int width, int height, int maxIterations,
+                       int output[])
+{
+    const float dx = (x1 - x0) / width;
+    const float dy = (y1 - y0) / height;
+    int *pixel = output;  // walks the image in row-major order
+    for (int row = 0; row < height; ++row) {
+        const float y = y0 + row * dy;  // constant along a scanline
+        for (int col = 0; col < width; ++col, ++pixel) {
+            const float x = x0 + col * dx;
+            *pixel = mandel(x, y, maxIterations);
+        }
+    }
+}
+
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
@@ -0,0 +1,162 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <!-- Build matrix: Debug and Release, each for the Win32 and x64 platforms. -->
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <!-- Project identity: GUID, project keyword and root namespace. -->
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{E80DA7D4-AB22-4648-A068-327307156BE6}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>mandelbrot</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <!-- Per-configuration basics: every configuration builds an application
+       with the Unicode character set.  Debug uses the debug libraries;
+       Release turns on whole-program optimization instead. -->
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <!-- Each configuration imports the per-user property sheet for its
+       platform, but only when that file exists. -->
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <!-- Incremental linking is enabled for Debug and disabled for Release. -->
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <!-- Compiler and linker settings per configuration.  All four build a
+       console program at warning level 3 with debug information generated
+       at link time.  Debug disables optimization and defines _DEBUG;
+       Release optimizes for speed with function-level linking and
+       intrinsics, defines NDEBUG, and enables COMDAT folding and
+       reference optimization in the linker. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <!-- C++ sources compiled by the regular C++ toolchain. -->
+  <ItemGroup>
+    <ClCompile Include="mandelbrot.cpp" />
+    <ClCompile Include="mandelbrot_serial.cpp" />
+    <ClCompile Include="tasks_concrt.cpp" />
+  </ItemGroup>
+  <!-- The ispc kernel is built by a custom step: the .ispc file is run
+       through the cl preprocessor (cl /E /TP) and piped into ispc, which
+       writes an object file and a generated _ispc.h header next to it.
+       Every configuration, Debug included, passes -O2 and the sse4x2
+       target; the Win32 commands additionally select the x86 arch. -->
+  <ItemGroup>
+    <CustomBuild Include="mandelbrot.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/mandelbrot_tasks/tasks_concrt.cpp
+++ b/examples/mandelbrot_tasks/tasks_concrt.cpp
@@ -0,0 +1,115 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/* Simple task system implementation for ispc based on Microsoft's
+   Concurrency Runtime. */
+
+#include <windows.h>
+#include <concrt.h>
+using namespace Concurrency;
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// ispc expects these functions to have C linkage / not be mangled
+extern "C" { 
+    void ISPCLaunch(void *f, void *data);
+    void ISPCSync();
+}
+
+// Signature of a task entry point as invoked by lRunTask():
+// (task data, threadIndex, threadCount).
+typedef void (*TaskFuncType)(void *, int, int);
+
+// One launched task: the function to run and the argument to hand to it.
+struct TaskInfo {
+    TaskFuncType ispcFunc;
+    void *ispcData;
+};
+
+// This is a simple implementation that just aborts if more than MAX_TASKS
+// are launched.  It could easily be extended to be more general...
+
+#define MAX_TASKS 4096
+// Number of taskInfo[] slots handed out since the last ISPCSync().
+static int taskOffset;
+// Task records; slot i is paired with events[i].
+static TaskInfo taskInfo[MAX_TASKS];
+// Completion events, allocated in TasksInit() and set by lRunTask().
+static event *events[MAX_TASKS];
+// Serializes slot allocation in ISPCLaunch().
+static CRITICAL_SECTION criticalSection;
+
+// One-time setup: initializes the critical section used by ISPCLaunch()
+// and allocates one event object per task slot.
+void
+TasksInit() {
+    InitializeCriticalSection(&criticalSection);
+
+    int slot = 0;
+    while (slot < MAX_TASKS) {
+        events[slot] = new event;
+        ++slot;
+    }
+}
+
+
+// Task entry point handed to the Concurrency Runtime by ISPCLaunch(): runs
+// the ispc task described by the TaskInfo in param, then sets the event
+// that belongs to that task's slot.
+void __cdecl
+lRunTask(LPVOID param) {
+    TaskInfo *task = (TaskInfo *)param;
+
+    // FIXME: like the tasks_gcd.cpp implementation, this is passing bogus
+    // values for the threadIndex and threadCount builtins, which in turn
+    // will cause bugs in code that uses those.  FWIW this example doesn't
+    // use them...
+    const int threadIndex = 0;
+    const int threadCount = 1;
+
+    // Actually run the task.
+    task->ispcFunc(task->ispcData, threadIndex, threadCount);
+
+    // The task's position in taskInfo[] is also the index of its event;
+    // signal that this task is done.
+    int slot = task - taskInfo;
+    events[slot]->set();
+}
+
+
+// Launches func(data, threadIndex, threadCount) as a Concurrency Runtime
+// task.  Each launch consumes one of the MAX_TASKS slots, which are only
+// recycled by ISPCSync(); if no slot is left, an error is printed to
+// stderr and the process aborts.
+void
+ISPCLaunch(void *func, void *data) {
+    // Get a TaskInfo struct for this task
+    EnterCriticalSection(&criticalSection);
+    // Check before taking the slot, and with a real test rather than an
+    // assert(): asserts are compiled out when NDEBUG is defined, which
+    // would let taskOffset run past the end of taskInfo[] and events[].
+    if (taskOffset >= MAX_TASKS) {
+        LeaveCriticalSection(&criticalSection);
+        fprintf(stderr, "ISPCLaunch: more than %d tasks launched without "
+                "an ISPCSync()\n", MAX_TASKS);
+        abort();
+    }
+    TaskInfo *ti = &taskInfo[taskOffset++];
+    LeaveCriticalSection(&criticalSection);
+
+    // And pass it on to the Concurrency Runtime...
+    ti->ispcFunc = (TaskFuncType)func;
+    ti->ispcData = data;
+    CurrentScheduler::ScheduleTask(lRunTask, ti);
+}
+
+
+// Waits (with no timeout) for the events of all task slots handed out so
+// far, then resets those events and the slot counter so the slots can be
+// reused.
+void ISPCSync() {
+    event::wait_for_multiple(&events[0], taskOffset, true, 
+                             COOPERATIVE_TIMEOUT_INFINITE);
+
+    int slot = 0;
+    while (slot < taskOffset) {
+        events[slot]->reset();
+        ++slot;
+    }
+
+    taskOffset = 0;
+}
--- a/examples/mandelbrot_tasks/tasks_gcd.cpp
+++ b/examples/mandelbrot_tasks/tasks_gcd.cpp
@@ -0,0 +1,90 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/* A simple task system for ispc programs based on Apple's Grand Central
+   Dispatch. */
+
+#include <dispatch/dispatch.h>
+
+static dispatch_queue_t gcdQueue;
+static dispatch_group_t gcdGroup;
+
+// ispc expects these functions to have C linkage / not be mangled
+extern "C" {
+    void ISPCLaunch(void *f, void *data);
+    void ISPCSync();
+}
+
+// A launched task: func is the task entry point (cast back to a function
+// pointer in lRunTask()) and data is the argument passed through to it.
+struct TaskInfo {
+    void *func;
+    void *data;
+};
+
+
+// Sets up the GCD objects used by ISPCLaunch() and ISPCSync(): a new
+// dispatch group and the default-priority global queue.
+void TasksInit() {
+    gcdGroup = dispatch_group_create();
+    gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
+}
+
+
+// Work function handed to GCD by ISPCLaunch(): runs the ispc task
+// described by the TaskInfo that ti points to, then frees that TaskInfo.
+static void
+lRunTask(void *ti) {
+    typedef void (*TaskFuncType)(void *, int, int);
+    TaskInfo *taskInfo = (TaskInfo *)ti;
+
+    TaskFuncType func = (TaskFuncType)(taskInfo->func);
+
+    // FIXME: these are bogus values; may cause bugs in code that depends
+    // on them having unique values in different threads.
+    int threadIndex = 0;
+    int threadCount = 1;
+    // Actually run the task
+    func(taskInfo->data, threadIndex, threadCount);
+
+    // ISPCLaunch() allocated this record with new solely to carry the
+    // task to this function, so release it now that the task has run.
+    delete taskInfo;
+}
+
+
+// Submits func/data to gcdQueue as a member of gcdGroup; lRunTask()
+// receives the heap-allocated record as its context argument.
+void ISPCLaunch(void *func, void *data) {
+    TaskInfo *task = new TaskInfo;
+    task->data = data;
+    task->func = func;
+
+    dispatch_group_async_f(gcdGroup, gcdQueue, task, lRunTask);
+}
+
+
+// Barrier for the tasks started with ISPCLaunch(): blocks on gcdGroup with
+// no timeout.
+void ISPCSync() {
+    // Wait for all of the tasks in the group to complete before returning
+    dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
+}
--- a/examples/mandelbrot_tasks/tasks_pthreads.cpp
+++ b/examples/mandelbrot_tasks/tasks_pthreads.cpp
@@ -0,0 +1,285 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <pthread.h>
+#include <semaphore.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <vector>
+
+// ispc expects these functions to have C linkage / not be mangled
+extern "C" { 
+    void ISPCLaunch(void *f, void *data);
+    void ISPCSync();
+}
+
+
+// Number of worker threads; set from lNumCPUCores() in TasksInit().
+static int nThreads;
+// Worker thread handles; array of nThreads entries.
+static pthread_t *threads;
+// Guards taskQueue.
+static pthread_mutex_t taskQueueMutex;
+// Pending tasks as (function, data) pairs; workers take from the back.
+static std::vector<std::pair<void *, void *> > taskQueue;
+// Posted once per launched task to wake a worker waiting for work.
+static sem_t *workerSemaphore;
+// Tasks launched but not yet finished; guarded by
+// tasksRunningConditionMutex.
+static uint32_t numUnfinishedTasks;
+static pthread_mutex_t tasksRunningConditionMutex;
+// Signalled by a worker when numUnfinishedTasks drops to zero.
+static pthread_cond_t tasksRunningCondition;
+
+// Worker thread main loop; defined further down in this file.
+static void *lTaskEntry(void *arg);
+
+/** Figure out how many CPU cores there are in the system.
+
+    On Linux this is the number of online processors reported by
+    sysconf(); elsewhere (Mac) it is the value of the "hw.logicalcpu"
+    sysctl.  If the query fails or reports a non-positive count, a message
+    is printed to stderr and 2 is returned as a guess, so the result is
+    always at least 1.
+ */
+static int
+lNumCPUCores() {
+#if defined(__linux__)
+    // sysconf() returns -1 on error; don't hand that back as a thread count.
+    long nCores = sysconf(_SC_NPROCESSORS_ONLN);
+    if (nCores < 1) {
+        fprintf(stderr, "sysconf() to find number of cores present failed.  Guessing 2.\n");
+        return 2;
+    }
+    return (int)nCores;
+#else
+    // Mac
+    int mib[2];
+    mib[0] = CTL_HW;
+    size_t length = 2;
+    if (sysctlnametomib("hw.logicalcpu", mib, &length) == -1) {
+        fprintf(stderr, "sysctlnametomib() failed.  Guessing 2 cores.\n");
+        return 2;
+    }
+    assert(length == 2);
+
+    int nCores = 0;
+    size_t size = sizeof(nCores);
+
+    if (sysctl(mib, 2, &nCores, &size, NULL, 0) == -1 || nCores < 1) {
+        fprintf(stderr, "sysctl() to find number of cores present failed.  Guessing 2.\n");
+        return 2;
+    }
+    return nCores;
+#endif
+}
+
+/** Creates the synchronization objects used by ISPCLaunch(), ISPCSync()
+    and the worker threads, then starts one worker thread per CPU core as
+    reported by lNumCPUCores().  Any failure is fatal: a message is printed
+    to stderr and the process exits with status 1.
+ */
+void
+TasksInit() {
+    nThreads = lNumCPUCores();
+
+    threads = new pthread_t[nThreads];
+
+    int err;
+    if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) {
+        fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
+        exit(1);
+    }
+
+    char name[32];
+    snprintf(name, sizeof(name), "mandelbrot.%d", (int)getpid());
+    workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
+    // sem_open() signals failure with SEM_FAILED (not NULL) and puts the
+    // reason in errno.
+    if (workerSemaphore == SEM_FAILED) {
+        fprintf(stderr, "Error creating semaphore: %s\n", strerror(errno));
+        exit(1);
+    }
+    // A named semaphore outlives the process unless it is unlinked.  The
+    // handle we already hold stays usable after the name is removed, so
+    // drop the name right away instead of leaving one behind per run.
+    sem_unlink(name);
+
+    if ((err = pthread_cond_init(&tasksRunningCondition, NULL)) != 0) {
+        fprintf(stderr, "Error creating condition variable: %s\n", strerror(err));
+        exit(1);
+    }
+
+    if ((err = pthread_mutex_init(&tasksRunningConditionMutex, NULL)) != 0) {
+        fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
+        exit(1);
+    }
+
+    for (int i = 0; i < nThreads; ++i) {
+        // The thread index travels in the pointer argument itself; widen it
+        // to pointer size first so the cast is well-defined on 64-bit.
+        void *arg = reinterpret_cast<void *>(static_cast<intptr_t>(i));
+        err = pthread_create(&threads[i], NULL, &lTaskEntry, arg);
+        if (err != 0) {
+            fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
+            exit(1);
+        }
+    }
+}
+
+
+/** Queues the task (f, d) for a worker thread and returns without waiting
+    for it to run: the pair is appended to taskQueue, numUnfinishedTasks is
+    incremented, and workerSemaphore is posted to wake a worker.  Any
+    locking or semaphore failure prints a message to stderr and exits with
+    status 1.
+ */
+void
+ISPCLaunch(void *f, void *d) {
+    //
+    // Acquire mutex, add task
+    //
+    int err;
+    if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
+        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
+        exit(1);
+    }
+
+    taskQueue.push_back(std::make_pair(f, d));
+
+    if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
+        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+        exit(1);
+    }
+
+    //
+    // Update count of number of tasks left to run
+    //
+    if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
+        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
+        exit(1);
+    }
+
+    ++numUnfinishedTasks;
+
+    if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
+        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+        exit(1);
+    }
+
+    //
+    // Post to the worker semaphore to wake up worker threads that are
+    // sleeping waiting for tasks to show up.  Unlike the pthread_* calls
+    // above, sem_post() returns -1 and reports the reason in errno.
+    //
+    if (sem_post(workerSemaphore) != 0) {
+        fprintf(stderr, "Error from sem_post: %s\n", strerror(errno));
+        exit(1);
+    }
+}
+
+
+/** Worker thread main loop.  arg carries the thread's index, passed as an
+    integer cast to a pointer by TasksInit().  The loop never exits: each
+    pass waits on workerSemaphore, takes the task at the back of taskQueue
+    (going back to wait if the queue is empty), runs it, and decrements
+    numUnfinishedTasks, signalling tasksRunningCondition when the count
+    reaches zero.  Any locking or semaphore failure prints a message to
+    stderr and exits with status 1.
+ */
+static void *
+lTaskEntry(void *arg) {
+    int threadIndex = int(reinterpret_cast<intptr_t>(arg));
+    int threadCount = nThreads;
+
+    while (true) {
+        // sem_wait() returns -1 and sets errno rather than returning an
+        // error code.  Being interrupted by a signal is not a failure;
+        // just go back and wait again.
+        if (sem_wait(workerSemaphore) != 0) {
+            if (errno == EINTR)
+                continue;
+            fprintf(stderr, "Error from sem_wait: %s\n", strerror(errno));
+            exit(1);
+        }
+
+        int err;
+        std::pair<void *, void *> myTask;
+        //
+        // Acquire mutex, get task
+        //
+        if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
+            exit(1);
+        }
+        if (taskQueue.size() == 0) {
+            //
+            // Task queue is empty, go back and wait on the semaphore
+            //
+            if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
+                fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+                exit(1);
+            }
+            continue;
+        }
+
+        myTask = taskQueue.back();
+        taskQueue.pop_back();
+
+        if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+            exit(1);
+        }
+
+        //
+        // Do work for _myTask_
+        //
+        typedef void (*TaskFunType)(void *, int, int);
+        TaskFunType func = (TaskFunType)myTask.first;
+        func(myTask.second, threadIndex, threadCount);
+
+        //
+        // Decrement the number of unfinished tasks counter
+        //
+        if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
+            exit(1);
+        }
+
+        int unfinished = --numUnfinishedTasks;
+        if (unfinished == 0) {
+            //
+            // Signal the "no more tasks are running" condition if all of
+            // them are done.
+            //
+            if ((err = pthread_cond_signal(&tasksRunningCondition)) != 0) {
+                fprintf(stderr, "Error from pthread_cond_signal: %s\n", strerror(err));
+                exit(1);
+            }
+        }
+
+        if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+            exit(1);
+        }
+    }
+
+    // Not reached: the loop above has no exit.  The return only satisfies
+    // the function's signature.
+    return NULL;
+}
+
+
+/** Blocks the caller until numUnfinishedTasks is zero, i.e. until every
+    task counted by ISPCLaunch() has been completed by a worker.  Any
+    locking or condition-variable failure prints a message to stderr and
+    exits with status 1.
+ */
+void ISPCSync() {
+    int err;
+    if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
+        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
+        exit(1);
+    }
+
+    // As long as there are tasks running, wait on the condition variable;
+    // doing so causes this thread to go to sleep until someone signals on
+    // the tasksRunningCondition condition variable.
+    while (numUnfinishedTasks > 0) {
+        if ((err = pthread_cond_wait(&tasksRunningCondition, 
+                                     &tasksRunningConditionMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_cond_wait: %s\n", strerror(err));
+            exit(1);
+        }
+    }
+    
+    // The mutex is held at this point whether or not the loop above ran:
+    // it was locked at the top of this function, and pthread_cond_wait()
+    // re-acquires it before returning.  So the unlock below is always
+    // releasing a lock this thread owns.
+    if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
+        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+        exit(1);
+    }
+}
--- a/examples/options/Makefile
+++ b/examples/options/Makefile
@@ -0,0 +1,26 @@
+
+# Builds the "options" example: the C++ driver and serial reference are
+# compiled with $(CXX), the ispc kernels with $(ISPC); all intermediate
+# files (objects and the generated options_ispc.h) go into objs/.
+CXX=g++
+CXXFLAGS=-Iobjs/ -g -Wall
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse4x2
+
+default: options
+
+.PHONY: default dirs clean
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ options
+
+# "dirs" is phony, so listing it as a normal prerequisite would make the
+# executable look out of date (and relink) on every run.  The rules that
+# write into objs/ take it as an order-only prerequisite instead, which
+# also guarantees the directory exists before them under "make -j".
+options: objs/options.o objs/options_serial.o objs/options_ispc.o
+	$(CXX) $(CXXFLAGS) -o $@ objs/options.o objs/options_ispc.o objs/options_serial.o -lm
+
+objs/%.o: %.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# options.cpp includes the ispc-generated header, so it must exist first.
+objs/options.o: objs/options_ispc.h options_defs.h
+
+# One ispc invocation produces both the object file and the header.
+objs/%_ispc.h objs/%_ispc.o: %.ispc options_defs.h | dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/options/options.cpp
+++ b/examples/options/options.cpp
@@ -0,0 +1,151 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <algorithm>
+#ifndef __APPLE__
+#include <malloc.h>
+#endif // !__APPLE__
+using std::max;
+
+#include "options_defs.h"
+#include "../timing.h"
+
+#include "options_ispc.h"
+using namespace ispc;
+
+// Allocate memory for count floats with 64-byte alignment.  The result of
+// the underlying allocator is returned as-is on Windows and in the
+// memalign() branch; the Apple branch returns NULL if malloc() fails.
+float *AllocFloats(int count) {
+    // Keep the byte count in size_t: squeezing it into an int truncates
+    // the request once count * sizeof(float) no longer fits.
+    size_t size = count * sizeof(float);
+#if defined(_WIN32) || defined(_WIN64)
+    return (float *)_aligned_malloc(size, 64);
+#elif defined (__APPLE__)
+    // Allocate excess memory to ensure an aligned pointer can be returned;
+    // the pointer malloc() gave us is stashed just below the address that
+    // is handed out.
+    void *mem = malloc(size + (64-1) + sizeof(void*));
+    if (mem == NULL)
+        return NULL;
+    char *amem = ((char*)mem) + sizeof(void*);
+    // Round up to the next multiple of 64.  The final mask keeps an
+    // already-aligned address where it is; stepping a full 64 bytes there
+    // would push the end of the buffer one byte past the allocation,
+    // which only has 63 bytes of slack.
+    amem += (64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1))) & (64 - 1);
+    ((void**)amem)[-1] = mem;
+    return (float *)amem;
+#else
+    return (float *)memalign(64, size);
+#endif
+}
+
+extern void black_scholes_serial(float Sa[], float Xa[], float Ta[], 
+                                 float ra[], float va[], 
+                                 float result[], int count);
+
+extern void binomial_put_serial(float Sa[], float Xa[], float Ta[], 
+                                float ra[], float va[], 
+                                float result[], int count);
+
+// Adds the first count entries of values onto total, in index order, and
+// returns the updated total.
+static float lAccumulate(float total, const float values[], int count) {
+    for (int i = 0; i < count; ++i)
+        total += values[i];
+    return total;
+}
+
+// Prices N_OPTIONS identical options with the binomial and Black-Scholes
+// models, running the ispc implementation and then the serial one for
+// each.  For every run it prints the elapsed time in millions of cycles
+// and the average result, followed by the serial/ispc time ratio.
+int main() {
+    // Pointers passed to ispc code must have alignment of the target's
+    // vector width at minimum.
+    float *stock = AllocFloats(N_OPTIONS);
+    float *strike = AllocFloats(N_OPTIONS);
+    float *years = AllocFloats(N_OPTIONS);
+    float *rate = AllocFloats(N_OPTIONS);
+    float *vol = AllocFloats(N_OPTIONS);
+    float *prices = AllocFloats(N_OPTIONS);
+
+    for (int opt = 0; opt < N_OPTIONS; ++opt) {
+        stock[opt] = 100;   // stock price
+        strike[opt] = 98;   // option strike price
+        years[opt] = 2;     // time (years)
+        rate[opt] = .02;    // risk-free interest rate
+        vol[opt] = 5;       // volatility
+    }
+
+    // --- Binomial options pricing model: ispc, then serial ---
+
+    reset_and_start_timer();
+    binomial_put_ispc(stock, strike, years, rate, vol, prices, N_OPTIONS);
+    double binomial_ispc = get_elapsed_mcycles();
+    float sum = lAccumulate(0.f, prices, N_OPTIONS);
+    printf("[binomial ispc]:\t\t[%.3f] million cycles (avg %f)\n",
+           binomial_ispc, sum / N_OPTIONS);
+
+    reset_and_start_timer();
+    binomial_put_serial(stock, strike, years, rate, vol, prices, N_OPTIONS);
+    double binomial_serial = get_elapsed_mcycles();
+    sum = lAccumulate(0.f, prices, N_OPTIONS);
+    printf("[binomial serial]:\t\t[%.3f] million cycles (avg %f)\n",
+           binomial_serial, sum / N_OPTIONS);
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", binomial_serial / binomial_ispc);
+
+    // --- Black-Scholes options pricing model: ispc, then serial ---
+    // Both are repeated N_BLACK_SCHOLES_ROUNDS times, with the running
+    // sum of results kept across rounds (and computed inside the timed
+    // region).
+
+    sum = 0.f;
+    reset_and_start_timer();
+    for (int pass = 0; pass < N_BLACK_SCHOLES_ROUNDS; ++pass) {
+        black_scholes_ispc(stock, strike, years, rate, vol, prices, N_OPTIONS);
+        sum = lAccumulate(sum, prices, N_OPTIONS);
+    }
+    double bs_ispc = get_elapsed_mcycles();
+    printf("[black-scholes ispc]:\t\t[%.3f] million cycles (avg %f)\n",
+           bs_ispc, sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS));
+
+    sum = 0.f;
+    reset_and_start_timer();
+    for (int pass = 0; pass < N_BLACK_SCHOLES_ROUNDS; ++pass) {
+        black_scholes_serial(stock, strike, years, rate, vol, prices, N_OPTIONS);
+        sum = lAccumulate(sum, prices, N_OPTIONS);
+    }
+    double bs_serial = get_elapsed_mcycles();
+    printf("[black-scholes serial]:\t\t[%.3f] million cycles (avg %f)\n", bs_serial,
+           sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS));
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", bs_serial / bs_ispc);
+
+    return 0;
+}
--- a/examples/options/options.ispc
+++ b/examples/options/options.ispc
@@ -0,0 +1,103 @@
+// -*- mode: c++ -*-
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "options_defs.h"
+
+// Cumulative distribution function of the standard normal distribution,
+// approximated with a fifth-degree polynomial in t = 1 / (1 + 0.2316419 |X|).
+static inline float
+CND(float X) {
+    const float invSqrt2Pi = 0.39894228040f;
+
+    // The polynomial is evaluated for |X| only; the sign of X is handled
+    // at the end.
+    float absX = abs(X);
+
+    float t = 1.0 / (1.0 + 0.2316419 * absX);
+    float t2 = t*t;
+    float t3 = t2*t;
+    float t4 = t2*t2;
+    float t5 = t3*t2;
+
+    float tail = (0.31938153f * t - 0.356563782f * t2 + 1.781477937f * t3 +
+                  -1.821255978f * t4 + 1.330274429f * t5);
+    tail *= invSqrt2Pi * exp(-absX * absX * .5f);
+
+    // tail is already the answer for X <= 0; mirror it for positive X.
+    if (X > 0.f)
+        tail = 1.0 - tail;
+    return tail;
+}
+
+// Prices `count` options with the Black-Scholes closed-form expression
+// S * CND(d1) - X * exp(-r * T) * CND(d2).
+//
+// Sa, Xa, Ta, ra and va hold the per-option inputs (S, X, T, r and v in the
+// formula above); result receives one price per option.  All arrays must
+// have at least `count` elements.
+export void
+black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
+                   uniform float ra[], uniform float va[], 
+                   uniform float result[], uniform int count) {
+    for (uniform int i = 0; i < count; i += programCount) {
+        // Each program instance handles element i + programIndex.  When
+        // count is not a multiple of programCount, the instances that would
+        // land past the end of the arrays are masked off here instead of
+        // reading and writing out of bounds.
+        if (i + programIndex < count) {
+            float S = Sa[i + programIndex], X = Xa[i + programIndex];
+            float T = Ta[i + programIndex], r = ra[i + programIndex];
+            float v = va[i + programIndex];
+
+            float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
+            float d2 = d1 - v * sqrt(T);
+
+            result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2);
+        }
+    }
+}
+
+
+// Prices `count` put options with a BINOMIAL_NUM-step binomial tree.
+//
+// Sa, Xa, Ta, ra and va hold the per-option inputs (S, X, T, r and v below);
+// result receives one price per option.  All arrays must have at least
+// `count` elements.
+export void
+binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], 
+                  uniform float ra[], uniform float va[], 
+                  uniform float result[], uniform int count) {
+    // Per-instance scratch space holding the option values at the tree nodes.
+    float V[BINOMIAL_NUM];
+
+    for (uniform int i = 0; i < count; i += programCount) {
+        // Each program instance handles element i + programIndex.  When
+        // count is not a multiple of programCount, the instances that would
+        // land past the end of the arrays are masked off here instead of
+        // reading and writing out of bounds.
+        if (i + programIndex < count) {
+            float S = Sa[i + programIndex], X = Xa[i + programIndex];
+            float T = Ta[i + programIndex], r = ra[i + programIndex];
+            float v = va[i + programIndex];
+
+            float dt = T / BINOMIAL_NUM;
+            float u = exp(v * sqrt(dt));
+            float d = 1. / u;
+            float disc = exp(r * dt);
+            float Pu = (disc - d) / (u - d);
+
+            // Put payoff max(0, X - price) at each terminal node.
+            for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
+                float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
+                V[j] = max(0., X - S * upow);
+            }
+
+            // Walk back through the tree, replacing each node by the
+            // discounted, probability-weighted average of its two
+            // successors; the final value ends up in V[0].
+            for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
+                for (uniform int k = 0; k < j; ++k)
+                    V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
+
+            result[i + programIndex] = V[0];
+        }
+    }
+}
--- a/examples/options/options.vcxproj
+++ b/examples/options/options.vcxproj
@@ -0,0 +1,168 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>options</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="options.cpp" />
+    <ClCompile Include="options_serial.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <!-- Custom build step for the ispc source.  In every configuration the
+         file is first run through the MSVC preprocessor (cl /E /TP) and the
+         preprocessed text is piped to ispc, which emits an object file and
+         a generated header next to it.  Only the Win32 configurations pass
+         an explicit x86 architecture flag; all of them use the sse4x2
+         target and ispc optimization level 2, including Debug. -->
+    <CustomBuild Include="options.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="options_defs.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/options/options_defs.h
+++ b/examples/options/options_defs.h
@@ -0,0 +1,42 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+// Constants shared by the C++ driver, the serial reference code and the ispc
+// kernels; kept as plain macros so both compilers can consume this header.
+#ifndef OPTIONS_DEFS_H
+#define OPTIONS_DEFS_H 1
+
+// Number of time steps in the binomial pricers; also the size of their
+// per-option value array.
+#define BINOMIAL_NUM 64
+// Number of options priced per call in the benchmark driver.
+#define N_OPTIONS 65536
+// Number of times the driver repeats each Black-Scholes pricing pass.
+#define N_BLACK_SCHOLES_ROUNDS 20
+
+
+#endif // OPTIONS_DEFS_H
--- a/examples/options/options_serial.cpp
+++ b/examples/options/options_serial.cpp
@@ -0,0 +1,114 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include "options_defs.h"
+#include <math.h>
+#include <algorithm>
+
+// Cumulative distribution function of the standard normal distribution,
+// approximated with a fifth-degree polynomial in t = 1 / (1 + 0.2316419 |X|).
+static inline float
+CND(float X) {
+    const float invSqrt2Pi = 0.39894228040f;
+
+    // The polynomial is evaluated for |X| only; the sign of X picks which
+    // side of the distribution is returned.
+    const float absX = fabsf(X);
+
+    const float t = 1.0 / (1.0 + 0.2316419 * absX);
+    const float t2 = t*t;
+    const float t3 = t2*t;
+    const float t4 = t2*t2;
+    const float t5 = t3*t2;
+
+    const float poly = (0.31938153f * t - 0.356563782f * t2 + 1.781477937f * t3 +
+                        -1.821255978f * t4 + 1.330274429f * t5);
+    const float tail = poly * (invSqrt2Pi * expf(-absX * absX * .5f));
+
+    // tail is already the answer for X <= 0; mirror it for positive X.
+    return (X > 0.f) ? (float)(1.0 - tail) : tail;
+}
+
+
+// Serial reference pricer: fills result[0..count) with the Black-Scholes
+// closed-form value S * CND(d1) - X * exp(-r * T) * CND(d2) computed from
+// the matching entries of Sa, Xa, Ta, ra and va.
+void
+black_scholes_serial(float Sa[], float Xa[], float Ta[], 
+                     float ra[], float va[], 
+                     float result[], int count) {
+    for (int opt = 0; opt < count; ++opt) {
+        const float S = Sa[opt];
+        const float X = Xa[opt];
+        const float T = Ta[opt];
+        const float r = ra[opt];
+        const float v = va[opt];
+
+        // v * sqrt(T) is shared by d1 and d2.
+        const float volSqrtT = v * sqrtf(T);
+        const float d1 = (logf(S/X) + (r + v * v * .5f) * T) / volSqrtT;
+        const float d2 = d1 - volSqrtT;
+
+        const float discountedX = X * expf(-r * T);
+        result[opt] = S * CND(d1) - discountedX * CND(d2);
+    }
+}
+
+
+// Serial reference pricer: fills result[0..count) with put prices obtained
+// from a BINOMIAL_NUM-step binomial tree built from the matching entries of
+// Sa, Xa, Ta, ra and va.
+void
+binomial_put_serial(float Sa[], float Xa[], float Ta[], 
+                    float ra[], float va[], 
+                    float result[], int count) {
+    for (int opt = 0; opt < count; ++opt) {
+        const float S = Sa[opt];
+        const float X = Xa[opt];
+        const float T = Ta[opt];
+        const float r = ra[opt];
+        const float v = va[opt];
+
+        const float dt = T / BINOMIAL_NUM;
+        const float u = expf(v * sqrtf(dt));
+        const float d = 1. / u;
+        const float disc = expf(r * dt);
+        const float Pu = (disc - d) / (u - d);
+        const float Pd = 1 - Pu;
+
+        // Put payoff max(0, X - price) at each terminal node.
+        float V[BINOMIAL_NUM];
+        for (int node = 0; node < BINOMIAL_NUM; ++node) {
+            const float upow = powf(u, (float)(2*node-BINOMIAL_NUM));
+            V[node] = std::max(0.f, X - S * upow);
+        }
+
+        // Walk back through the tree: each pass shrinks the live range by
+        // one node until only V[0] remains.
+        for (int live = BINOMIAL_NUM-1; live > 0; --live) {
+            for (int k = 0; k < live; ++k)
+                V[k] = (Pd * V[k] + Pu * V[k + 1]) / disc;
+        }
+
+        result[opt] = V[0];
+    }
+}
+
+
--- a/examples/rt/Makefile
+++ b/examples/rt/Makefile
@@ -0,0 +1,24 @@
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse4x2
+
+default: rt
+
+.PHONY: default dirs clean
+
+# Creates the directory that receives objects and the generated ispc header.
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ rt
+
+# "dirs" is phony and therefore always considered out of date.  Listing it
+# as a normal prerequisite of rt forced a relink on every invocation, so it
+# is attached instead as an order-only prerequisite (after the "|") to the
+# rules that actually write into objs/.  That also guarantees the directory
+# exists before any compile step under parallel make.
+rt: objs/rt.o objs/rt_serial.o objs/rt_ispc.o
+	$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o -lm
+
+# The C++ sources include the header generated by ispc.
+objs/%.o: %.cpp objs/rt_ispc.h | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# One ispc invocation produces both the object file and the header.
+objs/%_ispc.h objs/%_ispc.o: %.ispc | dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/rt/cornell.bvh
+++ b/examples/rt/cornell.bvh
--- a/examples/rt/cornell.camera
+++ b/examples/rt/cornell.camera
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -0,0 +1,244 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <math.h>
+#include <algorithm>
+#include <assert.h>
+#include <sys/types.h>
+#ifndef __APPLE__
+#include <malloc.h>
+#endif
+#include "../timing.h"
+#include "rt_ispc.h"
+
+using namespace ispc;
+
+typedef unsigned int uint;
+
+// Returns uninitialized storage for `count` objects of type T whose address
+// is a multiple of 64 bytes, or NULL if the allocation fails.
+//
+// The memory must not be handed to plain free(): the Windows branch needs
+// _aligned_free(), and the Apple branch returns a pointer into the middle of
+// a malloc() block (the original malloc() pointer is stored in the void*
+// slot immediately before the returned address).
+template <typename T> 
+T *AllocAligned(int count) {
+    // Keep the byte count in size_t so large requests are not truncated.
+    size_t size = count * sizeof(T);
+#if defined(_WIN32) || defined(_WIN64)
+    return (T *)_aligned_malloc(size, 64);
+#elif defined (__APPLE__)
+    // Allocate excess memory to ensure an aligned pointer can be returned:
+    // room for the bookkeeping pointer plus up to 63 bytes of padding.
+    void *mem = malloc(size + (64-1) + sizeof(void*));
+    if (!mem)
+        return NULL;
+    char *amem = ((char*)mem) + sizeof(void*);
+    // Round up to the next multiple of 64.  The outer mask makes this a
+    // no-op when amem is already aligned; advancing a full 64 bytes in that
+    // case would push the end of the block one byte past the allocation.
+    amem += (64 - (reinterpret_cast<size_t>(amem) & (64 - 1))) & (64 - 1);
+    ((void**)amem)[-1] = mem;
+    return (T *)amem;
+#else
+    return (T *)memalign(64, size);
+#endif
+}
+
+// Serial reference ray tracer, defined in another translation unit.  main()
+// calls it with exactly the same arguments as the ispc raytrace() entry
+// point so the two can be timed against each other; image[] and id[] are
+// the per-pixel output buffers.
+extern void raytrace_serial(int width, int height, const float raster2camera[4][4], 
+                            const float camera2world[4][4], float image[],
+                            int id[], const LinearBVHNode nodes[],
+                            const Triangle triangles[]);
+
+
+// Writes a width x height binary PPM ("P6") to `filename`, colouring each
+// pixel from the bits of its entry in idImage.  depthImage is accepted but
+// not used.  Exits the process if the file cannot be opened.
+static void writeImage(int *idImage, float *depthImage, int width, int height,
+                       const char *filename) {
+    FILE *out = fopen(filename, "wb");
+    if (out == NULL) {
+        perror(filename);
+        exit(1);
+    }
+
+    fprintf(out, "P6\n%d %d\n255\n", width, height);
+
+    for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+            const int id = idImage[row * width + col];
+
+            // De-interleave the low 24 bits of the id into three 8-bit
+            // channels: id bit 3*n + c feeds channel c (0 = red, 1 = green,
+            // 2 = blue), filling each channel from its high bit downwards.
+            unsigned char rgb[3] = { 0, 0, 0 };
+            for (int n = 0; n < 8; ++n) {
+                for (int c = 0; c < 3; ++c) {
+                    const int bit = ((id & (1 << (3*n + c))) != 0) ? 1 : 0;
+                    rgb[c] |= bit << (7 - n);
+                }
+            }
+            fwrite(rgb, 1, 3, out);
+        }
+    }
+    fclose(out);
+}
+
+
+// Benchmark driver.  Loads "<base>.camera" and "<base>.bvh" (base name given
+// as the only command-line argument), renders the scene three times with the
+// ispc ray tracer and three times with the serial one, prints the fastest
+// time of each, and writes the resulting id images to rt-ispc.ppm and
+// rt-serial.ppm.  Returns 0 on success and 1 on a usage or file error.
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        fprintf(stderr, "usage: rt <filename base>\n");
+        return 1;
+    }
+
+// Reads n consecutive objects of var's type from f into storage starting at
+// var; makes main() return 1 on a short read.
+#define READ(var, n)                                            \
+    if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) {  \
+        fprintf(stderr, "Unexpected EOF reading scene file\n"); \
+        return 1;                                               \
+    } else /* eat ; */
+
+    //
+    // Read the camera specification information from the camera file
+    //
+    char fnbuf[1024];
+    // The precision caps how much of the user-supplied base name is copied,
+    // so the name plus suffix always fits in fnbuf.
+    sprintf(fnbuf, "%.1000s.camera", argv[1]);
+    FILE *f = fopen(fnbuf, "rb");
+    if (!f) {
+        // Report the file that was actually opened, not just the base name.
+        perror(fnbuf);
+        return 1;
+    }
+
+    //
+    // Nothing fancy, and trouble if we run on a big-endian system, just
+    // fread in the bits
+    //
+    int width, height;
+    float camera2world[4][4], raster2camera[4][4];
+    READ(width, 1);
+    READ(height, 1);
+    READ(camera2world[0][0], 16);
+    READ(raster2camera[0][0], 16);
+    // Done with the camera file; close it before f is reused below.
+    fclose(f);
+
+    //
+    // Read in the serialized BVH 
+    //
+    sprintf(fnbuf, "%.1000s.bvh", argv[1]);
+    f = fopen(fnbuf, "rb");
+    if (!f) {
+        // argv[2] does not exist when argc == 2; name the real file instead.
+        perror(fnbuf);
+        return 1;
+    }
+
+    // The BVH file starts with an int that gives the total number of BVH
+    // nodes
+    uint nNodes;
+    READ(nNodes, 1);
+
+    LinearBVHNode *nodes = AllocAligned<LinearBVHNode>(nNodes);
+    for (unsigned int i = 0; i < nNodes; ++i) {
+        // Each node is 6x floats for a bounding box (three for each of its
+        // two corners), followed by two more values read straight into the
+        // node's offset and primsAxis fields.
+        float b[6];
+        READ(b[0], 6);
+        nodes[i].bounds[0].v[0] = b[0];
+        nodes[i].bounds[0].v[1] = b[1];
+        nodes[i].bounds[0].v[2] = b[2];
+        nodes[i].bounds[1].v[0] = b[3];
+        nodes[i].bounds[1].v[1] = b[4];
+        nodes[i].bounds[1].v[2] = b[5];
+        READ(nodes[i].offset, 1);
+        READ(nodes[i].primsAxis, 1);
+    }
+
+    // And then read the triangles 
+    uint nTris;
+    READ(nTris, 1);
+    Triangle *triangles = AllocAligned<Triangle>(nTris);
+    for (uint i = 0; i < nTris; ++i) {
+        // 9x floats for the 3 vertices
+        float v[9];
+        READ(v[0], 9);
+        float *vp = v;
+        for (int j = 0; j < 3; ++j) {
+            triangles[i].p[j].v[0] = *vp++;
+            triangles[i].p[j].v[1] = *vp++;
+            triangles[i].p[j].v[2] = *vp++;
+        }
+        // And create an object id; ids start at 1.
+        triangles[i].id = i+1;
+    }
+    fclose(f);
+
+    // round image resolution up to multiple of 4 to make things easy for
+    // the code that assigns pixels to ispc program instances
+    height = (height + 3) & ~3;
+    width = (width + 3) & ~3;
+
+    // allocate images; one to hold hit object ids, one to hold depth to
+    // the first intersection
+    int *id = new int[width*height];
+    float *image = new float[width*height];
+
+    //
+    // Run 3 iterations with ispc, record the minimum time
+    //
+    double minTimeISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        raytrace(width, height, raster2camera, camera2world, 
+                 image, id, nodes, triangles);
+        double dt = get_elapsed_mcycles();
+        minTimeISPC = std::min(dt, minTimeISPC);
+    }
+    printf("[rt ispc]:\t\t\t[%.3f] million cycles for %d x %d image\n", minTimeISPC, width, height);
+
+    writeImage(id, image, width, height, "rt-ispc.ppm");
+
+    //
+    // And 3 iterations with the serial implementation, reporting the
+    // minimum time.
+    //
+    double minTimeSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        raytrace_serial(width, height, raster2camera, camera2world, 
+                        image, id, nodes, triangles);
+        double dt = get_elapsed_mcycles();
+        minTimeSerial = std::min(dt, minTimeSerial);
+    }
+    printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n", 
+           minTimeSerial, width, height);
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
+
+    writeImage(id, image, width, height, "rt-serial.ppm");
+
+    return 0;
+}
--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -0,0 +1,273 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#define bool int
+// From here on every `bool` token expands to `int`; float3 names a 3-wide float vector.
+typedef float<3> float3;
+
+struct Ray {
+    float3 origin, dir, invDir;        // invDir holds 1/dir per component (see generateRay)
+    uniform unsigned int dirIsNeg[3];  // per axis: 1 if invDir is negative for any program instance
+    float mint, maxt;                  // hits are accepted only for mint <= t <= maxt; maxt shrinks on each hit
+    int hitId;                         // id of the nearest triangle hit so far, 0 if none
+};
+
+struct Triangle {
+    uniform float3 p[3];  // the three vertices
+    uniform int id;       // copied into Ray::hitId when this triangle is hit
+};
+
+struct LinearBVHNode {
+    uniform float3 bounds[2];        // the two box corners tested by BBoxIntersect
+    uniform unsigned int offset;     // index of first primitive for leaf, second child for interior
+    uniform unsigned int primsAxis;  // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
+};
+
+static inline uniform int nPrims(const reference LinearBVHNode node) {
+    return (node.primsAxis & 0xff);  // primitive count lives in bits 0:7
+}
+
+static inline uniform int axis(const reference LinearBVHNode node) {
+    return ((node.primsAxis >> 8) & 0xff);  // split axis lives in bits 8:15
+}
+
+static inline uniform bool isInterior(const reference LinearBVHNode node) {
+    return nPrims(node) == 0;  // a node holding no primitives is an interior node
+}
+
+static inline float3 Cross(const float3 v1, const float3 v2) {
+    float v1x = v1.x, v1y = v1.y, v1z = v1.z;  // pull the components into scalars
+    float v2x = v2.x, v2y = v2.y, v2z = v2.z;
+    float3 ret;                                // ret = v1 x v2 (cross product)
+    ret.x = (v1y * v2z) - (v1z * v2y);
+    ret.y = (v1z * v2x) - (v1x * v2z);
+    ret.z = (v1x * v2y) - (v1y * v2x);
+    return ret;
+}
+
+static inline float Dot(const float3 a, const float3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;  // sum of component-wise products
+}
+
+// Build the camera ray through raster position (x, y) and reset its hit state.
+static void generateRay(uniform const float raster2camera[4][4], 
+                        uniform const float camera2world[4][4],
+                        float x, float y, reference Ray ray) {
+    ray.mint = 0.f;
+    ray.maxt = 1e30f;
+    // hitId 0 marks "no triangle hit yet"
+    ray.hitId = 0;
+
+    // transform raster coordinate (x, y, 0) to camera space (z is 0, so column 2 drops out)
+    float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
+    float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
+    float camz = raster2camera[2][3];
+    float camw = raster2camera[3][3];
+    camx /= camw;
+    camy /= camw;
+    camz /= camw;
+    // direction: the camera-space point multiplied by the upper-left 3x3 of camera2world
+    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
+    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
+    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
+    // origin: column 3 of camera2world divided by its [3][3] entry
+    ray.origin.x = camera2world[0][3] / camera2world[3][3];
+    ray.origin.y = camera2world[1][3] / camera2world[3][3];
+    ray.origin.z = camera2world[2][3] / camera2world[3][3];
+    // component-wise reciprocal of the direction, consumed by BBoxIntersect
+    ray.invDir = 1.f / ray.dir;
+    // dirIsNeg is uniform: an axis is flagged when any program instance is negative on it
+    ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0;
+    ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0;
+    ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0;
+}
+
+// Slab test of a ray against the box bounds[0]..bounds[1], limited to [ray.mint, ray.maxt].
+static inline bool BBoxIntersect(const reference uniform float3 bounds[2], 
+                                 const reference Ray ray) {
+    float t0 = ray.mint, t1 = ray.maxt;
+
+    // Check all three axis-aligned slabs.  Don't try to early out; it's
+    // not worth the trouble
+    float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
+    float3 tFar  = (bounds[1] - ray.origin) * ray.invDir;
+    if (tNear.x > tFar.x) {  // order the pair so that tNear.x <= tFar.x
+        float tmp = tNear.x;
+        tNear.x = tFar.x;
+        tFar.x = tmp;
+    }
+    t0 = max(tNear.x, t0);
+    t1 = min(tFar.x, t1);
+    // same ordering and clipping for the y slab
+    if (tNear.y > tFar.y) {
+        float tmp = tNear.y;
+        tNear.y = tFar.y;
+        tFar.y = tmp;
+    }
+    t0 = max(tNear.y, t0);
+    t1 = min(tFar.y, t1);
+    // and for the z slab
+    if (tNear.z > tFar.z) {
+        float tmp = tNear.z;
+        tNear.z = tFar.z;
+        tFar.z = tmp;
+    }
+    t0 = max(tNear.z, t0);
+    t1 = min(tFar.z, t1);
+    // the box is hit when the clipped interval [t0, t1] is not empty
+    return (t0 <= t1);
+}
+
+
+// Ray/triangle test; on a hit it shortens ray.maxt to the hit distance and records tri.id.
+static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) {
+    uniform float3 e1 = tri.p[1] - tri.p[0];  // the two edges that share vertex p[0]
+    uniform float3 e2 = tri.p[2] - tri.p[0];
+
+    float3 s1 = Cross(ray.dir, e2);
+    float divisor = Dot(s1, e1);
+    bool hit = true;
+    // Failed checks only clear `hit`; there is no early return, so every check below runs.
+    if (divisor == 0.)
+        hit = false;
+    float invDivisor = 1.f / divisor;
+
+    // Compute first barycentric coordinate
+    float3 d = ray.origin - tri.p[0];
+    float b1 = Dot(d, s1) * invDivisor;
+    if (b1 < 0. || b1 > 1.)
+        hit = false;
+
+    // Compute second barycentric coordinate
+    float3 s2 = Cross(d, e1);
+    float b2 = Dot(ray.dir, s2) * invDivisor;
+    if (b2 < 0. || b1 + b2 > 1.)
+        hit = false;
+
+    // Compute _t_ to intersection point
+    float t = Dot(e2, s2) * invDivisor;
+    if (t < ray.mint || t > ray.maxt)
+        hit = false;
+    // Commit the hit: later triangles are rejected once their t exceeds the new maxt.
+    if (hit) {
+        ray.maxt = t;
+        ray.hitId = tri.id;
+    }
+    return hit;
+}
+
+// Walk the BVH with an explicit stack, testing leaf triangles; returns whether anything was hit.
+bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[], 
+                  reference Ray r) {
+    Ray ray = r;  // work on a local copy; maxt and hitId are copied back at the end
+    bool hit = false;
+    // Follow ray through BVH nodes to find primitive intersections
+    uniform int todoOffset = 0, nodeNum = 0;  // stack depth and current node index
+    uniform int todo[64];                     // node indices still to visit (no overflow check)
+
+    while (true) {
+        // Check ray against BVH node
+        LinearBVHNode node = nodes[nodeNum];
+        if (any(BBoxIntersect(node.bounds, ray))) {  // taken when any program instance overlaps the box
+            uniform unsigned int nPrimitives = nPrims(node);
+            if (nPrimitives > 0) {
+                // Intersect ray with primitives in leaf BVH node
+                uniform unsigned int primitivesOffset = node.offset;
+                for (uniform unsigned int i = 0; i < nPrimitives; ++i) {
+                    if (TriIntersect(tris[primitivesOffset+i], ray))
+                        hit = true;
+                }
+                if (todoOffset == 0)  // nothing left on the stack: traversal is finished
+                    break;
+                nodeNum = todo[--todoOffset];
+            }
+            else {
+                // Put far BVH node on _todo_ stack, advance to near node
+                if (r.dirIsNeg[axis(node)]) {  // the two children are nodeNum + 1 and node.offset
+                   todo[todoOffset++] = nodeNum + 1;
+                   nodeNum = node.offset;
+                }
+                else {
+                   todo[todoOffset++] = node.offset;
+                   nodeNum = nodeNum + 1;
+                }
+            }
+        }
+        else {  // no program instance overlaps this box: pop the next node or stop
+            if (todoOffset == 0)
+                break;
+            nodeNum = todo[--todoOffset];
+        }
+    }
+    r.maxt = ray.maxt;    // only these two fields are written back to the caller's ray
+    r.hitId = ray.hitId;
+
+    return hit;
+}
+
+// Exported entry point: one ray per pixel; hit distance goes to image[], triangle id to id[].
+export void raytrace(uniform int width, uniform int height,
+                     const uniform float raster2camera[4][4], 
+                     const uniform float camera2world[4][4],
+                     uniform float image[], uniform int id[],
+                     const LinearBVHNode nodes[],
+                     const Triangle triangles[]) {
+    static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3, 
+                                           0, 1, 0, 1, 2, 3, 2, 3 };
+    static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1, 
+                                           2, 2, 3, 3, 2, 2, 3, 3 };
+    // udx/udy give each of the 16 slots its (dx, dy) inside a 4x4 block, as four 2x2 quads.
+    // The outer loops are always over blocks of 4x4 pixels (no edge check: width and height must be multiples of 4)
+    for (uniform int y = 0; y < height; y += 4) {
+        for (uniform int x = 0; x < width; x += 4) {
+            // Now we have a block of 4x4=16 pixels to process; it will
+            // take 16/programCount iterations of this loop to process
+            // them.
+            for (uniform int o = 0; o < 16 / programCount; ++o) {
+                // Map program instances to samples in the udx/udy arrays
+                // to figure out which pixel each program instance is
+                // responsible for
+                const float dx = udx[o * programCount + programIndex];
+                const float dy = udy[o * programCount + programIndex];
+                // trace this program instance's pixel
+                Ray ray;
+                generateRay(raster2camera, camera2world, x+dx, y+dy, ray);
+                BVHIntersect(nodes, triangles, ray);
+                // row-major pixel index; a pixel with no hit keeps maxt = 1e30f and hitId = 0
+                int offset = (y + (int)dy) * width + (x + (int)dx);
+                image[offset] = ray.maxt;
+                id[offset] = ray.hitId;
+            }
+        }
+    }
+}
--- a/examples/rt/rt.vcxproj
+++ b/examples/rt/rt.vcxproj
@@ -0,0 +1,165 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>rt</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CustomBuild Include="rt.ispc">
+      <FileType>Document</FileType> <!-- each command pipes the output of cl /E into ispc, which writes rt.obj and rt_ispc.h -->
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="rt.cpp" />
+    <ClCompile Include="rt_serial.cpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/rt/rt_serial.cpp
+++ b/examples/rt/rt_serial.cpp
@@ -0,0 +1,288 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <algorithm>
+
+// Just enough of a float3 class to do what we need in this file; 16-byte aligned on both compilers.
+#ifdef _MSC_VER
+__declspec(align(16)) 
+#endif
+struct float3 {
+    float3() { }  // leaves every member uninitialized
+    float3(float xx, float yy, float zz) { x = xx; y = yy; z = zz; }  // pad is not set
+    // Arithmetic below is component-wise on x, y, z; pad never takes part.
+    float3 operator*(float f) const { return float3(x*f, y*f, z*f); }
+    float3 operator-(const float3 &f2) const { 
+        return float3(x-f2.x, y-f2.y, z-f2.z); 
+    }
+    float3 operator*(const float3 &f2) const { 
+        return float3(x*f2.x, y*f2.y, z*f2.z); 
+    }
+    float x, y, z;
+    float pad;  // match padding/alignment of ispc version 
+}
+#ifndef _MSC_VER
+__attribute__ ((aligned(16)))
+#endif
+;
+
+struct Ray {
+    float3 origin, dir, invDir;  // invDir holds 1/dir per component (see generateRay)
+    unsigned int dirIsNeg[3];    // per axis: 1 if invDir is negative, else 0
+    float mint, maxt;            // hits are accepted only for mint <= t <= maxt; maxt shrinks on each hit
+    int hitId;                   // id of the nearest triangle hit so far, 0 if none
+};
+
+
+// Declare these in a namespace so the mangling matches
+namespace ispc {
+    struct Triangle {
+        float3 p[3];  // the three vertices
+        int id;       // copied into Ray::hitId when this triangle is hit
+    };
+
+    struct LinearBVHNode {
+        float3 bounds[2];        // the two box corners tested by BBoxIntersect
+        unsigned int offset;     // index of first primitive for leaf, second child for interior
+        unsigned int primsAxis;  // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
+    };
+}
+
+using namespace ispc;
+
+inline int nPrims(const LinearBVHNode &node) {
+    return (node.primsAxis & 0xff);  // primitive count lives in bits 0:7
+}
+
+inline int axis(const LinearBVHNode &node) {
+    return ((node.primsAxis >> 8) & 0xff);  // split axis lives in bits 8:15
+}
+
+inline bool isInterior(const LinearBVHNode &node) {
+    return nPrims(node) == 0;  // a node holding no primitives is an interior node
+}
+
+inline float3 Cross(const float3 &v1, const float3 &v2) {
+    float v1x = v1.x, v1y = v1.y, v1z = v1.z;  // pull the components into scalars
+    float v2x = v2.x, v2y = v2.y, v2z = v2.z;
+    float3 ret;                                // ret = v1 x v2 (cross product); ret.pad stays unset
+    ret.x = (v1y * v2z) - (v1z * v2y);
+    ret.y = (v1z * v2x) - (v1x * v2z);
+    ret.z = (v1x * v2y) - (v1y * v2x);
+    return ret;
+}
+
+inline float Dot(const float3 &a, const float3 &b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;  // sum of component-wise products
+}
+
+// Build the camera ray through raster position (x, y) and reset its hit state.
+static void generateRay(const float raster2camera[4][4], 
+                        const float camera2world[4][4],
+                        float x, float y, Ray &ray) {
+    ray.mint = 0.f;
+    ray.maxt = 1e30f;
+    // hitId 0 marks "no triangle hit yet"
+    ray.hitId = 0;
+
+    // transform raster coordinate (x, y, 0) to camera space (z is 0, so column 2 drops out)
+    float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
+    float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
+    float camz = raster2camera[2][3];
+    float camw = raster2camera[3][3];
+    camx /= camw;
+    camy /= camw;
+    camz /= camw;
+    // direction: the camera-space point multiplied by the upper-left 3x3 of camera2world
+    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
+    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
+    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
+    // origin: column 3 of camera2world divided by its [3][3] entry
+    ray.origin.x = camera2world[0][3] / camera2world[3][3];
+    ray.origin.y = camera2world[1][3] / camera2world[3][3];
+    ray.origin.z = camera2world[2][3] / camera2world[3][3];
+    // component-wise reciprocal of the direction, consumed by BBoxIntersect
+    ray.invDir.x = 1.f / ray.dir.x;
+    ray.invDir.y = 1.f / ray.dir.y;
+    ray.invDir.z = 1.f / ray.dir.z;
+    // flag each axis whose reciprocal direction is negative
+    ray.dirIsNeg[0] = (ray.invDir.x < 0) ? 1 : 0;
+    ray.dirIsNeg[1] = (ray.invDir.y < 0) ? 1 : 0;
+    ray.dirIsNeg[2] = (ray.invDir.z < 0) ? 1 : 0;
+}
+
+// Slab test of a ray against the box bounds[0]..bounds[1], limited to [ray.mint, ray.maxt].
+static inline bool BBoxIntersect(const float3 bounds[2], 
+                                 const Ray &ray) {
+    float t0 = ray.mint, t1 = ray.maxt;
+    // parametric distances to the two bounding planes on each axis
+    float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
+    float3 tFar  = (bounds[1] - ray.origin) * ray.invDir;
+    if (tNear.x > tFar.x) {  // order the pair so that tNear.x <= tFar.x
+        float tmp = tNear.x;
+        tNear.x = tFar.x;
+        tFar.x = tmp;
+    }
+    t0 = std::max(tNear.x, t0);
+    t1 = std::min(tFar.x, t1);
+    // same ordering and clipping for the y slab
+    if (tNear.y > tFar.y) {
+        float tmp = tNear.y;
+        tNear.y = tFar.y;
+        tFar.y = tmp;
+    }
+    t0 = std::max(tNear.y, t0);
+    t1 = std::min(tFar.y, t1);
+    // and for the z slab
+    if (tNear.z > tFar.z) {
+        float tmp = tNear.z;
+        tNear.z = tFar.z;
+        tFar.z = tmp;
+    }
+    t0 = std::max(tNear.z, t0);
+    t1 = std::min(tFar.z, t1);
+    // the box is hit when the clipped interval [t0, t1] is not empty
+    return (t0 <= t1);
+}
+
+
+// Ray/triangle test; on a hit it shortens ray.maxt to the hit distance and records tri.id.
+inline bool TriIntersect(const Triangle &tri, Ray &ray) {
+    float3 e1 = tri.p[1] - tri.p[0];  // the two edges that share vertex p[0]
+    float3 e2 = tri.p[2] - tri.p[0];
+
+    float3 s1 = Cross(ray.dir, e2);
+    float divisor = Dot(s1, e1);
+    // a zero divisor cannot be inverted, so it is reported as a miss
+    if (divisor == 0.)
+        return false;
+    float invDivisor = 1.f / divisor;
+
+    // Compute first barycentric coordinate
+    float3 d = ray.origin - tri.p[0];
+    float b1 = Dot(d, s1) * invDivisor;
+    if (b1 < 0. || b1 > 1.)
+        return false;
+
+    // Compute second barycentric coordinate
+    float3 s2 = Cross(d, e1);
+    float b2 = Dot(ray.dir, s2) * invDivisor;
+    if (b2 < 0. || b1 + b2 > 1.)
+        return false;
+
+    // Compute _t_ to intersection point
+    float t = Dot(e2, s2) * invDivisor;
+    if (t < ray.mint || t > ray.maxt)
+        return false;
+    // Commit the hit: later triangles are rejected once their t exceeds the new maxt.
+    ray.maxt = t;
+    ray.hitId = tri.id;
+    return true;
+}
+
+// Walk the BVH with an explicit stack, testing leaf triangles; returns whether anything was hit.
+bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[], 
+                  Ray &r) {
+    Ray ray = r;  // work on a local copy; maxt and hitId are copied back at the end
+    bool hit = false;
+    // Follow ray through BVH nodes to find primitive intersections
+    int todoOffset = 0, nodeNum = 0;  // stack depth and current node index
+    int todo[64];                     // node indices still to visit (no overflow check)
+
+    while (true) {
+        // Check ray against BVH node
+        const LinearBVHNode &node = nodes[nodeNum];
+        if (BBoxIntersect(node.bounds, ray)) {
+            unsigned int nPrimitives = nPrims(node);
+            if (nPrimitives > 0) {
+                // Intersect ray with primitives in leaf BVH node
+                unsigned int primitivesOffset = node.offset;
+                for (unsigned int i = 0; i < nPrimitives; ++i) {
+                    if (TriIntersect(tris[primitivesOffset+i], ray))
+                        hit = true;
+                }
+                if (todoOffset == 0)  // nothing left on the stack: traversal is finished
+                    break;
+                nodeNum = todo[--todoOffset];
+            }
+            else {
+                // Put far BVH node on _todo_ stack, advance to near node
+                if (r.dirIsNeg[axis(node)]) {  // the two children are nodeNum + 1 and node.offset
+                   todo[todoOffset++] = nodeNum + 1;
+                   nodeNum = node.offset;
+                }
+                else {
+                   todo[todoOffset++] = node.offset;
+                   nodeNum = nodeNum + 1;
+                }
+            }
+        }
+        else {  // the ray misses this box: pop the next node or stop
+            if (todoOffset == 0)
+                break;
+            nodeNum = todo[--todoOffset];
+        }
+    }
+    r.maxt = ray.maxt;    // only these two fields are written back to the caller's ray
+    r.hitId = ray.hitId;
+
+    return hit;
+}
+
+// Serial reference renderer: one ray per pixel, hit distance to image[], triangle id to id[].
+void raytrace_serial(int width, int height,
+                     const float raster2camera[4][4], 
+                     const float camera2world[4][4],
+                     float image[],
+                     int id[],
+                     const LinearBVHNode nodes[],
+                     const Triangle triangles[]) {
+    for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+            Ray ray;
+            generateRay(raster2camera, camera2world, col, row, ray);
+            BVHIntersect(nodes, triangles, ray);
+            // Row-major pixel slot; a pixel with no hit keeps maxt = 1e30f and hitId = 0.
+            const int pixel = row * width + col;
+            image[pixel] = ray.maxt;
+            id[pixel] = ray.hitId;
+        }
+    }
+}
--- a/examples/rt/sponza.bvh
+++ b/examples/rt/sponza.bvh
--- a/examples/rt/sponza.camera
+++ b/examples/rt/sponza.camera
--- a/examples/rt/teapot.bvh
+++ b/examples/rt/teapot.bvh
--- a/examples/rt/teapot.camera
+++ b/examples/rt/teapot.camera
--- a/examples/simple/Makefile
+++ b/examples/simple/Makefile
@@ -0,0 +1,25 @@
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall
+ISPC=ispc
+ISPCFLAGS=-O2
+
+default: simple
+# The header ispc generates is objs/simple_ispc.h; that is the file to keep precious.
+.PHONY: dirs clean
+.PRECIOUS: objs/simple_ispc.h
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ simple
+# Link the C++ driver with the ispc-compiled object.
+simple: dirs  objs/simple.o objs/simple_ispc.o
+	$(CXX) $(CXXFLAGS) -o $@ objs/simple.o objs/simple_ispc.o
+# The driver includes the generated header, so the header is a prerequisite.
+objs/simple.o: simple.cpp objs/simple_ispc.h
+	$(CXX) $(CXXFLAGS) -c -o $@ $<
+# A single ispc run produces both the object file and the header.
+objs/%_ispc.h objs/%_ispc.o: %.ispc
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -0,0 +1,63 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <stdio.h>
+
+// Include the header file that the ispc compiler generates
+#include "simple_ispc.h"
+using namespace ispc;
+
+int main() {
+    // ispc-compiled code currently needs its pointer arguments aligned to the
+    // target's native vector size; 32 bytes is enough for SSE and AVX alike.
+    const int kCount = 16;
+#ifdef _MSC_VER
+    __declspec(align(32)) float vin[kCount], vout[kCount];
+#else
+    float vin[kCount] __attribute__((aligned(32)));
+    float vout[kCount] __attribute__((aligned(32)));
+#endif
+
+    // Fill the input with 0, 1, 2, ...
+    for (int idx = 0; idx != kCount; ++idx)
+        vin[idx] = (float)idx;
+
+    // Run the function exported by simple.ispc over the whole buffer
+    simple(vin, vout, kCount);
+
+    // Show every input next to its result
+    for (int idx = 0; idx != kCount; ++idx)
+        printf("%d: simple(%f) = %f\n", idx, vin[idx], vout[idx]);
+
+    return 0;
+}
--- a/examples/simple/simple.ispc
+++ b/examples/simple/simple.ispc
@@ -0,0 +1,53 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+// For each of the first `count` elements: vout = vin * vin where vin < 3, else sqrt(vin).
+export void simple(uniform float vin[], uniform float vout[], 
+                   uniform int count) {
+    // Compute the result for 'programCount' values in parallel
+    for (uniform int i = 0; i < count; i += programCount) {
+        int index = i + programIndex;  // not checked against count, so count must be a multiple of programCount
+        // Load the appropriate input value for this program instance.
+        float v = vin[index];
+
+        // Do an arbitrary little computation, but at least make the
+        // computation dependent on the value being processed
+        if (v < 3.)
+            v = v * v;
+        else
+            v = sqrt(v);
+
+        // And write the result to the output array.
+        vout[index] = v;
+    }
+}
--- a/examples/simple/simple.vcxproj
+++ b/examples/simple/simple.vcxproj
@@ -0,0 +1,164 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="simple.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="simple.ispc"> <!-- Each Command pipes the cl /E preprocessed source into ispc; the lone '-' is ispc's input-file argument (presumably stdin) and must appear in every configuration. -->
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{947C5311-8B78-4D05-BEE4-BCF342D4B367}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>simple</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/timing.h
+++ b/examples/timing.h
@@ -0,0 +1,67 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <stdint.h>
+
+
+#ifdef WIN32  // WIN32 builds: calls to rdtsc() expand to __rdtsc() via the macro below
+#include <windows.h>
+#define rdtsc __rdtsc
+#else  // non-WIN32 builds: define rdtsc() with GCC-style inline assembly
+extern "C" {
+    __inline__ uint64_t rdtsc() {  // returns the counter read by the rdtsc instruction as one 64-bit value
+        uint32_t low, high;
+        __asm__ __volatile__ (
+            "xorl %%eax,%%eax \n    cpuid"  // eax = 0, then cpuid (presumably as a serializing barrier -- confirm)
+            ::: "%rax", "%rbx", "%rcx", "%rdx" );  // NOTE(review): 64-bit register names; looks x86-64 only -- confirm
+        __asm__ __volatile__ (
+                              "rdtsc" : "=a" (low), "=d" (high));  // eax -> low, edx -> high
+        return (uint64_t)high << 32 | low;  // 'high' in bits 63..32, 'low' in bits 31..0
+    }
+}
+#endif            // WIN32
+            
+static uint64_t start, end;  // rdtsc() readings: 'start' is set by reset_and_start_timer(), 'end' by get_elapsed_mcycles()
+
+static inline void reset_and_start_timer()  // stores the current rdtsc() reading in 'start'
+{
+    start = rdtsc();
+}
+
+/* Returns the rdtsc() ticks elapsed since the last reset_and_start_timer()
+   call, divided by 1024*1024 (2^20) -- close to, but not exactly, millions. */
+static inline double get_elapsed_mcycles()
+{
+    end = rdtsc();  // the reading is also kept in the file-level 'end'
+    return (end-start) / (1024. * 1024.);
+}
--- a/expr.cpp
+++ b/expr.cpp
--- a/expr.h
+++ b/expr.h
@@ -0,0 +1,543 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file expr.h
+    @brief Expr abstract base class and expression implementations
+*/
+
+#ifndef ISPC_EXPR_H
+#define ISPC_EXPR_H 1
+
+#include "ispc.h"
+
+class FunctionSymbolExpr;
+
+/** @brief Expr is the abstract base class that defines the interface that
+    all expression types must implement: GetValue(), GetType(), Optimize(),
+    TypeCheck() and Print() are pure virtual and must be overridden. */
+class Expr : public ASTNode {
+public:
+    Expr(SourcePos p) : ASTNode(p) { }  ///< Passes the source position on to ASTNode.
+
+    /** This is the main method for Expr implementations to implement.  It
+        should call methods in the FunctionEmitContext to emit LLVM IR
+        instructions to the current basic block in order to generate an
+        llvm::Value that represents the expression's value. */
+    virtual llvm::Value *GetValue(FunctionEmitContext *ctx) const = 0;
+
+    /** For expressions that can provide an lvalue (e.g. array indexing),
+        this function should emit IR that computes the expression's lvalue
+        and returns the corresponding llvm::Value.  Expressions that can't
+        provide an lvalue should not override this; the default
+        implementation returns NULL.  */
+    virtual llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
+
+    /** Returns the Type of the expression. */
+    virtual const Type *GetType() const = 0;
+
+    /** For expressions that have values based on a symbol (e.g. regular
+        symbol references, array indexing, etc.), this returns a pointer to
+        that symbol.  NOTE(review): default presumably returns NULL -- confirm. */
+    virtual Symbol *GetBaseSymbol() const;
+
+    /** If this is a constant expression that can be converted to a
+        constant of the given type, this method should return the
+        corresponding llvm::Constant value.  Otherwise it should return
+        NULL. */
+    virtual llvm::Constant *GetConstant(const Type *type) const;
+
+    /** This method should perform early optimizations of the expression
+        (constant folding, etc.) and return a pointer to the resulting
+        expression.  If an error is encountered during optimization, NULL
+        should be returned. */
+    virtual Expr *Optimize() = 0;
+
+    /** This method should perform type checking of the expression and
+        return a pointer to the resulting expression.  If an error is
+        encountered, NULL should be returned. */
+    virtual Expr *TypeCheck() = 0;
+
+    /** Prints the expression to standard output (used for debugging). */
+    virtual void Print() const = 0;
+
+    /** This method tries to convert the expression to the given type.  In
+        the event of failure, if the failureOk parameter is true, then no
+        error is issued.  If failureOk is false, then an error is printed
+        that incorporates the given error message string.  In either
+        failure case, NULL is returned.  (Non-virtual: shared by all Exprs.) */
+    Expr *TypeConv(const Type *type, const char *errorMsgBase = NULL, 
+                   bool failureOk = false);
+};
+
+
+/** @brief Unary expression: one of the Op operators applied to a single operand expression. */
+class UnaryExpr : public Expr {
+public:
+    enum Op {
+        PreInc,      ///< Pre-increment
+        PreDec,      ///< Pre-decrement
+        PostInc,     ///< Post-increment
+        PostDec,     ///< Post-decrement
+        Negate,      ///< Negation
+        LogicalNot,  ///< Logical not
+        BitNot,      ///< Bit not
+    };
+
+    UnaryExpr(Op op, Expr *expr, SourcePos pos);  ///< Takes the operator, the operand and the source position.
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    void Print() const;
+    Expr *Optimize();
+    Expr *TypeCheck();
+
+private:
+    const Op op;  ///< Operator kind; const, so fixed at construction.
+    Expr *expr;   ///< Operand expression.
+};
+
+
+/** @brief Binary expression: one of the Op operators applied to two operand expressions. */
+class BinaryExpr : public Expr {
+public:
+    enum Op {
+        Add,           ///< Addition
+        Sub,           ///< Subtraction
+        Mul,           ///< Multiplication
+        Div,           ///< Division
+        Mod,           ///< Modulus
+        Shl,           ///< Shift left
+        Shr,           ///< Shift right
+
+        Lt,            ///< Less than
+        Gt,            ///< Greater than
+        Le,            ///< Less than or equal
+        Ge,            ///< Greater than or equal
+        Equal,         ///< Equal
+        NotEqual,      ///< Not equal
+
+        BitAnd,        ///< Bitwise AND
+        BitXor,        ///< Bitwise XOR
+        BitOr,         ///< Bitwise OR
+        LogicalAnd,    ///< Logical AND
+        LogicalOr,     ///< Logical OR
+
+        Comma,         ///< Comma operator
+    };
+
+    BinaryExpr(Op o, Expr *a, Expr *b, SourcePos p);  ///< Takes the operator, two operands and the source position.
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    void Print() const;
+
+    Expr *Optimize();
+    Expr *TypeCheck();
+
+private:
+    const Op op;        ///< Operator kind; const, so fixed at construction.
+    Expr *arg0, *arg1;  ///< The two operands; presumably left then right -- confirm in expr.cpp.
+};
+
+
+/** @brief Assignment expression: plain '=' or one of the compound forms listed in Op. */
+class AssignExpr : public Expr {
+public:
+    enum Op {
+        Assign,     ///< Regular assignment
+        MulAssign,  ///< *= assignment
+        DivAssign,  ///< /= assignment
+        ModAssign,  ///< %= assignment
+        AddAssign,  ///< += assignment
+        SubAssign,  ///< -= assignment
+        ShlAssign,  ///< <<= assignment
+        ShrAssign,  ///< >>= assignment
+        AndAssign,  ///< &= assignment
+        XorAssign,  ///< ^= assignment
+        OrAssign,   ///< |= assignment
+    };
+
+    AssignExpr(Op o, Expr *a, Expr *b, SourcePos p);  ///< Takes the operator, two expressions and the source position.
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    void Print() const;
+
+    Expr *Optimize();
+    Expr *TypeCheck();
+
+private:
+    const Op op;            ///< Assignment kind; const, so fixed at construction.
+    Expr *lvalue, *rvalue;  ///< Assignment target and assigned value (presumably ctor args a and b -- confirm).
+};
+
+
+/** @brief Selection expression, corresponding to "test ? a : b".
+
+    Returns the value of "a" or "b", depending on the value of "test".
+*/
+class SelectExpr : public Expr {
+public:
+    SelectExpr(Expr *test, Expr *a, Expr *b, SourcePos p);  ///< Takes the condition, both alternatives and the source position.
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    void Print() const;
+
+    Expr *Optimize();
+    Expr *TypeCheck();
+
+private:
+    Expr *test, *expr1, *expr2;  ///< Condition and the two alternatives (expr1/expr2 presumably "a"/"b" -- confirm).
+};
+
+
+/** @brief A list of expressions.
+
+    These are mostly used for representing curly-brace delimited
+    initializers for complex types and for representing the arguments
+    passed to a function call.
+ */
+class ExprList : public Expr {
+public:
+    ExprList(SourcePos p) : Expr(p) { }  ///< Creates an empty list.
+    ExprList(Expr *e, SourcePos p) : Expr(p) { exprs.push_back(e); }  ///< Creates a list holding just e.
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    void Print() const;
+    llvm::Constant *GetConstant(const Type *type) const;
+    ExprList *Optimize();   ///< Return type narrowed from Expr * to ExprList *.
+    ExprList *TypeCheck();  ///< Return type narrowed from Expr * to ExprList *.
+
+    std::vector<Expr *> exprs;  ///< The listed expressions; public, so other code can access it directly.
+};
+
+
+/** @brief Expression representing a function call: a callee expression
+    plus an ExprList of arguments. */
+class FunctionCallExpr : public Expr {
+public:
+    FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    void Print() const;
+
+    Expr *Optimize();
+    Expr *TypeCheck();
+
+private:
+    Expr *func;      ///< Expression for the function being called.
+    ExprList *args;  ///< Arguments passed to the call.
+    bool isLaunch;   ///< NOTE(review): presumably true when the call launches a task -- confirm in expr.cpp.
+
+    void resolveFunctionOverloads();  ///< Sets FunctionSymbolExpr::matchingFunc (per that member's comment).
+    bool tryResolve(bool (*matchFunc)(Expr *, const Type *));  ///< Takes a predicate over (Expr *, const Type *).
+};
+
+
+/** @brief Expression representing indexing into something with an integer
+    offset.
+
+    This is used for both array indexing and indexing into VectorTypes;
+    it overrides GetLValue() and GetBaseSymbol() as well. */
+class IndexExpr : public Expr {
+public:
+    IndexExpr(Expr *arrayOrVector, Expr *index, SourcePos p);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Symbol *GetBaseSymbol() const;
+    void Print() const;
+
+    Expr *Optimize();
+    Expr *TypeCheck();
+
+private:
+    Expr *arrayOrVector, *index;  ///< The expression being indexed and the index expression.
+};
+
+
+/** @brief Expression representing member selection ("foo.bar"); it
+    overrides GetLValue() and GetBaseSymbol() as well. */
+class MemberExpr : public Expr {
+public:
+    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
+               SourcePos identifierPos);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Symbol *GetBaseSymbol() const;
+    void Print() const;
+    Expr *Optimize();
+    Expr *TypeCheck();
+
+private:
+    std::string getCandidateNearMatches() const;  ///< NOTE(review): presumably similar member names for error text -- confirm.
+    int getElementNumber() const;                 ///< NOTE(review): presumably the index of the selected member -- confirm.
+
+    Expr *expr;                     ///< Expression whose member is selected ("foo" in "foo.bar").
+    std::string identifier;         ///< Name of the selected member ("bar" in "foo.bar").
+    const SourcePos identifierPos;  ///< Source position of the identifier; const, so fixed at construction.
+};
+
+
+/** @brief Expression representing a compile-time constant value.
+
+    This class can currently represent compile-time constants of anything
+    that is an AtomicType; for anything more complex, we don't currently
+    have a representation of a compile-time constant that can be further
+    reasoned about.
+ */
+class ConstExpr : public Expr {
+public:
+    /** Create a ConstExpr from a uniform int32 value */
+    ConstExpr(const Type *t, int32_t i, SourcePos p);
+    /** Create a ConstExpr from a varying int32 value */
+    ConstExpr(const Type *t, int32_t *i, SourcePos p);
+    /** Create a ConstExpr from a uniform uint32 value */
+    ConstExpr(const Type *t, uint32_t u, SourcePos p);
+    /** Create a ConstExpr from a varying uint32 value */
+    ConstExpr(const Type *t, uint32_t *u, SourcePos p);
+    /** Create a ConstExpr from a uniform float value */
+    ConstExpr(const Type *t, float f, SourcePos p);
+    /** Create a ConstExpr from a varying float value */
+    ConstExpr(const Type *t, float *f, SourcePos p);
+    /** Create a ConstExpr from a uniform double value */
+    ConstExpr(const Type *t, double d, SourcePos p);
+    /** Create a ConstExpr from a varying double value */
+    ConstExpr(const Type *t, double *d, SourcePos p);
+    /** Create a ConstExpr from a uniform int64 value */
+    ConstExpr(const Type *t, int64_t i, SourcePos p);
+    /** Create a ConstExpr from a varying int64 value */
+    ConstExpr(const Type *t, int64_t *i, SourcePos p);
+    /** Create a ConstExpr from a uniform uint64 value */
+    ConstExpr(const Type *t, uint64_t i, SourcePos p);
+    /** Create a ConstExpr from a varying uint64 value */
+    ConstExpr(const Type *t, uint64_t *i, SourcePos p);
+    /** Create a ConstExpr from a uniform bool value */
+    ConstExpr(const Type *t, bool b, SourcePos p);
+    /** Create a ConstExpr from a varying bool value */
+    ConstExpr(const Type *t, bool *b, SourcePos p);
+    /** Create a ConstExpr of the same type as the given old ConstExpr,
+        with values given by the "values" parameter. */
+    ConstExpr(ConstExpr *old, double *values);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    void Print() const;
+    llvm::Constant *GetConstant(const Type *type) const;
+
+    Expr *TypeCheck();
+    Expr *Optimize();
+
+    /** Return the ConstExpr's values as booleans, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsBool(bool *, bool forceVarying = false) const;  ///< NOTE(review): int result of the As*() methods is presumably the value count -- confirm.
+
+    /** Return the ConstExpr's values as int32s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsInt32(int32_t *, bool forceVarying = false) const;
+
+    /** Return the ConstExpr's values as uint32s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsUInt32(uint32_t *, bool forceVarying = false) const;
+
+    /** Return the ConstExpr's values as floats, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsFloat(float *, bool forceVarying = false) const;
+
+    /** Return the ConstExpr's values as int64s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsInt64(int64_t *, bool forceVarying = false) const;
+
+    /** Return the ConstExpr's values as uint64s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsUInt64(uint64_t *, bool forceVarying = false) const;
+
+    /** Return the ConstExpr's values as doubles, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsDouble(double *, bool forceVarying = false) const;
+
+    /** Return the number of values in the ConstExpr; should be either 1,
+        if it has uniform type, or the target's vector width if it's
+        varying. */
+    int Count() const;
+
+private:
+    const AtomicType *type;  ///< Type of the constant.
+    union {  // overlapping storage: one array of ISPC_MAX_NVEC elements per element type
+        int32_t int32Val[ISPC_MAX_NVEC];
+        uint32_t uint32Val[ISPC_MAX_NVEC];
+        bool boolVal[ISPC_MAX_NVEC];
+        float floatVal[ISPC_MAX_NVEC];
+        double doubleVal[ISPC_MAX_NVEC];
+        int64_t int64Val[ISPC_MAX_NVEC];
+        uint64_t uint64Val[ISPC_MAX_NVEC];
+    };  // NOTE(review): the member in use presumably follows 'type' -- confirm in expr.cpp
+};
+
+
+/** @brief Expression representing a type cast of the given expression to a
+    probably-different type. */
+class TypeCastExpr : public Expr {
+public:
+    TypeCastExpr(const Type *t, Expr *e, SourcePos p);  ///< Takes a type, an expression and the source position.
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    void Print() const;
+    Expr *TypeCheck();
+    Expr *Optimize();
+
+private:
+    const Type *type;  ///< Presumably the destination type of the cast -- confirm in expr.cpp.
+    Expr *expr;        ///< Expression being cast.
+};
+
+
+/** @brief Expression that represents taking a reference of a (non-reference)
+    variable.  Overrides GetBaseSymbol() but not GetLValue(). */
+class ReferenceExpr : public Expr {
+public:
+    ReferenceExpr(Expr *e, SourcePos p);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Symbol *GetBaseSymbol() const;
+    void Print() const;
+    Expr *TypeCheck();
+    Expr *Optimize();
+
+private:
+    Expr *expr;  ///< Expression a reference is taken of.
+};
+
+
+/** @brief Expression that represents dereferencing a reference to get its
+    value.  Overrides both GetLValue() and GetBaseSymbol(). */
+class DereferenceExpr : public Expr {
+public:
+    DereferenceExpr(Expr *e, SourcePos p);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Symbol *GetBaseSymbol() const;
+    void Print() const;
+    Expr *TypeCheck();
+    Expr *Optimize();
+
+private:
+    Expr *expr;  ///< Expression being dereferenced.
+};
+
+
+/** @brief Expression representing a symbol reference in the program; overrides GetLValue() and GetBaseSymbol(). */
+class SymbolExpr : public Expr {
+public:
+    SymbolExpr(Symbol *s, SourcePos p);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Symbol *GetBaseSymbol() const;
+    Expr *TypeCheck();
+    Expr *Optimize();
+    void Print() const;
+
+private:
+    Symbol *symbol;  ///< The referenced symbol.
+};
+
+
+/** @brief Expression representing a function symbol in the program (generally
+    used for a function call).
+ */
+class FunctionSymbolExpr : public Expr {
+public:
+    FunctionSymbolExpr(std::vector<Symbol *> *candidateFunctions, 
+                       SourcePos pos);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Symbol *GetBaseSymbol() const;
+    Expr *TypeCheck();
+    Expr *Optimize();
+    void Print() const;
+
+private:
+    friend class FunctionCallExpr;  ///< Gives FunctionCallExpr access to the private members below.
+
+    /** All of the functions with the name given in the function call;
+        there may be more than one, in which case we need to resolve which
+        overload is the best match. */
+    std::vector<Symbol *> *candidateFunctions;
+
+    /** The actual matching function found after overload resolution; this
+        value is set by FunctionCallExpr::resolveFunctionOverloads() */
+    Symbol *matchingFunc;
+};
+
+
+/** @brief A sync statement in the program (waits for all launched tasks before
+    proceeding); it is modeled as an Expr subclass. */
+class SyncExpr : public Expr {
+public:
+    SyncExpr(SourcePos p) : Expr(p) { }  ///< Takes only a source position; the class declares no data members.
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Expr *TypeCheck();
+    Expr *Optimize();
+    void Print() const;
+};
+
+#endif // ISPC_EXPR_H
--- a/failing_tests/max-uint-1.ispc
+++ b/failing_tests/max-uint-1.ispc
@@ -0,0 +1,19 @@
+static float float4(uniform float a, uniform float b, uniform float c, 
+                    uniform float d) {
+    float ret = 0;
+    for (uniform int i = 0; i < programCount; i += 4) {  // i = 0, 4, 8, ...: one group of four per pass
+        ret = insert(ret, i + 0, a);  // insert() is defined elsewhere; presumably returns ret with element i+0 set to a
+        ret = insert(ret, i + 1, b);
+        ret = insert(ret, i + 2, c);
+        ret = insert(ret, i + 3, d);
+    }
+    return ret;
+}
+
+export float f_f(float a) {
+    unsigned int i = (unsigned int)a;  // float -> unsigned int conversion of the input
+    return max((unsigned int)2, i);  // max() of two unsigned int operands, returned as float
+}
+
+export float result() { return float4(2,2,3,4); }  // presumably the values f_f is expected to produce -- confirm against the test driver
+
--- a/failing_tests/max-uint.ispc
+++ b/failing_tests/max-uint.ispc
@@ -0,0 +1,8 @@
+
+export float f_f(float a) {
+    unsigned int i = (unsigned int)a;  // float -> unsigned int conversion of the input
+    return max((unsigned int)10, i);  // max() of two unsigned int operands, returned as float
+}
+
+export float result() { return 10; }  // presumably the value f_f is expected to produce -- confirm against the test driver
+
--- a/failing_tests/min-uint-1.ispc
+++ b/failing_tests/min-uint-1.ispc
@@ -0,0 +1,19 @@
+static float float4(uniform float a, uniform float b, uniform float c, 
+                    uniform float d) {
+    float ret = 0;
+    for (uniform int i = 0; i < programCount; i += 4) {  // i = 0, 4, 8, ...: one group of four per pass
+        ret = insert(ret, i + 0, a);  // insert() is defined elsewhere; presumably returns ret with element i+0 set to a
+        ret = insert(ret, i + 1, b);
+        ret = insert(ret, i + 2, c);
+        ret = insert(ret, i + 3, d);
+    }
+    return ret;
+}
+
+export float f_f(float a) {
+    unsigned int i = (unsigned int)a;  // float -> unsigned int conversion of the input
+    return min((unsigned int)2, i);  // min() of two unsigned int operands, returned as float
+}
+
+export float result() { return float4(1,2,2,2); }  // presumably the values f_f is expected to produce -- confirm against the test driver
+
--- a/failing_tests/min-uint-2.ispc
+++ b/failing_tests/min-uint-2.ispc
@@ -0,0 +1,19 @@
+static float float4(uniform float a, uniform float b, uniform float c, 
+                    uniform float d) {
+    float ret = 0;
+    for (uniform int i = 0; i < programCount; i += 4) {  // i = 0, 4, 8, ...: one group of four per pass
+        ret = insert(ret, i + 0, a);  // insert() is defined elsewhere; presumably returns ret with element i+0 set to a
+        ret = insert(ret, i + 1, b);
+        ret = insert(ret, i + 2, c);
+        ret = insert(ret, i + 3, d);
+    }
+    return ret;
+}
+
+export float f_f(float a) {
+    unsigned int i = (unsigned int)a;  // float -> unsigned int conversion of the input
+    return min((unsigned int)20, i);  // min() of two unsigned int operands, returned as float
+}
+
+export float result() { return float4(1,2,3,4); }  // presumably the values f_f is expected to produce -- confirm against the test driver
+
--- a/failing_tests/struct-array-assign.ispc
+++ b/failing_tests/struct-array-assign.ispc
@@ -0,0 +1,11 @@
+
+struct Foo {  // single-member struct used as the array element type below
+    float f;
+};
+
+
+export float foo(Foo f[], int i, uniform int j) {  // 'j' is not used in the body
+    Foo x = f[i];  // copies a whole struct out of the array; 'i' is declared without 'uniform'
+    return x.f;
+}
+
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -0,0 +1,137 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file ispc.cpp
+    @brief ispc global definitions
+*/
+
+#include "ispc.h"
+#include "module.h"
+#include "util.h"
+#include <stdio.h>
+#ifdef ISPC_IS_WINDOWS
+#include <windows.h>
+#include <direct.h>
+#endif
+#include <llvm/LLVMContext.h>
+#include <llvm/Module.h>
+#ifndef LLVM_2_8
+#include <llvm/Analysis/DIBuilder.h>
+#endif
+#include <llvm/Analysis/DebugInfo.h>
+#include <llvm/Support/Dwarf.h>
+
+Globals *g;  // definition of the global settings pointer declared extern in ispc.h
+Module *m;   // definition of the global Module pointer declared extern in ispc.h
+
+///////////////////////////////////////////////////////////////////////////
+// Target
+
+Target::Target() {
+    // Default target: 4-wide SSE4 code for a 64-bit "nehalem" CPU.
+    isa = SSE4;
+    arch = "x86-64";
+    cpu = "nehalem";
+    nativeVectorWidth = vectorWidth = 4;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Opt
+
+Opt::Opt() {
+    // Highest optimization level by default, without unsafe fast math.
+    level = 1;
+    fastMath = false;
+    // Every "disable..." switch starts out false, so nothing that these
+    // flags control is turned off unless a flag is set afterwards.
+    disableBlendedMaskedStores = disableCoherentControlFlow = false;
+    disableUniformControlFlow = disableGatherScatterOptimizations = false;
+    disableMaskedStoreToStore = disableGatherScatterFlattening = false;
+    disableUniformMemoryOptimizations = false;
+    disableMaskedStoreOptimizations = false;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Globals
+
+Globals::Globals() {
+    // Default settings for a compilation run.
+    mathLib = Globals::Math_ISPC;
+    includeStdlib = runCPP = emitPerfWarnings = true;
+    debugPrint = disableWarnings = false;
+    emitInstrumentation = generateDebuggingSymbols = false;
+
+    ctx = new llvm::LLVMContext;
+
+    // getcwd() leaves the buffer unspecified on failure; fall back to an
+    // empty path so later readers (e.g. GetDIFile) never see garbage.
+#ifdef ISPC_IS_WINDOWS
+    const char *cwd = _getcwd(currentDirectory, sizeof(currentDirectory));
+#else
+    const char *cwd = getcwd(currentDirectory, sizeof(currentDirectory));
+#endif
+    if (cwd == NULL)
+        currentDirectory[0] = '\0';
+}
+
+///////////////////////////////////////////////////////////////////////////
+// ASTNode
+
+ASTNode::~ASTNode() {  // empty body; out-of-line definition of the virtual destructor declared in ispc.h
+}
+
+///////////////////////////////////////////////////////////////////////////
+// SourcePos
+
+SourcePos::SourcePos(const char *n, int l, int c)
+    : first_line(l), first_column(c), last_line(l), last_column(c) {
+    // Default to the module's identifier, but never dereference a NULL 'm'.
+    name = n ? n : (m ? m->module->getModuleIdentifier().c_str() : "(unknown)");
+}
+
+llvm::DIFile SourcePos::GetDIFile() const {
+#ifndef LLVM_2_8
+    std::string dirPart, filePart;  // filled in by the helper call below
+    GetDirectoryAndFileName(g->currentDirectory, name, &dirPart, &filePart);
+    return m->diBuilder->createFile(filePart, dirPart);
+#else  // LLVM 2.8: DIBuilder.h is not included; return an empty DIFile
+    return llvm::DIFile();
+#endif // !LLVM_2_8
+}
+
+
+void SourcePos::Print() const {
+    // Emits " @ [name:firstLine.firstCol - lastLine.lastCol] " on stdout.
+    fprintf(stdout, " @ [%s:%d.%d - %d.%d] ",
+            name, first_line, first_column, last_line, last_column);
+}
--- a/ispc.h
+++ b/ispc.h
@@ -0,0 +1,313 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file ispc.h
+    @brief Main ispc header file
+*/
+
+#ifndef ISPC_H
+#define ISPC_H
+
+#if defined(_WIN32) || defined(_WIN64)  // at most one ISPC_IS_* macro is set, from compiler predefines
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif  // no ISPC_IS_* macro is defined on any other platform
+
+#include <assert.h>
+#include <stdint.h>
+#include <vector>
+#include <string>
+
+/** @def ISPC_MAX_NVEC maximum vector size of any of the compilation
+    targets.
+ */
+#define ISPC_MAX_NVEC 16
+
+// Forward declarations of a number of widely-used LLVM types
+namespace llvm {
+    class BasicBlock;
+    class Constant;
+    class ConstantValue;  // NOTE(review): not aware of an llvm::ConstantValue class -- possibly stale; confirm
+    class DIBuilder;
+    class DIDescriptor;
+    class DIFile;
+    class DIType;
+    class Function;
+    class FunctionType;
+    class LLVMContext;
+    class Module;
+    class Type;
+    class Value;
+} // namespace llvm
+
+class ArrayType;  // forward declarations in the global namespace (distinct from llvm::FunctionType/Module/Type above)
+class AtomicType;
+class DeclSpecs;
+class Declaration;
+class Declarator;
+class FunctionEmitContext;
+class Expr;
+class ExprList;
+class FunctionType;
+class GatherBuffer;
+class Module;
+class Stmt;
+class Symbol;
+class SymbolTable;
+class Type;
+
+/** @brief Representation of a range of positions in a source file.
+
+    This class represents a range of characters in a source file
+    (e.g. those that span a token's definition), from starting line and
+    column to ending line and column.  (These values are tracked by the
+    lexing code).  Both lines and columns are counted starting from one.
+ */
+struct SourcePos {
+    SourcePos(const char *n = NULL, int l = 0, int c = 0);  // range starts collapsed to the single position (l, c)
+
+    const char *name;  // file name; a non-NULL n is stored as given (the string is not copied)
+    int first_line;
+    int first_column;
+    int last_line;
+    int last_column;
+
+    /** Prints the filename and line/column range to standard output. */
+    void Print() const;
+
+    /** Returns an LLVM DIFile object that represents the SourcePos's file */
+    llvm::DIFile GetDIFile() const;
+};
+
+
+/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
+
+    This class defines a basic interface that all abstract syntax tree
+    (AST) nodes must implement.  The base classes for both expressions
+    (Expr) and statements (Stmt) inherit from this class.
+*/
+class ASTNode {
+public:
+    ASTNode(SourcePos p) : pos(p) { }  // copies p into the const member 'pos'
+    virtual ~ASTNode();
+
+    /** The Optimize() method should perform any appropriate early-stage
+        optimizations on the node (e.g. constant folding).  The caller
+        should use the returned ASTNode * in place of the original node.
+        This method may return NULL if an error is encountered during
+        optimization. */
+    virtual ASTNode *Optimize() = 0;
+
+    /** Type checking should be performed by the node when this method is
+        called.  In the event of an error, a NULL value may be returned.
+        As with ASTNode::Optimize(), the caller should store the returned
+        pointer in place of the original ASTNode *. */
+    virtual ASTNode *TypeCheck() = 0;
+
+    /** All AST nodes must track the file position where they are
+        defined. */
+    const SourcePos pos;
+};
+
+/** @brief Structure that defines a compilation target 
+
+    This structure defines a compilation target for the ispc compiler.
+*/
+struct Target {
+    Target();  // defaults: "x86-64", "nehalem", SSE4, both widths 4 (see ispc.cpp)
+
+    /** Enumerant giving the instruction sets that the compiler can
+        target. */
+    enum ISA { SSE2, SSE4, AVX };
+
+    /** Instruction set being compiled to. */
+    ISA isa;
+
+    /** Target system architecture.  (e.g. "x86-64", "x86"). */
+    std::string arch;
+
+    /** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
+    std::string cpu;
+
+    /** Native vector width of the vector instruction set.  Note that this
+        value is directly derived from the ISA being used (e.g. it's 4 for
+        SSE, 8 for AVX, etc.) */
+    int nativeVectorWidth;
+
+    /** Actual vector width currently being compiled to.  This may be an
+        integer multiple of the native vector width, for example if we're
+        "doubling up" and compiling 8-wide on a 4-wide SSE system. */
+    int vectorWidth;
+};
+
+/** @brief Structure that collects optimization options
+
+    This structure collects all of the options related to optimization of
+    generated code. 
+*/
+struct Opt {
+    Opt();  // defaults: level 1, fastMath off, every disable* flag false (see ispc.cpp)
+    
+    /** Optimization level.  Currently, the only valid values are 0,
+        indicating essentially no optimization, and 1, indicating as much
+        optimization as possible. */
+    int level;
+
+    /** Indicates whether "fast and loose" numerically unsafe optimizations
+        should be performed.  This is false by default. */
+    bool fastMath;
+
+    /** On targets that don't have a masked store instruction but do have a
+        blending instruction, by default, we simulate masked stores by
+        loading the old value, blending, and storing the result.  This can
+        potentially be unsafe in multi-threaded code, in that it writes to
+        locations that aren't supposed to be written to.  Setting this
+        value to true disables this work-around, and instead implements
+        masked stores by 'scalarizing' them, so that we iterate over the
+        SIMD lanes and do a scalar write for the ones that are running. */
+    bool disableBlendedMaskedStores;
+
+    /** Disables the 'coherent control flow' constructs in the
+        language. (e.g. this causes "cif" statements to be demoted to "if"
+        statements.)  This is likely only useful for measuring the impact
+        of coherent control flow. */
+    bool disableCoherentControlFlow;
+
+    /** Disables uniform control flow optimizations (e.g. this changes an
+        "if" statement with a uniform condition to have a varying
+        condition).  This is likely only useful for measuring the impact of
+        uniform control flow. */
+    bool disableUniformControlFlow;
+
+    /** Disables the backend optimizations related to gather/scatter
+        (e.g. transforming gather from sequential locations to an unaligned
+        load, etc.)  This is likely only useful for measuring the impact of
+        these optimizations. */
+    bool disableGatherScatterOptimizations;
+
+    /** Disables the optimization that demotes masked stores to regular
+        stores when the store is happening at the same control flow level
+        where the variable was declared.  This is likely only useful for
+        measuring the impact of this optimization. */
+    bool disableMaskedStoreToStore;
+
+    /** Disables the optimization that detects when the execution mask is
+        all on and emits code for gathers and scatters that doesn't loop
+        over the SIMD lanes but just does the scalar loads and stores
+        directly. */
+    bool disableGatherScatterFlattening;
+
+    /** Disables the optimizations that detect when arrays are being
+        indexed with 'uniform' values and issue scalar loads/stores rather
+        than gathers/scatters.  This is likely only useful for measuring
+        the impact of this optimization. */
+    bool disableUniformMemoryOptimizations;
+
+    /** Disables optimizations for masked stores: masked stores with the
+        mask all on are transformed to regular stores, and masked stores
+        with the mask all off are removed (which in turn can allow
+        eliminating additional dead code related to computing the value
+        stored).  This is likely only useful for measuring the impact of
+        this optimization. */
+    bool disableMaskedStoreOptimizations;
+};
+
+/** @brief This structure collects together a number of global variables. 
+
+    This structure collects a number of global variables that mostly
+    represent parameter settings for this compilation run.  In particular,
+    none of these values should change after compilation begins; their
+    values are all set during command-line argument processing or very
+    early during the compiler's execution, before any files are parsed.
+  */
+struct Globals {
+    Globals();  // sets the defaults, allocates ctx, and records the working directory (see ispc.cpp)
+
+    /** Optimization option settings */
+    Opt opt;
+    /** Compilation target information */
+    Target target;
+
+    /** There are a number of math libraries that can be used for
+        transcendentals and the like during program compilation. */
+    enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System };
+    MathLib mathLib;
+
+    /** Records whether the ispc standard library should be made available
+        to the program during compilations. (Default is true.) */
+    bool includeStdlib;
+
+    /** Indicates whether the C pre-processor should be run over the
+        program source before compiling it.  (Default is true.) */
+    bool runCPP;
+
+    /** When \c true, voluminous debugging output will be printed during
+        ispc's execution. */
+    bool debugPrint;
+
+    /** Indicates whether all warning messages should be suppressed. */
+    bool disableWarnings;
+
+    /** Indicates whether additional warnings should be issued about
+        possible performance pitfalls. */
+    bool emitPerfWarnings;
+
+    /** Indicates whether calls should be emitted in the program to an
+        externally-defined program instrumentation function. (See the
+        "Instrumenting your ispc programs" section in the user's
+        manual.) */
+    bool emitInstrumentation; 
+
+    /** Indicates whether ispc should generate debugging symbols for the
+        program in its output. */
+    bool generateDebuggingSymbols;
+
+    /** Global LLVMContext object */
+    llvm::LLVMContext *ctx;
+
+    /** Current working directory when the ispc compiler starts
+        execution. */
+    char currentDirectory[1024];  // filled by getcwd()/_getcwd() in the constructor
+
+    /** Arguments to pass along to the C pre-processor, if it is run on the
+        program before compilation. */
+    std::vector<std::string> cppArgs;
+};
+
+extern Globals *g;  // defined in ispc.cpp
+extern Module *m;   // defined in ispc.cpp
+
+#endif // ISPC_H
--- a/ispc.sln
+++ b/ispc.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ispc", "ispc.vcxproj", "{9861F490-F516-480C-B63C-D62A77AFA9D5}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ispc_test", "ispc_test.vcxproj", "{92547BA8-BE86-4E78-8799-1D72A70E5831}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{9861F490-F516-480C-B63C-D62A77AFA9D5}.Debug|Win32.ActiveCfg = Debug|Win32
+		{9861F490-F516-480C-B63C-D62A77AFA9D5}.Debug|Win32.Build.0 = Debug|Win32
+		{9861F490-F516-480C-B63C-D62A77AFA9D5}.Release|Win32.ActiveCfg = Release|Win32
+		{9861F490-F516-480C-B63C-D62A77AFA9D5}.Release|Win32.Build.0 = Release|Win32
+		{92547BA8-BE86-4E78-8799-1D72A70E5831}.Debug|Win32.ActiveCfg = Debug|Win32
+		{92547BA8-BE86-4E78-8799-1D72A70E5831}.Debug|Win32.Build.0 = Debug|Win32
+		{92547BA8-BE86-4E78-8799-1D72A70E5831}.Release|Win32.ActiveCfg = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -0,0 +1,216 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="builtins.cpp" />
+    <ClCompile Include="ctx.cpp" />
+    <ClCompile Include="decl.cpp" />
+    <ClCompile Include="expr.cpp" />
+    <ClCompile Include="gen-bitcode-avx.cpp" />
+    <ClCompile Include="gen-bitcode-c.cpp" />
+    <ClCompile Include="gen-bitcode-sse2.cpp" />
+    <ClCompile Include="gen-bitcode-sse4.cpp" />
+    <ClCompile Include="gen-bitcode-sse4x2.cpp" />
+    <ClCompile Include="gen-stdlib.cpp" />
+    <ClCompile Include="ispc.cpp" />
+    <ClCompile Include="lex.cc" />
+    <ClCompile Include="llvmutil.cpp" />
+    <ClCompile Include="module.cpp" />
+    <ClCompile Include="main.cpp" />
+    <ClCompile Include="opt.cpp" />
+    <ClCompile Include="parse.cc" />
+    <CustomBuild Include="stdlib-c.c">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c &gt; gen-bitcode-c.cpp</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang stdlib-c.c</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c &gt; gen-bitcode-c.cpp</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang stdlib-c.c</Message>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c.cpp</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c.cpp</Outputs>
+    </CustomBuild>
+    <ClCompile Include="stmt.cpp" />
+    <ClCompile Include="sym.cpp" />
+    <ClCompile Include="type.cpp" />
+    <ClCompile Include="util.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="builtins.h" />
+    <ClInclude Include="ctx.h" />
+    <ClInclude Include="decl.h" />
+    <ClInclude Include="expr.h" />
+    <ClInclude Include="ispc.h" />
+    <ClInclude Include="llvmutil.h" />
+    <ClInclude Include="module.h" />
+    <ClInclude Include="opt.h" />
+    <ClInclude Include="stmt.h" />
+    <ClInclude Include="sym.h" />
+    <ClInclude Include="type.h" />
+    <ClInclude Include="util.h" />
+    <ClInclude Include="winstuff\unistd.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="stdlib.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="stdlib-sse4.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="stdlib-sse4x2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll &gt; gen-bitcode-sse4x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll &gt; gen-bitcode-sse4x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="stdlib-sse2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="stdlib-avx.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll &gt; gen-bitcode-avx.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll &gt; gen-bitcode-avx.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="lex.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">flex -t lex.ll &gt; lex.cc</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">lex.cc</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">flex -t lex.ll &gt; lex.cc</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">lex.cc</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc.h;decl.h;parse.hh;sym.h</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc.h;decl.h;parse.hh;sym.h</AdditionalInputs>
+    </CustomBuild>
+    <CustomBuild Include="parse.yy">
+      <FileType>Document</FileType> <!-- bison -d with -o parse.cc writes the header as parse.hh (the name lex.ll lists as an input) -->
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">bison -d -v -t -o parse.cc parse.yy</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">parse.cc;parse.hh</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">bison -d -v -t -o parse.cc parse.yy</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">parse.cc;parse.hh</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc.h;type.h;decl.h;expr.h;sym.h;stmt.h</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc.h;type.h;decl.h;expr.h;sym.h;stmt.h</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Running bison on parse.yy</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Running bison on parse.yy</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{9861F490-F516-480C-B63C-D62A77AFA9D5}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>ispc</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>NOMINMAX</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>NOMINMAX</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/ispc_test.cpp
+++ b/ispc_test.cpp
@@ -0,0 +1,313 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#define _CRT_SECURE_NO_WARNINGS
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef ISPC_HAVE_SVML
+#include <xmmintrin.h>
+extern "C" {
+    extern __m128 __svml_sinf4(__m128);
+    extern __m128 __svml_cosf4(__m128);
+    extern __m128 __svml_sincosf4(__m128 *,__m128);
+    extern __m128 __svml_tanf4(__m128);
+    extern __m128 __svml_atanf4(__m128);
+    extern __m128 __svml_atan2f4(__m128, __m128);
+    extern __m128 __svml_expf4(__m128);
+    extern __m128 __svml_logf4(__m128);
+    extern __m128 __svml_powf4(__m128, __m128);
+}
+#endif
+
+#include <llvm/LLVMContext.h>
+#include <llvm/Module.h>
+#include <llvm/Type.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/Instructions.h>
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
+#include <llvm/ExecutionEngine/JIT.h>
+#include <llvm/Target/TargetSelect.h>
+#include <llvm/Target/TargetOptions.h>
+#include <llvm/Target/TargetData.h>
+#include <llvm/Transforms/Scalar.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/PassManager.h>
+#include <llvm/Support/CFG.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Assembly/PrintModulePass.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Support/MemoryBuffer.h>
+#ifndef LLVM_2_8
+#include <llvm/Support/system_error.h>
+#endif
+
+extern "C" { 
+    void ISPCLaunch(void *, void *);
+    void ISPCSync();
+}
+
+/** Task-launch hook made available to the JIT-compiled test code.  The
+    task is run immediately on the calling thread: func is treated as a
+    function taking (void *, int, int) and is invoked with data, 0 and 1.
+ */
+void ISPCLaunch(void *func, void *data) {
+    typedef void (*TaskEntryPoint)(void *, int, int);
+    TaskEntryPoint entry = reinterpret_cast<TaskEntryPoint>(func);
+    (*entry)(data, 0, 1);
+}
+
+
+/** Task-synchronization hook made available to the JIT-compiled test code.
+    It does nothing: ISPCLaunch() in this file calls the task function
+    directly, so the task has already finished by the time it returns.
+ */
+void ISPCSync() {
+}
+
+/** Print the command-line synopsis to stderr and terminate the process
+    with exit status ret.
+ */
+static void usage(int ret) {
+    static const char *const helpLines[] = {
+        "usage: ispc_test\n",
+        "\t[-h/--help]\tprint help\n",
+        "\t<files>\n",
+    };
+    const size_t lineCount = sizeof(helpLines) / sizeof(helpLines[0]);
+    for (size_t line = 0; line < lineCount; ++line)
+        fputs(helpLines[line], stderr);
+    exit(ret);
+}
+
+/** Report that the running program called an SVML routine that is not
+    available, then terminate the process with exit status 1.
+ */
+static void svml_missing() {
+    fputs("Program called unavailable SVML function!\n", stderr);
+    exit(1);
+}
+
+/** Run a single test: load the LLVM bitcode file named by fn, JIT-compile
+    it, call its test entry point and compare what it wrote against the
+    expected values.
+
+    The module must define width(), which reports the vector width (4, 8,
+    12 or 16), and one of the entry points f_v, f_f, f_fu, f_fi, f_du,
+    f_duf or f_di.  If it also defines result(), the values that function
+    writes are the expected output; otherwise the returned values are just
+    printed.
+
+    @param fn  Name of the bitcode file to run.
+    @return    true only if an entry point was found, a result() function
+               was present, and the first width returned values matched
+               the expected ones exactly.  Every failure is reported on
+               stderr.  All LLVM objects created here are released on
+               every return path.
+ */
+static bool lRunTest(const char *fn) {
+    llvm::LLVMContext *ctx = new llvm::LLVMContext;
+
+#ifdef LLVM_2_8
+    std::string err;
+    llvm::MemoryBuffer *buf = llvm::MemoryBuffer::getFileOrSTDIN(fn, &err);
+    if (!buf) {
+        fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.c_str());
+        delete ctx;
+        return false;
+    }
+    std::string bcErr;
+    llvm::Module *module = llvm::ParseBitcodeFile(buf, *ctx, &bcErr);
+    // ParseBitcodeFile() never takes ownership of the buffer and reads the
+    // whole module eagerly, so the buffer can (and must) be freed here.
+    delete buf;
+#else
+    llvm::OwningPtr<llvm::MemoryBuffer> buf;
+    llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
+    if (err) {
+        fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.message().c_str());
+        delete ctx;
+        return false;
+    }
+    std::string bcErr;
+    llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
+#endif
+
+    if (!module) {
+        fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
+        delete ctx;
+        return false;
+    }
+
+    std::string eeError;
+    llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError);
+    if (!ee) {
+        fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str());
+        // The engine only takes ownership of the module when it is created
+        // successfully, so both the module and the context are still ours.
+        delete module;
+        delete ctx;
+        return false;
+    }
+
+    // From here on the execution engine owns the module; deleting the
+    // engine releases both.
+
+    // Resolve the external functions the test code may call to the
+    // implementations available in this process.
+    llvm::Function *func;
+    if ((func = module->getFunction("ISPCLaunch")) != NULL)
+        ee->addGlobalMapping(func, (void *)ISPCLaunch);
+    if ((func = module->getFunction("ISPCSync")) != NULL)
+        ee->addGlobalMapping(func, (void *)ISPCSync);
+    if ((func = module->getFunction("putchar")) != NULL)
+        ee->addGlobalMapping(func, (void *)putchar);
+    if ((func = module->getFunction("printf")) != NULL)
+        ee->addGlobalMapping(func, (void *)printf);
+    if ((func = module->getFunction("fflush")) != NULL)
+        ee->addGlobalMapping(func, (void *)fflush);
+    if ((func = module->getFunction("sinf")) != NULL)
+        ee->addGlobalMapping(func, (void *)sinf);
+    if ((func = module->getFunction("cosf")) != NULL)
+        ee->addGlobalMapping(func, (void *)cosf);
+    if ((func = module->getFunction("tanf")) != NULL)
+        ee->addGlobalMapping(func, (void *)tanf);
+    if ((func = module->getFunction("atanf")) != NULL)
+        ee->addGlobalMapping(func, (void *)atanf);
+    if ((func = module->getFunction("atan2f")) != NULL)
+        ee->addGlobalMapping(func, (void *)atan2f);
+    if ((func = module->getFunction("powf")) != NULL)
+        ee->addGlobalMapping(func, (void *)powf);
+    if ((func = module->getFunction("expf")) != NULL)
+        ee->addGlobalMapping(func, (void *)expf);
+    if ((func = module->getFunction("logf")) != NULL)
+        ee->addGlobalMapping(func, (void *)logf);
+
+    // SVML entry points map to the real library when it is available and
+    // to a stub that reports the problem and exits when it is not.
+#ifdef ISPC_HAVE_SVML
+#define DO_SVML(FUNC ,FUNCNAME)                           \
+    if ((func = module->getFunction(FUNCNAME)) != NULL)   \
+        ee->addGlobalMapping(func, (void *)FUNC)
+#else
+#define DO_SVML(FUNC, FUNCNAME)                                         \
+    if ((func = module->getFunction(FUNCNAME)) != NULL)                 \
+        ee->addGlobalMapping(func, (void *)svml_missing)
+#endif
+
+    DO_SVML(__svml_sinf4, "__svml_sinf4");
+    DO_SVML(__svml_cosf4, "__svml_cosf4");
+    DO_SVML(__svml_sincosf4, "__svml_sincosf4");
+    DO_SVML(__svml_tanf4, "__svml_tanf4");
+    DO_SVML(__svml_atanf4, "__svml_atanf4");
+    DO_SVML(__svml_atan2f4, "__svml_atan2f4");
+    DO_SVML(__svml_expf4, "__svml_expf4");
+    DO_SVML(__svml_logf4, "__svml_logf4");
+    DO_SVML(__svml_powf4, "__svml_powf4");
+
+    // figure out the vector width in the compiled code
+    func = module->getFunction("width");
+    if (!func) {
+        fprintf(stderr, "No width() function found!\n");
+        delete ee;
+        delete ctx;
+        return false;
+    }
+    int width;
+    {
+        typedef int (*PFN)();
+        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
+        width = pfn();
+    }
+    // Check the width at run time rather than with assert(): the loops
+    // below index 16-element arrays with it, and an assert would be
+    // compiled out of NDEBUG builds.
+    if (width != 4 && width != 8 && width != 12 && width != 16) {
+        fprintf(stderr, "Unexpected vector width %d reported by \"%s\"\n", width, fn);
+        delete ee;
+        delete ctx;
+        return false;
+    }
+
+    // find the value that returns the desired result
+    func = module->getFunction("result");
+    bool foundResult = (func != NULL);
+    float result[16];
+    for (int i = 0; i < 16; ++i)
+        result[i] = 0;
+    bool ok = true;
+    if (foundResult) {
+        typedef void (*PFN)(float *);
+        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
+        pfn(result);
+    }
+    else
+        fprintf(stderr, "Warning: no result() function found.\n");
+
+    // try to find a function to run; the first entry point that exists is
+    // called with the fixed inputs matching its signature
+    float returned[16];
+    for (int i = 0; i < 16; ++i)
+        returned[i] = 0;
+    float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
+    double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
+    int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
+    int vint2[16] = { 5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
+
+    if ((func = module->getFunction("f_v")) != NULL) {
+        typedef void (*PFN)(float *);
+        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
+        pfn(returned);
+    }
+    else if ((func = module->getFunction("f_f")) != NULL) {
+        typedef void (*PFN)(float *, float *);
+        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
+        llvm::verifyFunction(*func);
+        pfn(returned, vfloat);
+    }
+    else if ((func = module->getFunction("f_fu")) != NULL) {
+        typedef void (*PFN)(float *, float *, float fu);
+        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
+        llvm::verifyFunction(*func);
+        pfn(returned, vfloat, 5.);
+    }
+    else if ((func = module->getFunction("f_fi")) != NULL) {
+        typedef void (*PFN)(float *, float *, int *);
+        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
+        pfn(returned, vfloat, vint);
+    }
+    else if ((func = module->getFunction("f_du")) != NULL) {
+        typedef void (*PFN)(float *, double *, double);
+        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
+        pfn(returned, vdouble, 5.);
+    }
+    else if ((func = module->getFunction("f_duf")) != NULL) {
+        typedef void (*PFN)(float *, double *, float);
+        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
+        pfn(returned, vdouble, 5.f);
+    }
+    else if ((func = module->getFunction("f_di")) != NULL) {
+        typedef void (*PFN)(float *, double *, int *);
+        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
+        pfn(returned, vdouble, vint2);
+    }
+    else {
+        fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn);
+        ok = false;
+    }
+
+    // see if we got the right result
+    if (ok) {
+        if (foundResult) {
+            for (int i = 0; i < width; ++i)
+                if (returned[i] != result[i]) {
+                    ok = false;
+                    fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
+                            fn, i, returned[i], returned[i], result[i], result[i]);
+                }
+        }
+        else {
+            for (int i = 0; i < width; ++i)
+                fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
+                        fn, i, returned[i], returned[i]);
+        }
+    }
+
+    delete ee;
+    delete ctx;
+
+    return ok && foundResult;
+}
+
+/** Entry point: treat every command-line argument other than -h/--help as
+    a bitcode file to test, run each one through lRunTest(), and print a
+    pass count if anything failed.  The exit status is 1 when at least one
+    test failed and 0 otherwise.
+ */
+int main(int argc, char *argv[]) {
+    llvm::InitializeNativeTarget();
+
+    // Gather the test files; a help flag prints the usage text and exits.
+    std::vector<const char *> files;
+    for (int argIndex = 1; argIndex < argc; ++argIndex) {
+        const char *arg = argv[argIndex];
+        const bool wantsHelp = (strcmp(arg, "--help") == 0 || strcmp(arg, "-h") == 0);
+        if (wantsHelp)
+            usage(0);
+        else
+            files.push_back(arg);
+    }
+
+    // Run the tests in the order they were given, tallying the outcomes.
+    int passes = 0, fails = 0;
+    for (std::vector<const char *>::const_iterator file = files.begin();
+         file != files.end(); ++file) {
+        if (lRunTest(*file))
+            ++passes;
+        else
+            ++fails;
+    }
+
+    const bool anyFailed = (fails > 0);
+    if (anyFailed)
+        fprintf(stderr, "%d/%d tests passed\n", passes, passes+fails);
+    return anyFailed ? 1 : 0;
+}
--- a/ispc_test.vcxproj
+++ b/ispc_test.vcxproj
@@ -0,0 +1,88 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="ispc_test.cpp" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{92547BA8-BE86-4E78-8799-1D72A70E5831}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>ispc_test</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <!-- Debug|Win32 build settings for ispc_test: unoptimized, with debug information. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <!-- LLVM headers are found through $(LLVM_INSTALL_DIR). -->
+      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
+      <!-- LLVM static libraries linked into the test harness; the list includes LLVMJIT.lib. -->
+      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <!-- Release|Win32 build settings for ispc_test: optimized for speed, debug information still generated. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <!-- NDEBUG is defined here, so assert() calls compile to nothing in this configuration. -->
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <!-- LLVM headers are found through $(LLVM_INSTALL_DIR). -->
+      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
+      <!-- LLVM static libraries linked into the test harness; the list includes LLVMJIT.lib. -->
+      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/lex.ll
+++ b/lex.ll
@@ -0,0 +1,426 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+%{
+
+#include "ispc.h"
+#include "decl.h"
+#include "parse.hh"
+#include "sym.h"
+#include "util.h"
+#include "module.h"
+
+static uint32_t lParseBinary(const char *ptr, SourcePos pos);
+static void lCComment(SourcePos *);
+static void lCppComment(SourcePos *);
+static void lHandleCppHash(SourcePos *);
+static void lStringConst(YYSTYPE *, SourcePos *);
+
+/* Position bookkeeping hook (flex runs YY_USER_ACTION before each rule's
+   action): the new token starts where the previous one ended, and its end
+   column is advanced by the length of the matched text.  Line changes are
+   handled separately by the rules and helpers that consume newlines. */
+#define YY_USER_ACTION \
+    yylloc->first_line = yylloc->last_line; \
+    yylloc->first_column = yylloc->last_column; \
+    yylloc->last_column += yyleng;
+
+#ifdef ISPC_IS_WINDOWS
+/* Stub that always reports "not a terminal" on Windows builds.
+   NOTE(review): presumably needed because "%option nounistd" below removes
+   the <unistd.h> include that would otherwise declare isatty() for the
+   generated scanner; confirm. */
+inline int isatty(int) { return 0; }
+#endif // ISPC_IS_WINDOWS
+
+%}
+
+%option nounput
+%option noyywrap
+%option bison-bridge
+%option bison-locations
+%option nounistd
+
+WHITESPACE [ \t\r]+
+INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))
+FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)|([-]?0x[01]\.?[0-9a-fA-F]+p[-+]?[0-9]+[fF]?)
+
+IDENT [a-zA-Z_][a-zA-Z_0-9]*
+
+%%
+"/*"            { lCComment(yylloc); }
+"//"            { lCppComment(yylloc); }
+
+bool { return TOKEN_BOOL; }
+break { return TOKEN_BREAK; }
+case { return TOKEN_CASE; }
+cbreak { return TOKEN_CBREAK; }
+ccontinue { return TOKEN_CCONTINUE; }
+cdo { return TOKEN_CDO; }
+cfor { return TOKEN_CFOR; }
+char { return TOKEN_CHAR; }
+cif { return TOKEN_CIF; }
+cwhile { return TOKEN_CWHILE; }
+const { return TOKEN_CONST; }
+continue { return TOKEN_CONTINUE; }
+creturn { return TOKEN_CRETURN; }
+default { return TOKEN_DEFAULT; }
+do { return TOKEN_DO; }
+double { return TOKEN_DOUBLE; }
+else { return TOKEN_ELSE; }
+enum { return TOKEN_ENUM; }
+export { return TOKEN_EXPORT; }
+extern { return TOKEN_EXTERN; }
+false { return TOKEN_FALSE; }
+float { return TOKEN_FLOAT; }
+for { return TOKEN_FOR; }
+goto { return TOKEN_GOTO; }
+if { return TOKEN_IF; }
+inline { return TOKEN_INLINE; }
+int { return TOKEN_INT; }
+int32 { return TOKEN_INT; }
+int64 { return TOKEN_INT64; }
+launch { return TOKEN_LAUNCH; }
+print { return TOKEN_PRINT; }
+reference { return TOKEN_REFERENCE; }
+return { return TOKEN_RETURN; }
+soa { return TOKEN_SOA; }
+static { return TOKEN_STATIC; }
+struct { return TOKEN_STRUCT; }
+switch { return TOKEN_SWITCH; }
+sync { return TOKEN_SYNC; }
+task { return TOKEN_TASK; }
+true { return TOKEN_TRUE; }
+typedef { return TOKEN_TYPEDEF; }
+uniform { return TOKEN_UNIFORM; }
+unsigned { return TOKEN_UNSIGNED; }
+varying { return TOKEN_VARYING; }
+void { return TOKEN_VOID; }
+while { return TOKEN_WHILE; }
+
+L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }
+
+{IDENT} {
+    /* The lexeme could name either a type or an ordinary identifier; the
+       symbol table decides which token is returned.  Its text is handed to
+       the parser through a freshly allocated string in both cases. */
+    yylval->stringVal = new std::string(yytext);
+    const bool namesType = (m->symbolTable->LookupType(yytext) != NULL);
+    if (namesType)
+        return TOKEN_TYPE_NAME;
+    return TOKEN_IDENTIFIER;
+}
+
+{INT_NUMBER} {
+    /* Signed integer constant: decimal, hexadecimal (0x...) or binary
+       (0b...).  The text is parsed into a 64-bit value on every platform
+       so that constants too large for 32 bits can be detected and
+       reported; a 32-bit "unsigned long" (as on Windows) would silently
+       saturate and hide the overflow. */
+    char *endPtr = NULL;
+    unsigned long long val;
+
+    if (yytext[0] == '0' && yytext[1] == 'b')
+        val = lParseBinary(yytext+2, *yylloc);
+    else {
+#ifdef ISPC_IS_WINDOWS
+        /* The Windows C runtime spells the 64-bit parser differently. */
+        val = _strtoui64(yytext, &endPtr, 0);
+#else
+        val = strtoull(yytext, &endPtr, 0);
+#endif
+    }
+    yylval->int32Val = (int32_t)val;
+    /* Warn if the value did not survive the round trip through 32 bits. */
+    if (val != (unsigned int)yylval->int32Val)
+        Warning(*yylloc, "32-bit integer has insufficient bits to represent value %s (%x %llx)",
+                yytext, yylval->int32Val, (unsigned long long)val);
+    return TOKEN_INT_CONSTANT;
+}
+
+{INT_NUMBER}[uU] {
+    /* Unsigned integer constant: decimal, hexadecimal (0x...) or binary
+       (0b...) digits followed by a 'u' or 'U' suffix.  The text is parsed
+       into a 64-bit value on every platform so that constants too large
+       for 32 bits can be detected and reported; a 32-bit "unsigned long"
+       (as on Windows) would silently saturate and hide the overflow. */
+    char *endPtr = NULL;
+    unsigned long long val;
+
+    if (yytext[0] == '0' && yytext[1] == 'b') {
+        /* Hand lParseBinary() only the binary digits: skip the "0b"
+           prefix and drop the one-character suffix, which is not a
+           binary digit. */
+        std::string digits(yytext + 2, yytext + yyleng - 1);
+        val = lParseBinary(digits.c_str(), *yylloc);
+    }
+    else {
+        /* Both parsers stop at the suffix character by themselves. */
+#ifdef ISPC_IS_WINDOWS
+        /* The Windows C runtime spells the 64-bit parser differently. */
+        val = _strtoui64(yytext, &endPtr, 0);
+#else
+        val = strtoull(yytext, &endPtr, 0);
+#endif
+    }
+
+    yylval->int32Val = (int32_t)val;
+    /* Warn if the value did not survive the round trip through 32 bits. */
+    if (val != (unsigned int)yylval->int32Val)
+        Warning(*yylloc, "32-bit integer has insufficient bits to represent value %s (%x %llx)",
+                yytext, yylval->int32Val, (unsigned long long)val);
+    return TOKEN_UINT_CONSTANT;
+}
+
+{FLOAT_NUMBER} { 
+    /* Floating-point constant: the matched text is converted with atof()
+       and passed to the parser in floatVal.
+       FIXME: need to implement a hex float constant parser so that we can
+       support them on Windows (which doesn't handle them in its atof()
+       implementation). */
+    yylval->floatVal = atof(yytext); 
+    return TOKEN_FLOAT_CONSTANT; 
+}
+
+"++" { return TOKEN_INC_OP; }
+"--" { return TOKEN_DEC_OP; }
+"<<" { return TOKEN_LEFT_OP; }
+">>" { return TOKEN_RIGHT_OP; }
+"<=" { return TOKEN_LE_OP; }
+">=" { return TOKEN_GE_OP; }
+"==" { return TOKEN_EQ_OP; }
+"!=" { return TOKEN_NE_OP; }
+"&&" { return TOKEN_AND_OP; }
+"||" { return TOKEN_OR_OP; }
+"*=" { return TOKEN_MUL_ASSIGN; }
+"/=" { return TOKEN_DIV_ASSIGN; }
+"%=" { return TOKEN_MOD_ASSIGN; }
+"+=" { return TOKEN_ADD_ASSIGN; }
+"-=" { return TOKEN_SUB_ASSIGN; }
+"<<=" { return TOKEN_LEFT_ASSIGN; }
+">>=" { return TOKEN_RIGHT_ASSIGN; }
+"&=" { return TOKEN_AND_ASSIGN; }
+"^=" { return TOKEN_XOR_ASSIGN; }
+"|=" { return TOKEN_OR_ASSIGN; }
+";"             { return ';'; }
+("{"|"<%")      { return '{'; }
+("}"|"%>")      { return '}'; }
+","             { return ','; }
+":"             { return ':'; }
+"="             { return '='; }
+"("             { return '('; }
+")"             { return ')'; }
+("["|"<:")      { return '['; }
+("]"|":>")      { return ']'; }
+"."             { return '.'; }
+"&"             { return '&'; }
+"!"             { return '!'; }
+"~"             { return '~'; }
+"-"             { return '-'; }
+"+"             { return '+'; }
+"*"             { return '*'; }
+"/"             { return '/'; }
+"%"             { return '%'; }
+"<"             { return '<'; }
+">"             { return '>'; }
+"^"             { return '^'; }
+"|"             { return '|'; }
+"?"             { return '?'; }
+
+{WHITESPACE} { }
+
+\n {
+    /* End of line: move to the next line and restart the column count. */
+    yylloc->last_line++; 
+    yylloc->last_column = 1; 
+}
+
+#(line)?[ ][0-9]+[ ]\"(\\.|[^\\"])*\"[^\n]* { 
+    /* Line marker of the form "# 123 "file"" or "#line 123 "file"";
+       lHandleCppHash() updates the current position from it. */
+    lHandleCppHash(yylloc); 
+}
+
+. {
+    /* Any character that no other rule matched is reported as an error. */
+    Error(*yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0]));
+    /* NOTE(review): YY_USER_ACTION has already been applied to this match
+       before the action started, so invoking it again here looks like it
+       advances the column a second time - confirm that this is intended. */
+    YY_USER_ACTION 
+}
+
+%%
+
+/*sizeof { return TOKEN_SIZEOF; }*/
+/*"->" { return TOKEN_PTR_OP; }*/
+/*short { return TOKEN_SHORT; }*/
+/*long { return TOKEN_LONG; }*/
+/*signed { return TOKEN_SIGNED; }*/
+/*volatile { return TOKEN_VOLATILE; }*/
+/*"long"[ \t\v\f\n]+"long" { return TOKEN_LONGLONG; }*/
+/*union { return TOKEN_UNION; }*/
+/*"..." { return TOKEN_ELLIPSIS; }*/
+
+/** Return the integer value of a binary constant given as a string.
+
+    @param ptr  Points at the first binary digit, i.e. just past the "0b"
+                prefix.  The digits may be followed by a single 'u' or 'U'
+                suffix, which is ignored.
+    @param pos  Source position used if a warning has to be issued.
+    @return     The value of the digits.  If there are more than 32
+                significant bits, one warning is issued and only the low
+                32 bits are kept.
+ */
+static uint32_t
+lParseBinary(const char *ptr, SourcePos pos) {
+    uint32_t val = 0;
+    bool warned = false;
+
+    while (*ptr == '0' || *ptr == '1') {
+        // Test the top bit with an unsigned constant: shifting a signed 1
+        // left by 31 bits overflows int.
+        if ((val & (1u << 31)) != 0 && !warned) {
+            // We're about to shift out a set bit
+            // FIXME: 64-bit int constants...
+            Warning(pos, "Can't represent binary constant with 32-bit integer type");
+            warned = true;
+        }
+
+        val = (val << 1) | (uint32_t)(*ptr - '0');
+        ++ptr;
+    }
+
+    /* Only the end of the string or a lone unsigned suffix may follow the
+       digits; if this hits, the regexp for 0b... constants is broken. */
+    assert(*ptr == '\0' || ((*ptr == 'u' || *ptr == 'U') && ptr[1] == '\0'));
+    return val;
+}
+
+
+/** Handle a C-style comment in the source.  The opening delimiter has
+    already been matched by the scanner; this consumes input up to and
+    including the closing delimiter.  Each newline inside the comment
+    advances pos to the start of the next line.  If the input ends before
+    the comment is closed, an "unterminated comment" error is reported.
+ */
+static void
+lCComment(SourcePos *pos) {
+    // yyinput() returns an int.  Depending on the flex version, end of
+    // input is signalled with either 0 or EOF, so the value is kept in an
+    // int and both are checked; with a char, EOF would go unnoticed and
+    // the loop would not stop on an unterminated comment.
+    int c, prev = 0;
+
+    while ((c = yyinput()) != 0 && c != EOF) {
+        if (c == '\n') {
+            pos->last_line++;
+            pos->last_column = 1;
+        }
+        if (c == '/' && prev == '*')
+            return;
+        prev = c;
+    }
+    Error(*pos, "unterminated comment");
+}
+
+/** Handle a C++-style comment--eat everything up until the end of the
+    line (or the end of the input, if the comment is on the last line and
+    no newline follows it).  When a newline is consumed, pos is advanced
+    to the start of the next line.
+ */
+static void
+lCppComment(SourcePos *pos) {
+    // yyinput() returns an int.  Depending on the flex version, end of
+    // input is signalled with either 0 or EOF, so both are checked; with a
+    // char, EOF would go unnoticed and the loop would not stop.
+    int c;
+    do {
+        c = yyinput();
+    } while (c != 0 && c != EOF && c != '\n');
+    if (c == '\n') {
+        pos->last_line++;
+        pos->last_column = 1;
+    }
+}
+
+/** Process a preprocessor line marker ("# 1234 "foo.c"" or
+    "#line 1234 "foo.c""), which records the original file and line that
+    the following source text came from.  The position in pos is updated to
+    match: the line is set to one less than the number in the marker, the
+    column is reset to 1, and the name is replaced with a newly allocated
+    copy of the quoted file name.
+ */
+static void lHandleCppHash(SourcePos *pos) {
+    assert(yytext[0] == '#');
+
+    // Find the start of the line number, which depends on the marker style.
+    char *number;
+    if (yytext[1] != ' ') {
+        // cl.exe's preprocessor on Windows writes: #line 1234 "foo.c"
+        assert(!strncmp(yytext+1, "line ", 5));
+        number = yytext + 6;
+    }
+    else {
+        // The preprocessor on Linux/OSX writes: # 1234 "foo.c"
+        number = yytext + 2;
+    }
+
+    // Read the line number; afterNumber is left on the first character
+    // that is not part of it.
+    char *afterNumber = NULL;
+    const long markerLine = strtol(number, &afterNumber, 10);
+    pos->last_line = markerLine - 1;
+    pos->last_column = 1;
+
+    // A single space and an opening quote must come next.
+    assert(afterNumber != number && afterNumber[0] == ' ' && afterNumber[1] == '"');
+
+    // The file name runs from just after the opening quote to the next
+    // quote character.
+    const char *nameBegin = afterNumber + 2;
+    const char *nameEnd = nameBegin;
+    for (; *nameEnd != '"'; ++nameEnd)
+        assert(*nameEnd && *nameEnd != '\n');
+
+    const std::string filename(nameBegin, nameEnd);
+    pos->name = strdup(filename.c_str());
+}
+
+
+/** Given a pointer to a position in a string, decode the character found
+    there, accounting for the escape sequences supported in string
+    constants.  (For example, the two-character sequence backslash-n
+    decodes to a newline, and two backslashes decode to one backslash.)
+
+    Supported escapes are the single-character ones handled below, octal
+    escapes of one to three digits (\012) and hexadecimal escapes (\xff).
+    Only the low eight bits of a numeric escape are kept.
+
+    @param str    Position of the character or escape sequence to decode.
+    @param pChar  Receives the decoded character.
+    @param pos    Source position used when reporting a bad escape.
+    @return       The position just past the consumed character or escape
+                  sequence.
+*/
+static char *
+lEscapeChar(char *str, char *pChar, SourcePos *pos)
+{
+    // Ordinary character: it stands for itself.
+    if (*str != '\\') {
+        *pChar = *str;
+        return str + 1;
+    }
+
+    ++str;  // step over the backslash; *str now selects the escape
+    switch (*str) {
+    case '\'': *pChar = '\''; break;
+    case '\"': *pChar = '\"'; break;
+    case '?':  *pChar = '\?'; break;
+    case '\\': *pChar = '\\'; break;
+    case 'a':  *pChar = '\a'; break;
+    case 'b':  *pChar = '\b'; break;
+    case 'f':  *pChar = '\f'; break;
+    case 'n':  *pChar = '\n'; break;
+    case 'r':  *pChar = '\r'; break;
+    case 't':  *pChar = '\t'; break;
+    case 'v':  *pChar = '\v'; break;
+    // octal constants \012: at most three digits belong to the escape, as
+    // in C, so that digits following it remain ordinary characters
+    case '0': case '1': case '2': case '3': case '4':
+    case '5': case '6': case '7': {
+        int value = 0;
+        for (int nDigits = 0; nDigits < 3 && *str >= '0' && *str <= '7';
+             ++nDigits, ++str)
+            value = 8 * value + (*str - '0');
+        *pChar = (char)(value & 0xff);
+        return str;
+    }
+    // hexadecimal constant \xff: the digits start after the 'x'
+    case 'x': {
+        char *digit = str + 1;
+        int value = 0;
+        for (;; ++digit) {
+            int d;
+            if (*digit >= '0' && *digit <= '9')
+                d = *digit - '0';
+            else if (*digit >= 'a' && *digit <= 'f')
+                d = 10 + (*digit - 'a');
+            else if (*digit >= 'A' && *digit <= 'F')
+                d = 10 + (*digit - 'A');
+            else
+                break;
+            value = ((value << 4) | d) & 0xff;
+        }
+        if (digit == str + 1) {
+            // No digits at all: report it and keep the 'x' itself so the
+            // caller always receives a defined character.
+            Error(*pos, "Bad character escape sequence: '%s'\n.", str);
+            *pChar = *str;
+        }
+        else
+            *pChar = (char)value;
+        return digit;
+    }
+    default:
+        Error(*pos, "Bad character escape sequence: '%s'\n.", str);
+        // Fall back to the escaped character itself so that the caller
+        // never reads an uninitialized value.
+        *pChar = *str;
+        break;
+    }
+
+    // Single-character escape: step over the selector character.
+    return str + 1;
+}
+
+
+/** Decode the string constant that the scanner has just matched.
+    Starting right after the first double quote in the matched text, each
+    character or escape sequence is decoded with lEscapeChar() until the
+    closing quote is reached.  The decoded text is passed to the parser in
+    yylval->stringVal as a newly allocated std::string.
+*/
+static void
+lStringConst(YYSTYPE *yylval, SourcePos *pos)
+{
+    std::string decoded;
+    for (char *cursor = strchr(yytext, '"') + 1; *cursor != '\"'; ) {
+        char ch;
+        cursor = lEscapeChar(cursor, &ch, pos);
+        decoded.push_back(ch);
+    }
+    yylval->stringVal = new std::string(decoded);
+}
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -0,0 +1,329 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file llvmutil.cpp
+    @brief Implementations of various LLVM utility types and classes.
+*/
+
+#include "llvmutil.h"
+#include "type.h"
+
+// Storage for the static members of LLVMTypes.  Everything starts out
+// NULL and is given its real value by InitLLVMUtil().
+
+// Scalar types and pointers to them
+const llvm::Type *LLVMTypes::VoidType = NULL;
+const llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
+const llvm::Type *LLVMTypes::BoolType = NULL;
+const llvm::Type *LLVMTypes::Int8Type = NULL;
+const llvm::Type *LLVMTypes::Int16Type = NULL;
+const llvm::Type *LLVMTypes::Int32Type = NULL;
+const llvm::Type *LLVMTypes::Int32PointerType = NULL;
+const llvm::Type *LLVMTypes::Int64Type = NULL;
+const llvm::Type *LLVMTypes::Int64PointerType = NULL;
+const llvm::Type *LLVMTypes::FloatType = NULL;
+const llvm::Type *LLVMTypes::FloatPointerType = NULL;
+const llvm::Type *LLVMTypes::DoubleType = NULL;
+
+// Vector types (sized by the target's vector width) and pointers to them
+const llvm::VectorType *LLVMTypes::MaskType = NULL;
+const llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
+const llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
+const llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
+const llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
+const llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
+const llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
+const llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
+const llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
+const llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
+// Note: an array type rather than a vector type
+const llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
+
+// Commonly used constants; also NULL until InitLLVMUtil() has run.
+llvm::Constant *LLVMTrue = NULL;
+llvm::Constant *LLVMFalse = NULL;
+llvm::Constant *LLVMMaskAllOn = NULL;
+llvm::Constant *LLVMMaskAllOff = NULL;
+
+
+/** Fills in every member of LLVMTypes as well as the LLVMTrue, LLVMFalse,
+    LLVMMaskAllOn and LLVMMaskAllOff constants, using the given LLVM
+    context.  All vector and array types get target.vectorWidth elements.
+ */
+void
+InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
+    llvm::LLVMContext &context = *ctx;
+    const int width = target.vectorWidth;
+
+    // Scalar types, plus pointers to some of them.  The "void pointer"
+    // type is a pointer to i8.
+    LLVMTypes::VoidType = llvm::Type::getVoidTy(context);
+    LLVMTypes::VoidPointerType =
+        llvm::PointerType::get(llvm::Type::getInt8Ty(context), 0);
+    LLVMTypes::BoolType = llvm::Type::getInt1Ty(context);
+    LLVMTypes::Int8Type = llvm::Type::getInt8Ty(context);
+    LLVMTypes::Int16Type = llvm::Type::getInt16Ty(context);
+    LLVMTypes::Int32Type = llvm::Type::getInt32Ty(context);
+    LLVMTypes::Int32PointerType =
+        llvm::PointerType::get(LLVMTypes::Int32Type, 0);
+    LLVMTypes::Int64Type = llvm::Type::getInt64Ty(context);
+    LLVMTypes::Int64PointerType =
+        llvm::PointerType::get(LLVMTypes::Int64Type, 0);
+    LLVMTypes::FloatType = llvm::Type::getFloatTy(context);
+    LLVMTypes::FloatPointerType =
+        llvm::PointerType::get(LLVMTypes::FloatType, 0);
+    LLVMTypes::DoubleType = llvm::Type::getDoubleTy(context);
+
+    // Both the bool vector and the mask are vectors of i32 rather than
+    // vectors of i1; LLVM ends up generating much better SSE code with
+    // this representation.
+    LLVMTypes::BoolVectorType =
+        llvm::VectorType::get(llvm::Type::getInt32Ty(context), width);
+    LLVMTypes::MaskType = LLVMTypes::BoolVectorType;
+
+    // Remaining vector types and the pointers to them
+    LLVMTypes::Int1VectorType =
+        llvm::VectorType::get(llvm::Type::getInt1Ty(context), width);
+    LLVMTypes::Int32VectorType =
+        llvm::VectorType::get(LLVMTypes::Int32Type, width);
+    LLVMTypes::Int32VectorPointerType =
+        llvm::PointerType::get(LLVMTypes::Int32VectorType, 0);
+    LLVMTypes::Int64VectorType =
+        llvm::VectorType::get(LLVMTypes::Int64Type, width);
+    LLVMTypes::Int64VectorPointerType =
+        llvm::PointerType::get(LLVMTypes::Int64VectorType, 0);
+    LLVMTypes::FloatVectorType =
+        llvm::VectorType::get(LLVMTypes::FloatType, width);
+    LLVMTypes::FloatVectorPointerType =
+        llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
+    LLVMTypes::DoubleVectorType =
+        llvm::VectorType::get(LLVMTypes::DoubleType, width);
+    // An array, not a vector, of void pointers
+    LLVMTypes::VoidPointerVectorType =
+        llvm::ArrayType::get(LLVMTypes::VoidPointerType, width);
+
+    LLVMTrue = llvm::ConstantInt::getTrue(context);
+    LLVMFalse = llvm::ConstantInt::getFalse(context);
+
+    // "All on" mask: every lane is i32 -1, i.e. 0xffffffff
+    llvm::Constant *laneOn =
+        llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), -1,
+                               true /*signed*/);
+    const std::vector<llvm::Constant *> allOn(width, laneOn);
+    LLVMMaskAllOn = llvm::ConstantVector::get(LLVMTypes::MaskType, allOn);
+
+    // "All off" mask: every lane is i32 0
+    llvm::Constant *laneOff =
+        llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 0,
+                               true /*signed*/);
+    const std::vector<llvm::Constant *> allOff(width, laneOff);
+    LLVMMaskAllOff = llvm::ConstantVector::get(LLVMTypes::MaskType, allOff);
+}
+
+
+/** Builds an LLVM i32 constant from a signed 32-bit value. */
+llvm::ConstantInt *LLVMInt32(int32_t ival) {
+    const bool isSigned = true;
+    llvm::LLVMContext &context = *g->ctx;
+    return llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), ival,
+                                  isSigned);
+}
+
+
+/** Builds an LLVM i32 constant from an unsigned 32-bit value. */
+llvm::ConstantInt *
+LLVMUInt32(uint32_t ival) {
+    const bool isSigned = false;
+    llvm::LLVMContext &context = *g->ctx;
+    return llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), ival,
+                                  isSigned);
+}
+
+
+/** Builds an LLVM i64 constant from a signed 64-bit value. */
+llvm::ConstantInt *
+LLVMInt64(int64_t ival) {
+    const bool isSigned = true;
+    llvm::LLVMContext &context = *g->ctx;
+    return llvm::ConstantInt::get(llvm::Type::getInt64Ty(context), ival,
+                                  isSigned);
+}
+
+
+/** Builds an LLVM i64 constant from an unsigned 64-bit value. */
+llvm::ConstantInt *
+LLVMUInt64(uint64_t ival) {
+    const bool isSigned = false;
+    llvm::LLVMContext &context = *g->ctx;
+    return llvm::ConstantInt::get(llvm::Type::getInt64Ty(context), ival,
+                                  isSigned);
+}
+
+
+/** Builds an LLVM single-precision floating-point constant. */
+llvm::Constant *
+LLVMFloat(float fval) {
+    llvm::LLVMContext &context = *g->ctx;
+    return llvm::ConstantFP::get(llvm::Type::getFloatTy(context), fval);
+}
+
+
+/** Builds an LLVM double-precision floating-point constant. */
+llvm::Constant *
+LLVMDouble(double dval) {
+    llvm::LLVMContext &context = *g->ctx;
+    return llvm::ConstantFP::get(llvm::Type::getDoubleTy(context), dval);
+}
+
+
+/** Builds an i32 vector constant with the given signed value replicated
+    in each of the g->target.vectorWidth lanes. */
+llvm::Constant *
+LLVMInt32Vector(int32_t ival) {
+    llvm::Constant *lane = LLVMInt32(ival);
+    const std::vector<llvm::Constant *> lanes(g->target.vectorWidth, lane);
+    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, lanes);
+}
+
+
+/** Builds an i32 vector constant whose lanes are taken, in order, from
+    the first g->target.vectorWidth entries of ivec. */
+llvm::Constant *
+LLVMInt32Vector(const int32_t *ivec) {
+    const int width = g->target.vectorWidth;
+    std::vector<llvm::Constant *> lanes;
+    for (int lane = 0; lane < width; ++lane) {
+        llvm::Constant *element = LLVMInt32(ivec[lane]);
+        lanes.push_back(element);
+    }
+    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, lanes);
+}
+
+
+/** Builds an i32 vector constant with the given unsigned value replicated
+    in each of the g->target.vectorWidth lanes. */
+llvm::Constant *
+LLVMUInt32Vector(uint32_t ival) {
+    llvm::Constant *lane = LLVMUInt32(ival);
+    const std::vector<llvm::Constant *> lanes(g->target.vectorWidth, lane);
+    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, lanes);
+}
+
+
+/** Builds an i32 vector constant whose lanes are taken, in order, from
+    the first g->target.vectorWidth entries of the unsigned array ivec. */
+llvm::Constant *
+LLVMUInt32Vector(const uint32_t *ivec) {
+    const int width = g->target.vectorWidth;
+    std::vector<llvm::Constant *> lanes;
+    for (int lane = 0; lane < width; ++lane) {
+        llvm::Constant *element = LLVMUInt32(ivec[lane]);
+        lanes.push_back(element);
+    }
+    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, lanes);
+}
+
+
+/** Builds a float vector constant with the given value replicated in each
+    of the g->target.vectorWidth lanes. */
+llvm::Constant *
+LLVMFloatVector(float fval) {
+    llvm::Constant *lane = LLVMFloat(fval);
+    const std::vector<llvm::Constant *> lanes(g->target.vectorWidth, lane);
+    return llvm::ConstantVector::get(LLVMTypes::FloatVectorType, lanes);
+}
+
+
+/** Builds a float vector constant whose lanes are taken, in order, from
+    the first g->target.vectorWidth entries of fvec. */
+llvm::Constant *
+LLVMFloatVector(const float *fvec) {
+    const int width = g->target.vectorWidth;
+    std::vector<llvm::Constant *> lanes;
+    for (int lane = 0; lane < width; ++lane) {
+        llvm::Constant *element = LLVMFloat(fvec[lane]);
+        lanes.push_back(element);
+    }
+    return llvm::ConstantVector::get(LLVMTypes::FloatVectorType, lanes);
+}
+
+
+/** Builds a double vector constant with the given value replicated in
+    each of the g->target.vectorWidth lanes. */
+llvm::Constant *
+LLVMDoubleVector(double dval) {
+    llvm::Constant *lane = LLVMDouble(dval);
+    const std::vector<llvm::Constant *> lanes(g->target.vectorWidth, lane);
+    return llvm::ConstantVector::get(LLVMTypes::DoubleVectorType, lanes);
+}
+
+
+/** Builds a double vector constant whose lanes are taken, in order, from
+    the first g->target.vectorWidth entries of dvec. */
+llvm::Constant *
+LLVMDoubleVector(const double *dvec) {
+    const int width = g->target.vectorWidth;
+    std::vector<llvm::Constant *> lanes;
+    for (int lane = 0; lane < width; ++lane) {
+        llvm::Constant *element = LLVMDouble(dvec[lane]);
+        lanes.push_back(element);
+    }
+    return llvm::ConstantVector::get(LLVMTypes::DoubleVectorType, lanes);
+}
+
+
+/** Builds an i64 vector constant with the given signed value replicated
+    in each of the g->target.vectorWidth lanes. */
+llvm::Constant *
+LLVMInt64Vector(int64_t ival) {
+    llvm::Constant *lane = LLVMInt64(ival);
+    const std::vector<llvm::Constant *> lanes(g->target.vectorWidth, lane);
+    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, lanes);
+}
+
+
+/** Builds an i64 vector constant whose lanes are taken, in order, from
+    the first g->target.vectorWidth entries of ivec. */
+llvm::Constant *
+LLVMInt64Vector(const int64_t *ivec) {
+    const int width = g->target.vectorWidth;
+    std::vector<llvm::Constant *> lanes;
+    for (int lane = 0; lane < width; ++lane) {
+        llvm::Constant *element = LLVMInt64(ivec[lane]);
+        lanes.push_back(element);
+    }
+    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, lanes);
+}
+
+
+/** Builds an i64 vector constant with the given unsigned value replicated
+    in each of the g->target.vectorWidth lanes. */
+llvm::Constant *
+LLVMUInt64Vector(uint64_t ival) {
+    llvm::Constant *lane = LLVMUInt64(ival);
+    const std::vector<llvm::Constant *> lanes(g->target.vectorWidth, lane);
+    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, lanes);
+}
+
+
+/** Builds an i64 vector constant whose lanes are taken, in order, from
+    the first g->target.vectorWidth entries of the unsigned array ivec. */
+llvm::Constant *
+LLVMUInt64Vector(const uint64_t *ivec) {
+    const int width = g->target.vectorWidth;
+    std::vector<llvm::Constant *> lanes;
+    for (int lane = 0; lane < width; ++lane) {
+        llvm::Constant *element = LLVMUInt64(ivec[lane]);
+        lanes.push_back(element);
+    }
+    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, lanes);
+}
+
+
+/** Builds a bool vector constant with the given value replicated in each
+    of the g->target.vectorWidth lanes.  When the bool vector type is the
+    i32 vector type, true is encoded as 0xffffffff and false as 0;
+    otherwise the element type must be i1 and LLVMTrue/LLVMFalse are used.
+ */
+llvm::Constant *
+LLVMBoolVector(bool b) {
+    llvm::Constant *lane = NULL;
+    if (LLVMTypes::BoolVectorType != LLVMTypes::Int32VectorType) {
+        assert(LLVMTypes::BoolVectorType->getElementType() ==
+               llvm::Type::getInt1Ty(*g->ctx));
+        lane = b ? LLVMTrue : LLVMFalse;
+    }
+    else {
+        const uint64_t bits = b ? 0xffffffff : 0;
+        lane = llvm::ConstantInt::get(LLVMTypes::Int32Type, bits,
+                                      false /*unsigned*/);
+    }
+
+    const std::vector<llvm::Constant *> lanes(g->target.vectorWidth, lane);
+    return llvm::ConstantVector::get(LLVMTypes::BoolVectorType, lanes);
+}
+
+
+/** Builds a bool vector constant whose lanes are taken, in order, from
+    the first g->target.vectorWidth entries of bvec.  The per-lane
+    encoding is the same as for the single-value overload: 0xffffffff / 0
+    when bool vectors are i32 vectors, LLVMTrue / LLVMFalse when they are
+    i1 vectors.
+ */
+llvm::Constant *
+LLVMBoolVector(const bool *bvec) {
+    const bool lanesAreInt32 =
+        (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType);
+    const int width = g->target.vectorWidth;
+
+    std::vector<llvm::Constant *> lanes;
+    for (int lane = 0; lane < width; ++lane) {
+        llvm::Constant *element = NULL;
+        if (lanesAreInt32) {
+            const uint64_t bits = bvec[lane] ? 0xffffffff : 0;
+            element = llvm::ConstantInt::get(LLVMTypes::Int32Type, bits,
+                                             false /*unsigned*/);
+        }
+        else {
+            assert(LLVMTypes::BoolVectorType->getElementType() ==
+                   llvm::Type::getInt1Ty(*g->ctx));
+            element = bvec[lane] ? LLVMTrue : LLVMFalse;
+        }
+        lanes.push_back(element);
+    }
+    return llvm::ConstantVector::get(LLVMTypes::BoolVectorType, lanes);
+}
+
+
+/** Returns the type used for a "vector" of pointers to t: an array (NOTE:
+    ArrayType, not VectorType) of g->target.vectorWidth pointers to t in
+    address space 0.
+ */
+const llvm::ArrayType *
+LLVMPointerVectorType(const llvm::Type *t) {
+    const llvm::PointerType *pointerToT = llvm::PointerType::get(t, 0);
+    return llvm::ArrayType::get(pointerToT, g->target.vectorWidth);
+}
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -0,0 +1,157 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file llvmutil.h
+    @brief Header file with declarations for various LLVM utility stuff
+*/
+
+#ifndef ISPC_LLVMUTIL_H
+#define ISPC_LLVMUTIL_H 1
+
+#include "ispc.h"
+#include <llvm/LLVMContext.h>
+#include <llvm/Type.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/Constants.h>
+
+/** This structure holds pointers to a variety of LLVM types; code
+    elsewhere can use them from here, rather than needing to make more
+    verbose LLVM API calls.  All members are NULL until InitLLVMUtil()
+    has been called.
+ */
+struct LLVMTypes {
+    // Scalar types and pointers to them.  VoidPointerType is set up by
+    // InitLLVMUtil() as a pointer to i8.
+    static const llvm::Type *VoidType;
+    static const llvm::PointerType *VoidPointerType;
+    static const llvm::Type *BoolType;
+    static const llvm::Type *Int8Type;
+    static const llvm::Type *Int16Type;
+    static const llvm::Type *Int32Type;
+    static const llvm::Type *Int32PointerType;
+    static const llvm::Type *Int64Type;
+    static const llvm::Type *Int64PointerType;
+    static const llvm::Type *FloatType;
+    static const llvm::Type *FloatPointerType;
+    static const llvm::Type *DoubleType;
+
+    // Vector types, each with as many elements as the target's vector
+    // width, and pointers to them.  InitLLVMUtil() makes MaskType and
+    // BoolVectorType the same vector-of-i32 type; Int1VectorType is the
+    // vector-of-i1 type.
+    static const llvm::VectorType *MaskType;
+    static const llvm::VectorType *BoolVectorType;
+    static const llvm::VectorType *Int1VectorType;
+    static const llvm::VectorType *Int32VectorType;
+    static const llvm::Type *Int32VectorPointerType;
+    static const llvm::VectorType *Int64VectorType;
+    static const llvm::Type *Int64VectorPointerType;
+    static const llvm::VectorType *FloatVectorType;
+    static const llvm::Type *FloatVectorPointerType;
+    static const llvm::VectorType *DoubleVectorType;
+    // An array (not a vector) of void pointers
+    static const llvm::ArrayType *VoidPointerVectorType;
+};
+
+/** These variables hold the LLVM boolean 'true' and 'false' constant
+    values as a convenience to code elsewhere in the system.  They are
+    NULL until InitLLVMUtil() has been called.
+ */
+extern llvm::Constant *LLVMTrue, *LLVMFalse;
+
+/** This should be called early in initialization to initialize the members
+    of LLVMTypes, the LLVMTrue/LLVMFalse constants, and the
+    LLVMMaskAllOn/LLVMMaskAllOff constants.  However, it can't be called
+    until the compilation target is known, since the vector types are
+    sized according to target.vectorWidth.
+ */
+extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);
+
+/** Returns an LLVM i32 constant of the given (signed) value */
+extern llvm::ConstantInt *LLVMInt32(int32_t i);
+/** Returns an LLVM i32 constant of the given (unsigned) value */
+extern llvm::ConstantInt *LLVMUInt32(uint32_t i);
+/** Returns an LLVM i64 constant of the given (signed) value */
+extern llvm::ConstantInt *LLVMInt64(int64_t i);
+/** Returns an LLVM i64 constant of the given (unsigned) value */
+extern llvm::ConstantInt *LLVMUInt64(uint64_t i);
+/** Returns an LLVM float constant of the given value */
+extern llvm::Constant *LLVMFloat(float f);
+/** Returns an LLVM double constant of the given value */
+extern llvm::Constant *LLVMDouble(double f);
+
+/** Returns an LLVM boolean vector constant of the given value smeared
+    across all elements.  The result has type LLVMTypes::BoolVectorType. */
+extern llvm::Constant *LLVMBoolVector(bool v);
+/** Returns an LLVM i32 vector constant of the given value smeared
+    across all elements */
+extern llvm::Constant *LLVMInt32Vector(int32_t i);
+/** Returns an LLVM i32 vector constant of the given unsigned value
+    smeared across all elements */
+extern llvm::Constant *LLVMUInt32Vector(uint32_t i);
+/** Returns an LLVM i64 vector constant of the given value smeared
+    across all elements */
+extern llvm::Constant *LLVMInt64Vector(int64_t i);
+/** Returns an LLVM i64 vector constant of the given unsigned value
+    smeared across all elements */
+extern llvm::Constant *LLVMUInt64Vector(uint64_t i);
+/** Returns an LLVM float vector constant of the given value smeared
+    across all elements */
+extern llvm::Constant *LLVMFloatVector(float f);
+/** Returns an LLVM double vector constant of the given value smeared
+    across all elements */
+extern llvm::Constant *LLVMDoubleVector(double f);
+
+/** Returns an LLVM boolean vector based on the given array of values.
+    The array should have g->target.vectorWidth elements. */
+extern llvm::Constant *LLVMBoolVector(const bool *v);
+/** Returns an LLVM i32 vector based on the given array of values.
+    The array should have g->target.vectorWidth elements. */
+extern llvm::Constant *LLVMInt32Vector(const int32_t *i);
+/** Returns an LLVM i32 vector based on the given array of unsigned
+    values.  The array should have g->target.vectorWidth elements. */
+extern llvm::Constant *LLVMUInt32Vector(const uint32_t *i);
+/** Returns an LLVM i64 vector based on the given array of values.
+    The array should have g->target.vectorWidth elements. */
+extern llvm::Constant *LLVMInt64Vector(const int64_t *i);
+/** Returns an LLVM i64 vector based on the given array of unsigned
+    values.  The array should have g->target.vectorWidth elements. */
+extern llvm::Constant *LLVMUInt64Vector(const uint64_t *i);
+/** Returns an LLVM float vector based on the given array of values.
+    The array should have g->target.vectorWidth elements. */
+extern llvm::Constant *LLVMFloatVector(const float *f);
+/** Returns an LLVM double vector based on the given array of values.
+    The array should have g->target.vectorWidth elements. */
+extern llvm::Constant *LLVMDoubleVector(const double *f);
+
+/** LLVM constant value representing an 'all on' SIMD lane mask */
+extern llvm::Constant *LLVMMaskAllOn;
+/** LLVM constant value representing an 'all off' SIMD lane mask */
+extern llvm::Constant *LLVMMaskAllOff;
+
+/** Given an LLVM type, returns the corresponding type for a vector of
+    pointers to that type.  (In practice, an array of pointers, since LLVM
+    prohibits vectors of pointers.)
+ */
+extern const llvm::ArrayType *LLVMPointerVectorType(const llvm::Type *t);
+
+#endif // ISPC_LLVMUTIL_H
--- a/main.cpp
+++ b/main.cpp
@@ -0,0 +1,330 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file main.cpp
+    @brief main() entrypoint implementation for ispc
+*/
+
+#include "ispc.h"
+#include "module.h"
+#include <stdio.h>
+#include <llvm/Support/PrettyStackTrace.h>
+#ifdef LLVM_2_8
+#include <llvm/System/Signals.h>
+#else
+#include <llvm/Support/Signals.h>
+#endif
+
+#ifdef ISPC_IS_WINDOWS
+#define strcasecmp stricmp
+#define BUILD_DATE __DATE__
+#define BUILD_VERSION ""
+#endif // ISPC_IS_WINDOWS
+
+/** Prints the ispc banner (build date and version) followed by a summary
+    of the command-line options to standard output, and then terminates
+    the process with the given exit status.  This function does not
+    return.
+
+    @param ret  Process exit status passed to exit().
+ */
+static void usage(int ret) {
+    printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", BUILD_DATE, BUILD_VERSION);
+    printf("usage: ispc\n");
+    printf("    [--arch={x86,x86-64}]\t\tSelect target architecture\n");
+    printf("    [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
+    printf("         (atom, barcelona, core2, corei7, corei7-avx, istanbul, nocona,\n");
+    printf("          penryn, westmere)\n");
+#ifndef ISPC_IS_WINDOWS
+    // Preprocessor-related options are only advertised on non-Windows builds
+    printf("    [-D<foo>]\t\t\t\t#define value when running preprocessor\n");
+#endif
+    printf("    [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
+    printf("    [--emit-asm]\t\t\tGenerate assembly language file as output\n");
+    printf("    [--emit-llvm]\t\t\tEmit LLVM bitcode file as output\n");
+    printf("    [--emit-obj]\t\t\tGenerate object file as output\n");
+    printf("    [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+    printf("    [-g]\t\t\t\tGenerate debugging information\n");
+    printf("    [--help]\t\t\t\tPrint help\n");
+    printf("    [-h] <name>\t\t\t\tOutput filename for header\n");
+    printf("    [--instrument]\t\t\tEmit instrumentation to gather performance data\n");
+    printf("    [--math-lib=<option>]\t\tSelect math library\n");
+    printf("        default\t\t\t\tUse ispc's built-in math functions\n");
+    printf("        fast\t\t\t\tUse high-performance but lower-accuracy math functions\n");
+    printf("        svml\t\t\t\tUse the Intel SVML math libraries\n");
+    printf("        system\t\t\t\tUse the system's math library (*may be quite slow*)\n");
+    printf("    [--nostdlib]\t\t\tDon't make the ispc standard library available\n");
+#ifndef ISPC_IS_WINDOWS
+    printf("    [--nocpp]\t\t\t\tDon't run the C preprocessor\n");
+#endif
+    printf("    [-o/--outfile] <name>\t\tOutput filename for bitcode (may be \"-\" for standard output)\n");
+    printf("    [-O0/-O1]\t\t\t\tSet optimization level\n");
+    printf("    [--opt=<option>]\t\t\tSet optimization option\n");
+    printf("        disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
+    printf("        disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
+    printf("        disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
+    printf("        disable-gather-scatter-optimizations\tDisable improvements to gather/scatter\n");
+    printf("        disable-blending-removal\t\tDisable eliminating blend at same scope\n");
+    printf("        disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
+    printf("        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
+    printf("        disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
+    printf("    [--target={sse2,sse4,sse4x2,avx}] Select target ISA (SSE4 is default)\n");
+    printf("    [--version]\t\t\t\tPrint ispc version\n");
+    printf("    [--woff]\t\t\t\tDisable warnings\n");
+    printf("    [--wno-perf]\t\t\tDon't issue warnings related to performance-related issues\n");
+    printf("    <file to compile or \"-\" for stdin>\n");
+    exit(ret);
+}
+
+/** Given a target name string, initialize the isa, nativeVectorWidth and
+    vectorWidth members of the global g->target structure accordingly.
+    The name is matched case-insensitively against "sse2", "sse4",
+    "sse4x2" and "avx"; any other name results in a call to usage(1).
+*/
+static void lDoTarget(const char *target) {
+    int nativeWidth, width;
+
+    if (strcasecmp(target, "sse2") == 0) {
+        g->target.isa = Target::SSE2;
+        nativeWidth = 4;
+        width = 4;
+    }
+    else if (strcasecmp(target, "sse4") == 0) {
+        g->target.isa = Target::SSE4;
+        nativeWidth = 4;
+        width = 4;
+    }
+    else if (strcasecmp(target, "sse4x2") == 0) {
+        // SSE4 ISA, but with a vector width twice the native width
+        g->target.isa = Target::SSE4;
+        nativeWidth = 4;
+        width = 8;
+    }
+    else if (strcasecmp(target, "avx") == 0) {
+        g->target.isa = Target::AVX;
+        nativeWidth = 8;
+        width = 8;
+    }
+    else {
+        // Unknown target name: leave g->target untouched
+        usage(1);
+        return;
+    }
+
+    g->target.nativeVectorWidth = nativeWidth;
+    g->target.vectorWidth = width;
+}
+
+
+/** We take arguments from both the command line as well as from the
+    ISPC_ARGS environment variable.  This function returns a new set of
+    arguments representing the ones from those two sources merged
+    together: first the command-line arguments, then the space-separated
+    words of ISPC_ARGS (if it is set).
+
+    @param Argc  Number of entries in Argv.
+    @param Argv  Command-line arguments as passed to main().
+    @param argc  Set to the total number of merged arguments.
+    @param argv  Receives the merged arguments; has room for 128 entries.
+
+    If the merged arguments don't fit in argv, an error is printed to
+    stderr and the process exits with status 1.  The strings allocated
+    for the ISPC_ARGS words are never freed here.
+ */
+static void lGetAllArgs(int Argc, char *Argv[], int &argc, char *argv[128]) {
+    const int maxArgs = 128;
+
+    // Copy over the command line arguments (passed in)
+    if (Argc > maxArgs) {
+        fprintf(stderr, "Too many command-line arguments (the limit is %d).\n",
+                maxArgs);
+        exit(1);
+    }
+    for (int i = 0; i < Argc; ++i)
+        argv[i] = Argv[i];
+    argc = Argc;
+
+    // See if we have any set via the environment variable
+    const char *env = getenv("ISPC_ARGS");
+    if (!env)
+        return;
+    while (true) {
+        // Skip any spaces before the next argument.  Doing this up front
+        // (rather than only between arguments) means that an empty
+        // ISPC_ARGS, or one with leading spaces, doesn't inject an empty
+        // argument that would later be taken for a file name.
+        while (*env == ' ')
+            ++env;
+
+        // Hit the end of the string; get out of here
+        if (*env == '\0')
+            break;
+
+        // Look for the next space in the string, which delimits the end of
+        // the current argument
+        const char *end = strchr(env, ' ');
+        if (end == NULL)
+            end = env + strlen(env);
+        const size_t len = end - env;
+
+        // Don't write past the end of the caller's fixed-size array
+        if (argc >= maxArgs) {
+            fprintf(stderr, "Too many arguments from the command line and "
+                    "ISPC_ARGS (the limit is %d).\n", maxArgs);
+            exit(1);
+        }
+
+        // Copy the argument into newly allocated memory (so we can
+        // NUL-terminate it).
+        char *ptr = new char[len+1];
+        strncpy(ptr, env, len);
+        ptr[len] = '\0';
+        argv[argc++] = ptr;
+
+        // Continue scanning from the delimiter (or the terminating NUL)
+        env = end;
+    }
+}
+
+
+/** Program entry point.  Merges the command-line arguments with any from
+    the ISPC_ARGS environment variable, parses them into the global
+    Globals structure (g), compiles the given file via a Module, and
+    writes the requested outputs.
+
+    @return 0 if compilation finished with no errors (or after printing
+    the version); 1 if the module reported any errors or if writing an
+    output file failed.  Invalid options go through usage(1).
+ */
+int main(int Argc, char *Argv[]) {
+    // Merged argument list; lGetAllArgs() fills in both argc and argv
+    int argc;
+    char *argv[128];
+    lGetAllArgs(Argc, Argv, argc, argv);
+
+    // Use LLVM's little utility function to print out nice stack traces if
+    // we crash
+    llvm::sys::PrintStackTraceOnErrorSignal();
+    llvm::PrettyStackTraceProgram X(argc, argv);
+
+    char *file = NULL;
+    const char *headerFileName = NULL;
+    const char *outFileName = NULL;
+
+    // Initialize globals early so that we can set various option values
+    // as we're parsing below
+    g = new Globals;
+
+    // Track whether -g and an -O option were given explicitly, so that the
+    // default optimization level can be chosen after parsing
+    bool debugSet = false, optSet = false;
+    Module::OutputType ot = Module::Object;
+
+    // Note that the order of the tests below matters: prefix matches and
+    // the bare "-" must be checked before the generic "starts with '-'"
+    // test near the end, which rejects unknown options.
+    for (int i = 1; i < argc; ++i) {
+        if (!strcmp(argv[i], "--help"))
+            usage(0);
+#ifndef ISPC_IS_WINDOWS
+        else if (!strncmp(argv[i], "-D", 2)) {
+            // Passed along verbatim (including the "-D") in g->cppArgs
+            g->cppArgs.push_back(argv[i]);
+        }
+#endif // !ISPC_IS_WINDOWS
+        else if (!strncmp(argv[i], "--arch=", 7))
+            g->target.arch = argv[i] + 7;
+        else if (!strncmp(argv[i], "--cpu=", 6))
+            g->target.cpu = argv[i] + 6;
+        else if (!strcmp(argv[i], "--fast-math"))
+            g->opt.fastMath = true;
+        else if (!strcmp(argv[i], "--debug"))
+            g->debugPrint = true;
+        else if (!strcmp(argv[i], "--instrument"))
+            g->emitInstrumentation = true;
+        else if (!strcmp(argv[i], "-g")) {
+            g->generateDebuggingSymbols = true;
+            debugSet = true;
+        }
+        else if (!strcmp(argv[i], "--emit-asm"))
+            ot = Module::Asm;
+        else if (!strcmp(argv[i], "--emit-llvm"))
+            ot = Module::Bitcode;
+        else if (!strcmp(argv[i], "--emit-obj"))
+            ot = Module::Object;
+        else if (!strcmp(argv[i], "--target")) {
+            // "--target <name>" form: the name is the next argument
+            if (++i == argc) usage(1);
+            lDoTarget(argv[i]);
+        }
+        else if (!strncmp(argv[i], "--target=", 9)) {
+            const char *target = argv[i] + 9;
+            lDoTarget(target);
+        }
+        else if (!strncmp(argv[i], "--math-lib=", 11)) {
+            const char *lib = argv[i] + 11;
+            if (!strcmp(lib, "default"))
+                g->mathLib = Globals::Math_ISPC;
+            else if (!strcmp(lib, "fast"))
+                g->mathLib = Globals::Math_ISPCFast;
+            else if (!strcmp(lib, "svml"))
+                g->mathLib = Globals::Math_SVML;
+            else if (!strcmp(lib, "system"))
+                g->mathLib = Globals::Math_System;
+            else
+                usage(1);
+        }
+        else if (!strncmp(argv[i], "--opt=", 6)) {
+            const char *opt = argv[i] + 6;
+            if (!strcmp(opt, "disable-blended-masked-stores"))
+                g->opt.disableBlendedMaskedStores = true;
+            else if (!strcmp(opt, "disable-coherent-control-flow"))
+                g->opt.disableCoherentControlFlow = true;
+            else if (!strcmp(opt, "disable-uniform-control-flow"))
+                g->opt.disableUniformControlFlow = true;
+            else if (!strcmp(opt, "disable-gather-scatter-optimizations"))
+                g->opt.disableGatherScatterOptimizations = true;
+            else if (!strcmp(opt, "disable-blending-removal"))
+                g->opt.disableMaskedStoreToStore = true;
+            else if (!strcmp(opt, "disable-gather-scatter-flattening"))
+                g->opt.disableGatherScatterFlattening = true;
+            else if (!strcmp(opt, "disable-uniform-memory-optimizations"))
+                g->opt.disableUniformMemoryOptimizations = true;
+            else if (!strcmp(opt, "disable-masked-store-optimizations"))
+                g->opt.disableMaskedStoreOptimizations = true;
+            else 
+                usage(1);
+        }
+        else if (!strcmp(argv[i], "--woff") || !strcmp(argv[i], "-woff")) {
+            // Turning off warnings also turns off performance warnings
+            g->disableWarnings = true;
+            g->emitPerfWarnings = false;
+        }
+        else if (!strcmp(argv[i], "--wno-perf") || !strcmp(argv[i], "-wno-perf"))
+            g->emitPerfWarnings = false;
+        else if (!strcmp(argv[i], "-o") || !strcmp(argv[i], "--outfile")) {
+            if (++i == argc) usage(1);
+            outFileName = argv[i];
+        }
+        else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--header-outfile")) {
+            if (++i == argc) usage(1);
+            headerFileName = argv[i];
+        }
+        else if (!strcmp(argv[i], "-O0")) {
+            g->opt.level = 0;
+            optSet = true;
+        }
+        else if (!strcmp(argv[i], "-O") ||  !strcmp(argv[i], "-O1") || 
+                 !strcmp(argv[i], "-O2") || !strcmp(argv[i], "-O3")) {
+            // All of these map to the same optimization level
+            g->opt.level = 1;
+            optSet = true;
+        }
+        else if (!strcmp(argv[i], "-"))
+            // A bare "-" (described by usage() as meaning stdin) is
+            // accepted and leaves 'file' as it was (NULL by default)
+            ;
+        else if (!strcmp(argv[i], "--nostdlib"))
+            g->includeStdlib = false;
+        else if (!strcmp(argv[i], "--nocpp"))
+            g->runCPP = false;
+        else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
+            printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n", 
+                   BUILD_DATE, BUILD_VERSION);
+            return 0;
+        }
+        else if (argv[i][0] == '-')
+            // Any other option is unknown
+            usage(1);
+        else {
+            // Anything else is the file to compile; only one is allowed
+            if (file != NULL)
+                usage(1);
+            else
+                file = argv[i];
+        }
+    }
+
+    // If the user specified -g, then the default optimization level is 0.
+    // If -g wasn't specified, the default optimization level is 1 (full
+    // optimization).
+    if (debugSet && !optSet)
+        g->opt.level = 0;
+
+    // Compile, and only write outputs if compilation reported success
+    // (CompileFile() returning 0).  Each output is only written if the
+    // corresponding file name was given on the command line.
+    m = new Module(file);
+    if (m->CompileFile() == 0) {
+        if (outFileName != NULL)
+            if (!m->WriteOutput(ot, outFileName))
+                return 1;
+        if (headerFileName != NULL)
+            if (!m->WriteOutput(Module::Header, headerFileName))
+                return 1;
+    }
+    // Save the error count before the module is destroyed
+    int errorCount = m->errorCount;
+    delete m;
+
+    return errorCount > 0;
+}
--- a/module.cpp
+++ b/module.cpp
--- a/module.h
+++ b/module.h
@@ -0,0 +1,113 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file module.h
+    @brief Declaration of the Module class, which is the ispc-side representation
+    of the results of compiling a source file.
+ */
+
+#ifndef ISPC_MODULE_H
+#define ISPC_MODULE_H 1
+
+#include "ispc.h"
+
+class Module {
+public:
+    /** Creates a module for one source file.
+        @param filename name of the source file being compiled; used as the module name. */
+    Module(const char *filename);
+
+    /** Compiles the source file passed to the Module constructor, adding
+        its global variables and functions to both the llvm::Module and
+        SymbolTable.  @return the number of errors during compilation. */
+    int CompileFile();
+
+    /** Adds the global variable described by the declaration specifiers
+        @p ds and the declarator @p decl to the module. */
+    void AddGlobal(DeclSpecs *ds, Declarator *decl);
+
+    /** Adds the function described by @p ds and @p decl, with the
+        statements in @p code as its body, to the module. */
+    void AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code);
+
+    /** After a source file has been compiled, output can be generated in a
+        number of different formats. */
+    enum OutputType { Asm,      /**< Generate text assembly language output */
+                      Bitcode,  /**< Generate LLVM IR bitcode output */
+                      Object,   /**< Generate a native object file */
+                      Header    /**< Generate a C/C++ header file with
+                                     declarations of 'export'ed functions, global
+                                     variables, and the types used by them. */
+    };
+
+    /** Writes output of type @p ot to the file named @p filename.
+        @param filename output path; may be NULL, indicating that output
+               should go to standard output.
+        @return true on success, false if there has been an error. */
+    bool WriteOutput(OutputType ot, const char *filename);
+
+    /** Total number of errors encountered during compilation. */
+    int errorCount;
+
+    /** Symbol table to hold symbols visible in the current scope during
+        compilation. */
+    SymbolTable *symbolTable;
+
+    /** llvm Module object into which globals and functions are added. */
+    llvm::Module *module; 
+
+#ifndef LLVM_2_8
+    /** The diBuilder manages generating debugging information (only
+        supported in LLVM 2.9 and beyond...) */
+    llvm::DIBuilder *diBuilder;
+#endif
+
+    GatherBuffer *gatherBuffer;  ///< NOTE(review): undocumented; role not evident from this header.
+
+private:
+    const char *filename;  ///< Presumably the name given to the constructor -- TODO confirm.
+
+    /** This member records the global variables that have been defined
+        with 'extern' linkage, so that it's easy to include their
+        declarations in generated header files.
+
+        @todo FIXME: it would be nice to eliminate this and then query the
+        symbol table or the llvm Module for them when/if we need them.
+     */
+    std::vector<Symbol *> externGlobals;
+    // Presumably the per-format back ends used by WriteOutput() -- TODO confirm.
+    bool writeHeader(const char *filename);
+    bool writeObjectFileOrAssembly(OutputType outputType, const char *filename);
+};
+
+#endif // ISPC_MODULE_H
--- a/opt.cpp
+++ b/opt.cpp
--- a/opt.h
+++ b/opt.h
@@ -0,0 +1,50 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file opt.h
+    @brief Declarations related to optimization passes
+*/
+
+#ifndef ISPC_OPT_H
+#define ISPC_OPT_H 1
+
+#include "ispc.h"
+
+/** Optimize the functions in the given module, applying the specified
+    level of optimization.
+    @param module   LLVM module whose functions are to be optimized.
+    @param optLevel zero is essentially no optimization (just enough to
+                    generate correct code); one is full optimization. */
+void Optimize(llvm::Module *module, int optLevel);
+
+#endif // ISPC_OPT_H
--- a/parse.yy
+++ b/parse.yy
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -0,0 +1,43 @@
+#!/bin/zsh
+# Runs tests/*.ispc (each must compile and pass ispc_test) and
+# failing_tests/*.ispc (each must fail).  Exits 1 on any surprise, else 0.
+surprises=0
+echo Running correctness tests
+
+for i in tests/*.ispc; do
+    bc=${i%%ispc}bc
+    ispc -O2 $i -woff -o $bc --emit-llvm --target=sse4
+    if [[ $? != 0 ]]; then
+        surprises=1
+        echo Test $i FAILED ispc compile
+        echo
+    else
+        ispc_test $bc
+        if [[ $? != 0 ]]; then
+            surprises=1
+            echo Test $i FAILED ispc_test
+            echo
+        fi
+#        cmp $bc tests_bitcode${bc##tests}
+#        if [[ $? == 0 ]]; then
+#            /bin/rm $bc
+#        fi
+    fi
+    /bin/rm -f $bc   # -f: a failed compile writes no bitcode, so there may be nothing to remove
+done
+
+echo Running failing tests
+for i in failing_tests/*.ispc; do
+    (ispc -O2 $i -woff -o - --emit-llvm | ispc_test -) 2>/dev/null 1>/dev/null
+    if [[ $? == 0 ]]; then
+        surprises=1
+        echo Test $i UNEXPECTEDLY PASSED
+        echo
+    fi
+done
+
+if [[ $surprises == 0 ]]; then
+    echo No surprises.
+fi
+
+exit $surprises
--- a/stdlib-avx.ll
+++ b/stdlib-avx.ll
@@ -0,0 +1,589 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; *** Untested *** AVX target implementation.
+;;
+;; The LLVM AVX code generator is incomplete, so the ispc AVX target
+;; hasn't yet been tested.  There is therefore a higher-than-normal
+;; chance that there are bugs in the code in this file.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Basic 8-wide definitions
+
+stdlib_core(8)
+packed_load_and_store(8)
+int8_16(8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <8 x float> @llvm.x86.avx.rcp.ps(<8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.rcp.ss(<8 x float>) nounwind readnone
+
+define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; Per-lane reciprocal: take the hardware estimate est of 1/v and
+  ; return est * (2 - v * est).
+
+  %est = call <8 x float> @llvm.x86.avx.rcp.ps(<8 x float> %0)
+  ; one Newton-Raphson refinement step
+  %prod = fmul <8 x float> %0, %est
+  %corr = fsub <8 x float> <float 2., float 2., float 2., float 2.,
+                            float 2., float 2., float 2., float 2.>, %prod
+  %refined = fmul <8 x float> %est, %corr
+  ret <8 x float> %refined
+}
+
+define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+  ; Scalar reciprocal: place the value in lane 0, take the rcp.ss estimate
+  ; est from lane 0, and return est * (2 - v * est).
+  %wide = insertelement <8 x float> undef, float %0, i32 0
+  %wide_est = call <8 x float> @llvm.x86.avx.rcp.ss(<8 x float> %wide)
+  %est = extractelement <8 x float> %wide_est, i32 0
+
+  ; one Newton-Raphson refinement step
+  %prod = fmul float %0, %est
+  %corr = fsub float 2., %prod
+  %refined = fmul float %est, %corr
+  ret float %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding
+
+declare <8 x float> @llvm.x86.avx.round.ps(<8 x float>, i32) nounwind readnone
+declare <8 x float> @llvm.x86.avx.round.ss(<8 x float>, <8 x float>, i32) nounwind readnone
+
+define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps immediate 8 = 0b1000 (no precision exceptions) | 0b00 (round to nearest)
+  %rounded = call <8 x float> @llvm.x86.avx.round.ps(<8 x float> %0, i32 8)
+  ret <8 x float> %rounded
+}
+
+define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; roundss immediate 8 = 0b1000 (no precision exceptions) | 0b00 (round to nearest).
+  ; The roundss intrinsic takes two vector operands; its documentation reads:
+  ;
+  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
+  ;
+  ;  b is a 128-bit parameter. The lowest 32 bits are the result of the rounding
+  ;  function on b0. The higher order 96 bits are copied directly from input
+  ;  parameter a. The return value is described by the following equations:
+  ;
+  ;  r0 = RND(b0)
+  ;  r1 = a1
+  ;  r2 = a2
+  ;  r3 = a3
+  ;
+  ;  Only r0 is used below, so the contents of a are irrelevant and the same
+  ;  vector is passed for both operands.
+  %lane0 = insertelement <8 x float> undef, float %0, i32 0
+  %wide = call <8 x float> @llvm.x86.avx.round.ss(<8 x float> %lane0, <8 x float> %lane0, i32 8)
+  %scalar = extractelement <8 x float> %wide, i32 0
+  ret float %scalar
+}
+
+define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps immediate 9 = 0b1000 (no precision exceptions) | 0b01 (round down)
+  %floored = call <8 x float> @llvm.x86.avx.round.ps(<8 x float> %0, i32 9)
+  ret <8 x float> %floored
+}
+
+define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  ; both roundss operands get the same vector; only lane 0 of the result is used
+  %lane0 = insertelement <8 x float> undef, float %0, i32 0
+  ; roundss immediate 9 = 0b1000 (no precision exceptions) | 0b01 (round down)
+  %wide = call <8 x float> @llvm.x86.avx.round.ss(<8 x float> %lane0, <8 x float> %lane0, i32 9)
+  %scalar = extractelement <8 x float> %wide, i32 0
+  ret float %scalar
+}
+
+define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps immediate 10 = 0b1000 (no precision exceptions) | 0b10 (round up)
+  %ceiled = call <8 x float> @llvm.x86.avx.round.ps(<8 x float> %0, i32 10)
+  ret <8 x float> %ceiled
+}
+
+define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  ; both roundss operands get the same vector; only lane 0 of the result is used
+  %lane0 = insertelement <8 x float> undef, float %0, i32 0
+  ; roundss immediate 10 = 0b1000 (no precision exceptions) | 0b10 (round up)
+  %wide = call <8 x float> @llvm.x86.avx.round.ss(<8 x float> %lane0, <8 x float> %lane0, i32 10)
+  %scalar = extractelement <8 x float> %wide, i32 0
+  ret float %scalar
+}
+
+declare <8 x float> @llvm.x86.avx.rsqrt.ps(<8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.rsqrt.ss(<8 x float>) nounwind readnone
+
+define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+  ; initial per-lane estimate est of the reciprocal square root of v
+  %est = call <8 x float> @llvm.x86.avx.rsqrt.ps(<8 x float> %v)
+  ; refine it once: result = 0.5 * est * (3 - (v * est) * est)
+  %v_est = fmul <8 x float> %v, %est
+  %v_est_sq = fmul <8 x float> %v_est, %est
+  %corr = fsub <8 x float> <float 3., float 3., float 3., float 3., float 3., float 3., float 3., float 3.>, %v_est_sq
+  %est_corr = fmul <8 x float> %est, %corr
+  %refined = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5>, %est_corr
+  ret <8 x float> %refined
+}
+
+define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ; scalar estimate est: value goes into lane 0, rsqrt.ss result is read from lane 0
+  %wide = insertelement <8 x float> undef, float %0, i32 0
+  %wide_est = call <8 x float> @llvm.x86.avx.rsqrt.ss(<8 x float> %wide)
+  %est = extractelement <8 x float> %wide_est, i32 0
+
+  ; refine it once: result = 0.5 * est * (3 - (v * est) * est)
+  %v_est = fmul float %0, %est
+  %v_est_sq = fmul float %v_est, %est
+  %corr = fsub float 3., %v_est_sq
+  %est_corr = fmul float %est, %corr
+  %refined = fmul float 0.5, %est_corr
+  ret float %refined
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <8 x float> @llvm.x86.avx.sqrt.ps(<8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.sqrt.ss(<8 x float>) nounwind readnone
+
+define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  %roots = call <8 x float> @llvm.x86.avx.sqrt.ps(<8 x float> %0)  ; per-lane square root
+  ret <8 x float> %roots
+}
+
+define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  sse_unary_scalar(ret, 8, float, @llvm.x86.avx.sqrt.ss, %0)
+  ret float %ret  ; %ret is not defined in this body; presumably the macro above binds it -- confirm
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fastmath
+
+declare void @llvm.x86.avx.stmxcsr(i32 *) nounwind
+declare void @llvm.x86.avx.ldmxcsr(i32 *) nounwind
+
+define internal void @__fastmath() nounwind alwaysinline {
+  %csr_slot = alloca i32
+  call void @llvm.x86.avx.stmxcsr(i32 * %csr_slot)
+  %csr = load i32 * %csr_slot
+  ; read-modify-write of the control word: keep the stored bits and also
+  ; set DAZ (64) and FTZ (32768); 64 + 32768 = 32832
+  %csr_fast = or i32 %csr, 32832
+  store i32 %csr_fast, i32 * %csr_slot
+  call void @llvm.x86.avx.ldmxcsr(i32 * %csr_slot)
+  ret void
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+; or, use the macro to call the 4-wide ones twice with our 8-wide
+; vectors...
+
+declare <8 x float> @__svml_sin(<8 x float>)
+declare <8 x float> @__svml_cos(<8 x float>)
+declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
+declare <8 x float> @__svml_tan(<8 x float>)
+declare <8 x float> @__svml_atan(<8 x float>)
+declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
+declare <8 x float> @__svml_exp(<8 x float>)
+declare <8 x float> @__svml_log(<8 x float>)
+declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <8 x float> @llvm.x86.avx.max.ps(<8 x float>, <8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.max.ss(<8 x float>, <8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.min.ps(<8 x float>, <8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.min.ss(<8 x float>, <8 x float>) nounwind readnone
+
+define internal <8 x float> @__max_varying_float(<8 x float>,
+                                                 <8 x float>) nounwind readonly alwaysinline {
+  %larger = call <8 x float> @llvm.x86.avx.max.ps(<8 x float> %0, <8 x float> %1)  ; lane-wise max
+  ret <8 x float> %larger
+}
+
+define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 8, float, @llvm.x86.avx.max.ss, %0, %1)
+  ret float %ret  ; %ret is not defined in this body; presumably the macro above binds it -- confirm
+}
+
+define internal <8 x float> @__min_varying_float(<8 x float>,
+                                                 <8 x float>) nounwind readonly alwaysinline {
+  %smaller = call <8 x float> @llvm.x86.avx.min.ps(<8 x float> %0, <8 x float> %1)  ; lane-wise min
+  ret <8 x float> %smaller
+}
+
+define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 8, float, @llvm.x86.avx.min.ss, %0, %1)
+  ret float %ret  ; %ret is not defined in this body; presumably the macro above binds it -- confirm
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+declare <8 x i32> @llvm.x86.avx.pminsd(<8 x i32>, <8 x i32>) nounwind readnone
+declare <8 x i32> @llvm.x86.avx.pmaxsd(<8 x i32>, <8 x i32>) nounwind readnone
+
+define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %smaller = call <8 x i32> @llvm.x86.avx.pminsd(<8 x i32> %0, <8 x i32> %1)  ; lane-wise signed min
+  ret <8 x i32> %smaller
+}
+
+define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.pminsd, %0, %1)
+  ret i32 %ret  ; %ret is not defined in this body; presumably the macro above binds it -- confirm
+}
+
+define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  %larger = call <8 x i32> @llvm.x86.avx.pmaxsd(<8 x i32> %0, <8 x i32> %1)  ; lane-wise signed max
+  ret <8 x i32> %larger
+}
+
+define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.pmaxsd, %0, %1)
+  ret i32 %ret  ; %ret is not defined in this body; presumably the macro above binds it -- confirm
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+declare <8 x i32> @llvm.x86.avx.pminud(<8 x i32>, <8 x i32>) nounwind readnone
+declare <8 x i32> @llvm.x86.avx.pmaxud(<8 x i32>, <8 x i32>) nounwind readnone
+
+define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
+                                                <8 x i32>) nounwind readonly alwaysinline {
+  %smaller = call <8 x i32> @llvm.x86.avx.pminud(<8 x i32> %0, <8 x i32> %1)  ; lane-wise unsigned min
+  ret <8 x i32> %smaller
+}
+
+define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.pminud, %0, %1)
+  ret i32 %ret  ; %ret is not defined in this body; presumably the macro above binds it -- confirm
+}
+
+define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
+                                                <8 x i32>) nounwind readonly alwaysinline {
+  %larger = call <8 x i32> @llvm.x86.avx.pmaxud(<8 x i32> %0, <8 x i32> %1)  ; lane-wise unsigned max
+  ret <8 x i32> %larger
+}
+
+define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.pmaxud, %0, %1)
+  ret i32 %ret  ; %ret is not defined in this body; presumably the macro above binds it -- confirm
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+
+define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+  %nbits = call i32 @llvm.ctpop.i32(i32 %0)  ; number of set bits in the argument
+  ret i32 %nbits
+}
+
+declare i32 @llvm.x86.avx.movmsk.ps(<8 x float>) nounwind readnone
+
+define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+  %as_float = bitcast <8 x i32> %0 to <8 x float>
+  %bits = call i32 @llvm.x86.avx.movmsk.ps(<8 x float> %as_float) nounwind readnone
+  ret i32 %bits
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+declare <8 x float> @llvm.x86.avx.hadd.ps(<8 x float>, <8 x float>) nounwind readnone
+
+define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
+  %pairs = call <8 x float> @llvm.x86.avx.hadd.ps(<8 x float> %0, <8 x float> %0)
+  %quads = call <8 x float> @llvm.x86.avx.hadd.ps(<8 x float> %pairs, <8 x float> %pairs)
+  %lo_sum = extractelement <8 x float> %quads, i32 0  ; presumably the sum of lanes 0-3 -- confirm
+  %hi_sum = extractelement <8 x float> %quads, i32 4  ; presumably the sum of lanes 4-7 -- confirm
+  %total = fadd float %lo_sum, %hi_sum
+  ret float %total
+}
+
+
+define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
+  reduce8(float, @__min_varying_float, @__min_uniform_float)
+}  ; no ret is written here, so the macro expansion presumably supplies it -- confirm
+
+
+define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
+  reduce8(float, @__max_varying_float, @__max_uniform_float)
+}  ; no ret is written here, so the macro expansion presumably supplies it -- confirm
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+define internal <8 x i32> @__add_varying_int32(<8 x i32>,
+                                               <8 x i32>) nounwind readnone alwaysinline {
+  %sums = add <8 x i32> %0, %1  ; lane-wise integer addition
+  ret <8 x i32> %sums
+}
+
+define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+  %sum = add i32 %0, %1  ; scalar integer addition
+  ret i32 %sum
+}
+
+define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
+}  ; no ret is written here, so the macro expansion presumably supplies it -- confirm
+
+
+define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
+}  ; no ret is written here, so the macro expansion presumably supplies it -- confirm
+
+
+define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
+}  ; no ret is written here, so the macro expansion presumably supplies it -- confirm
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint32 ops
+
+define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
+  %total = call i32 @__reduce_add_int32(<8 x i32> %v)  ; forwards to the int32 reduction
+  ret i32 %total
+}
+
+define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}  ; no ret is written here, so the macro expansion presumably supplies it -- confirm
+
+
+define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}  ; no ret is written here, so the macro expansion presumably supplies it -- confirm
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+  %maskbits = call i32 @__movmsk(<8 x i32> %mask)
+  %have_lanes = icmp ne i32 %maskbits, 0
+  br i1 %have_lanes, label %do_load, label %all_off
+
+do_load:
+  ; some mask bit is set: read one i32 and place it in every lane
+  %src = bitcast i8 * %0 to i32 *
+  %elt = load i32 * %src
+  %b0 = insertelement <8 x i32> undef, i32 %elt, i32 0
+  %b1 = insertelement <8 x i32> %b0, i32 %elt, i32 1
+  %b2 = insertelement <8 x i32> %b1, i32 %elt, i32 2
+  %b3 = insertelement <8 x i32> %b2, i32 %elt, i32 3
+  %b4 = insertelement <8 x i32> %b3, i32 %elt, i32 4
+  %b5 = insertelement <8 x i32> %b4, i32 %elt, i32 5
+  %b6 = insertelement <8 x i32> %b5, i32 %elt, i32 6
+  %b7 = insertelement <8 x i32> %b6, i32 %elt, i32 7
+  ret <8 x i32> %b7
+
+all_off:
+  ret <8 x i32> undef
+}
+
+
+define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+  ; Returns one i64 loaded from %0 in all 8 lanes; undef, with no load, if no mask bit is set.
+  %mm = call i32 @__movmsk(<8 x i32> %mask)
+  %any_on = icmp ne i32 %mm, 0
+  br i1 %any_on, label %load, label %skip
+load:
+  %ptr = bitcast i8 * %0 to i64 *
+  %val = load i64 * %ptr
+  ; replicate the loaded value into every lane
+  %ret0 = insertelement <8 x i64> undef, i64 %val, i32 0
+  %ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1
+  %ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2
+  %ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3
+  %ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4
+  %ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5
+  %ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6
+  %ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7
+  ret <8 x i64> %ret7  ; the fully populated vector; %ret3 has lanes 4-7 still undef
+
+skip:
+  ret <8 x i64> undef
+}
+
+
+define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+  %maskbits = call i32 @__movmsk(<8 x i32> %mask)
+  %have_lanes = icmp ne i32 %maskbits, 0
+  br i1 %have_lanes, label %do_load, label %all_off
+
+do_load:
+  %src = bitcast i8 * %0 to <8 x i32> *
+  %vec = load <8 x i32> * %src, align 4  ; whole-vector load, only element alignment assumed
+  ret <8 x i32> %vec
+
+all_off:
+  ret <8 x i32> undef
+}
+
+
+define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+  %maskbits = call i32 @__movmsk(<8 x i32> %mask)
+  %have_lanes = icmp ne i32 %maskbits, 0
+  br i1 %have_lanes, label %do_load, label %all_off
+
+do_load:
+  %src = bitcast i8 * %0 to <8 x i64> *
+  %vec = load <8 x i64> * %src, align 8  ; whole-vector load, only element alignment assumed
+  ret <8 x i64> %vec
+
+all_off:
+  ret <8 x i64> undef
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>, 
+                               <8 x i32>) nounwind alwaysinline {
+  per_lane(8, <8 x i32> %2, `
+      ; compute address for this one
+      %ptr_ID = getelementptr <8 x i32> * %0, i32 0, i32 LANE
+      %storeval_ID = extractelement <8 x i32> %1, i32 LANE
+      store i32 %storeval_ID, i32 * %ptr_ID')
+  ret void  ; NOTE(review): presumably the quoted body runs once per lane enabled in mask %2 -- confirm
+}
+
+define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
+                               <8 x i32>) nounwind alwaysinline {
+  per_lane(8, <8 x i32> %2, `
+      %ptr_ID = getelementptr <8 x i64> * %0, i32 0, i32 LANE
+      %storeval_ID = extractelement <8 x i64> %1, i32 LANE
+      store i64 %storeval_ID, i64 * %ptr_ID')
+  ret void  ; NOTE(review): presumably the quoted body runs once per lane enabled in mask %2 -- confirm
+}
+
+
+declare <8 x float> @llvm.x86.avx.blendvps(<8 x float>, <8 x float>,
+                                           <8 x float>) nounwind readnone
+
+
+define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
+                                           <8 x i32>) nounwind alwaysinline {
+  %prev = load <8 x i32>* %0
+  %prev_f = bitcast <8 x i32> %prev to <8 x float>
+  %new_f = bitcast <8 x i32> %1 to <8 x float>
+  %mask_f = bitcast <8 x i32> %2 to <8 x float>
+  %mixed_f = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %prev_f,
+                                                     <8 x float> %new_f,
+                                                     <8 x float> %mask_f)
+  %mixed = bitcast <8 x float> %mixed_f to <8 x i32>
+  store <8 x i32> %mixed, <8 x i32>* %0  ; write the blended vector back over the old contents
+  ret void
+}
+
+
+define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>,
+                                     <8 x i32>) nounwind alwaysinline {
+  ; Always serialize: forward to __masked_store_64 with the same arguments.
+  ; FIXME: should implement the "do two 32-bit masked stores" stuff that
+  ; other targets do...
+  call void @__masked_store_64(<8 x i64>* nocapture %0, <8 x i64> %1, <8 x i32> %2)
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+gen_gather(8, i32)
+gen_gather(8, i64)
+gen_scatter(8, i32)
+gen_scatter(8, i64)
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <4 x double> @llvm.x86.avx.sqrt.pd(<4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.sqrt.sd(<4 x double>) nounwind readnone
+
+define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
+  unary4to8(ret, double, @llvm.x86.avx.sqrt.pd, %0)
+  ret <8 x double> %ret  ; %ret is presumably bound by the macro, which is given a 4-wide intrinsic -- confirm
+}
+
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 4, double, @llvm.x86.avx.sqrt.pd, %0)
+  ret double %ret  ; NOTE(review): packed sqrt.pd is used although sqrt.sd is declared in this file -- intended?
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <4 x double> @llvm.x86.avx.max.pd(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.max.sd(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.sd(<4 x double>, <4 x double>) nounwind readnone
+
+define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+  binary4to8(ret, double, @llvm.x86.avx.min.pd, %0, %1)
+  ret <8 x double> %ret  ; %ret is presumably bound by the macro, which is given a 4-wide intrinsic -- confirm
+}
+
+define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 4, double, @llvm.x86.avx.min.pd, %0, %1)
+  ret double %ret  ; NOTE(review): packed min.pd is used although min.sd is declared in this file -- intended?
+}
+
+define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+  binary4to8(ret, double, @llvm.x86.avx.max.pd, %0, %1)
+  ret <8 x double> %ret  ; %ret is presumably bound by the macro, which is given a 4-wide intrinsic -- confirm
+}
+
+define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 4, double, @llvm.x86.avx.max.pd, %0, %1)
+  ret double %ret  ; NOTE(review): packed max.pd is used although max.sd is declared in this file -- intended?
+}
--- a/stdlib-c.c
+++ b/stdlib-c.c
@@ -0,0 +1,141 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file stdlib-c.c
+    @brief Standard library function implementations written in C.
+
+    This file provides C implementations of various functions that can be
+    called from ispc programs; in other words, this file is *not* linked
+    into the ispc compiler executable, but rather provides functions that
+    can be compiled into ispc programs.
+
+    When the ispc compiler is built, this file is compiled with clang to
+    generate LLVM bitcode.  This bitcode is later linked in to the program
+    being compiled by the DefineStdlib() function.  The first way to access
+    definitions from this file is by asking for them by name from the
+    llvm::Module's symbol table (e.g. as the PrintStmt implementation does
+    with __do_print() below).  Alternatively, if a function defined in this
+    file has a signature that can be mapped back to ispc types by the
+    lLLVMTypeToIspcType() function, then its declaration will be made
+    available to ispc programs at compile time automatically.
+  */
+
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdarg.h>
+
+/* Type through which __do_print() reads boolean values: it dereferences
+   the argument pointers for the 'b' and 'B' type codes as Bool. */
+typedef int Bool;
+
+/* Body of one switch case in __do_print() for a single value: prints the
+   value that ptr points to, read as the given type, with the given printf
+   format, and then breaks out of the switch.  Expects a variable named
+   ptr to be in scope where the macro is used. */
+#define PRINT_SCALAR(fmt, type)  \
+    printf(fmt, *((type *)ptr)); \
+    break
+
+/* Body of one switch case in __do_print() for a vector value: prints the
+   width elements that ptr points to as a bracketed, comma-separated list
+   and then breaks out of the switch.  Elements whose bit is clear in mask
+   are still printed, but wrapped in double parentheses.  Expects ptr,
+   width and mask to be in scope where the macro is used. */
+#define PRINT_VECTOR(fmt, type)                                         \
+    putchar('[');                                                       \
+    for (int lane = 0; lane < width; ++lane) {                          \
+        const int laneIsOff = !(mask & (1<<lane));                      \
+        if (laneIsOff)                                                  \
+            printf("((" fmt "))", ((type *)ptr)[lane]);                 \
+        else                                                            \
+            printf(fmt, ((type *)ptr)[lane]);                           \
+        putchar(lane != width-1 ? ',' : ']');                           \
+    }                                                                   \
+    break
+
+/** Print the values supplied by an ispc print statement.  PrintStmt calls
+    this function, so the signature here must stay in sync with the
+    arguments that PrintStmt::EmitCode() generates.
+
+    Each '%' in the format string consumes the next character of the types
+    string and the next pointer in args; every other character is written
+    out unchanged.  Nothing is printed at all when mask is zero.
+
+    @param format  Print format string
+    @param types   Encoded types of the values being printed, one character
+                   per value.  (See lEncodeType()).
+    @param width   Vector width of the compilation target
+    @param mask    Current lane mask when the print statement is called
+    @param args    Array of pointers to the values to be printed
+ */
+void __do_print(const char *format, const char *types, int width, int mask,
+                void **args) {
+    if (!mask)
+        return;
+
+    int nextArg = 0;
+    for (; *format != '\0'; ++format) {
+        // Anything other than a percent sign is copied straight through.
+        if (*format != '%') {
+            putchar(*format);
+            continue;
+        }
+        // A percent sign with no type code left to match prints nothing.
+        if (*types == '\0')
+            continue;
+
+        // The PRINT_* macros read the value through a variable named ptr.
+        void *ptr = args[nextArg++];
+        const char typeCode = *types++;
+
+        // Pick the cast and printf() format that match the type code;
+        // lower case codes are single values, upper case ones are vectors.
+        switch (typeCode) {
+        case 'b':
+            printf("%s", *((Bool *)ptr) ? "true" : "false");
+            break;
+        case 'B':
+            putchar('[');
+            for (int lane = 0; lane < width; ++lane) {
+                // lanes that are not executing are shown as a blank
+                if (mask & (1<<lane))
+                    printf("%s", ((Bool *)ptr)[lane] ? "true" : "false");
+                else
+                    printf("_________");
+                putchar(lane != width-1 ? ',' : ']');
+            }
+            break;
+        case 'i': PRINT_SCALAR("%d", int);
+        case 'I': PRINT_VECTOR("%d", int);
+        case 'u': PRINT_SCALAR("%u", unsigned int);
+        case 'U': PRINT_VECTOR("%u", unsigned int);
+        case 'f': PRINT_SCALAR("%f", float);
+        case 'F': PRINT_VECTOR("%f", float);
+        case 'l': PRINT_SCALAR("%lld", long long);
+        case 'L': PRINT_VECTOR("%lld", long long);
+        case 'v': PRINT_SCALAR("%llu", unsigned long long);
+        case 'V': PRINT_VECTOR("%llu", unsigned long long);
+        case 'd': PRINT_SCALAR("%f", double);
+        case 'D': PRINT_VECTOR("%f", double);
+        default:
+            printf("UNKNOWN TYPE ");
+            putchar(typeCode);
+        }
+    }
+    fflush(stdout);
+}
--- a/stdlib-sse.ll
+++ b/stdlib-sse.ll
@@ -0,0 +1,441 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; This file declares implementations of various stdlib builtins that
+;; only require SSE version 1 and 2 functionality; this file, in turn
+;; is then included by stdlib-sse2.ll and stdlib-sse4.ll to provide
+;; those definitions for them.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macro expansion with argument 4.  NOTE(review): judging by its name this
+; presumably emits the 8-bit and 16-bit integer helpers for a 4-wide
+; target; the macro is defined outside this file - confirm there.
+int8_16(4)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define internal <4 x float> @__rcp_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
+  ; start from the rcpps result for every lane
+  %est = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %v)
+  ; then improve precision with one Newton-Raphson step:
+  ;   result = est * (2 - v * est)
+  %v_est = fmul <4 x float> %v, %est
+  %correction = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_est
+  %refined = fmul <4 x float> %est, %correction
+  ret <4 x float> %refined
+}
+
+define internal float @__rcp_uniform_float(float %x) nounwind readonly alwaysinline {
+  ; put the scalar in lane 0, run rcpss, and read lane 0 back out
+  %x_vec = insertelement <4 x float> undef, float %x, i32 0
+  %est_vec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %x_vec)
+  %est = extractelement <4 x float> %est_vec, i32 0
+
+  ; one Newton-Raphson step to improve precision:
+  ;   result = est * (2 - x * est)
+  %x_est = fmul float %x, %est
+  %correction = fsub float 2., %x_est
+  %refined = fmul float %est, %correction
+  ret float %refined
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
+  ; start from the rsqrtps result for every lane
+  %est = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
+  ; one Newton-Raphson step to improve precision:
+  ;   result = 0.5 * est * (3 - (v * est) * est)
+  %v_est = fmul <4 x float> %v, %est
+  %v_est_sq = fmul <4 x float> %v_est, %est
+  %three_minus = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_est_sq
+  %scaled = fmul <4 x float> %est, %three_minus
+  %refined = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %scaled
+  ret <4 x float> %refined
+}
+
+define internal float @__rsqrt_uniform_float(float %x) nounwind readonly alwaysinline {
+  ; put the scalar in lane 0, run rsqrtss, and read lane 0 back out
+  %x_vec = insertelement <4 x float> undef, float %x, i32 0
+  %est_vec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %x_vec)
+  %est = extractelement <4 x float> %est_vec, i32 0
+
+  ; one Newton-Raphson step to improve precision:
+  ;   result = 0.5 * est * (3 - (x * est) * est)
+  %x_est = fmul float %x, %est
+  %x_est_sq = fmul float %x_est, %est
+  %three_minus = fsub float 3., %x_est_sq
+  %scaled = fmul float %est, %three_minus
+  %refined = fmul float 0.5, %scaled
+  ret float %refined
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+; Square root of every lane; a direct wrapper around the packed intrinsic.
+define internal <4 x float> @__sqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
+  %root = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %v)
+  ret <4 x float> %root
+}
+
+; Square root of a single float.  The macro invocation routes the scalar
+; %0 through the scalar sqrt.ss intrinsic, which is declared on 4 x float,
+; and produces the scalar %ret (macro body is defined outside this file).
+define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
+  ret float %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fast math mode
+
+declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
+
+; Turns on the DAZ and FTZ bits of the SSE control/status register and
+; leaves every other bit as it was.
+define internal void @__fastmath() nounwind alwaysinline {
+  ; the intrinsics work through memory, so use a stack slot
+  %csr_slot = alloca i32
+  call void @llvm.x86.sse.stmxcsr(i32 * %csr_slot)
+  %csr = load i32 *%csr_slot
+
+  ; DAZ is 64 and FTZ is 32768; together 32832
+  %csr_fast = or i32 %csr, 32832
+  store i32 %csr_fast, i32 *%csr_slot
+  call void @llvm.x86.sse.ldmxcsr(i32 * %csr_slot)
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; svml stuff
+
+; External 4-wide float math routines, called by the wrappers that follow.
+; Every routine that only takes and returns vectors is declared readnone.
+; The sincos routine is the exception: it also receives a pointer, which
+; the wrapper passes in to receive a second result, so it presumably writes
+; memory through that pointer and must not be declared readnone.
+declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind
+declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+
+
+; Forwards the 4 x float input to the external sinf4 routine.
+define internal <4 x float> @__svml_sin(<4 x float> %x) nounwind readnone alwaysinline {
+  %result = call <4 x float> @__svml_sinf4(<4 x float> %x)
+  ret <4 x float> %result
+}
+
+; Forwards the 4 x float input to the external cosf4 routine.
+define internal <4 x float> @__svml_cos(<4 x float> %x) nounwind readnone alwaysinline {
+  %result = call <4 x float> @__svml_cosf4(<4 x float> %x)
+  ret <4 x float> %result
+}
+
+; Calls the external sincosf4 routine on %0.  The vector it returns is
+; stored to %1; %2 is handed to the routine itself, which presumably
+; writes its second result through it (confirm against the library docs).
+; This function returns void and communicates only through memory, so it
+; must not carry the readnone attribute: with readnone the optimizer is
+; entitled to treat calls to it as having no effect and delete them.
+define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind alwaysinline {
+  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
+  store <4 x float> %s, <4 x float> * %1
+  ret void
+}
+
+; Forwards the 4 x float input to the external tanf4 routine.
+define internal <4 x float> @__svml_tan(<4 x float> %x) nounwind readnone alwaysinline {
+  %result = call <4 x float> @__svml_tanf4(<4 x float> %x)
+  ret <4 x float> %result
+}
+
+; Forwards the 4 x float input to the external atanf4 routine.
+define internal <4 x float> @__svml_atan(<4 x float> %x) nounwind readnone alwaysinline {
+  %result = call <4 x float> @__svml_atanf4(<4 x float> %x)
+  ret <4 x float> %result
+}
+
+; Forwards both 4 x float inputs, in the same order, to the external
+; atan2f4 routine.
+define internal <4 x float> @__svml_atan2(<4 x float> %a, <4 x float> %b) nounwind readnone alwaysinline {
+  %result = call <4 x float> @__svml_atan2f4(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %result
+}
+
+; Forwards the 4 x float input to the external expf4 routine.
+define internal <4 x float> @__svml_exp(<4 x float> %x) nounwind readnone alwaysinline {
+  %result = call <4 x float> @__svml_expf4(<4 x float> %x)
+  ret <4 x float> %result
+}
+
+; Forwards the 4 x float input to the external logf4 routine.
+define internal <4 x float> @__svml_log(<4 x float> %x) nounwind readnone alwaysinline {
+  %result = call <4 x float> @__svml_logf4(<4 x float> %x)
+  ret <4 x float> %result
+}
+
+; Forwards both 4 x float inputs, in the same order, to the external
+; powf4 routine.
+define internal <4 x float> @__svml_pow(<4 x float> %a, <4 x float> %b) nounwind readnone alwaysinline {
+  %result = call <4 x float> @__svml_powf4(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %result
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+; Lane-wise maximum; a direct wrapper around the packed max intrinsic.
+define internal <4 x float> @__max_varying_float(<4 x float> %a, <4 x float> %b) nounwind readonly alwaysinline {
+  %larger = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %larger
+}
+
+; Maximum of two scalar floats.  The macro invocation routes %0 and %1
+; through the scalar max.ss intrinsic, which is declared on 4 x float, and
+; produces the scalar %ret (macro body is defined outside this file).
+define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
+  ret float %ret
+}
+
+; Lane-wise minimum; a direct wrapper around the packed min intrinsic.
+define internal <4 x float> @__min_varying_float(<4 x float> %a, <4 x float> %b) nounwind readonly alwaysinline {
+  %smaller = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %smaller
+}
+
+; Minimum of two scalar floats.  The macro invocation routes %0 and %1
+; through the scalar min.ss intrinsic, which is declared on 4 x float, and
+; produces the scalar %ret (macro body is defined outside this file).
+define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
+  ret float %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+; Returns the result of the movmskps intrinsic for a 4 x i32 mask.
+define internal i32 @__movmsk(<4 x i32> %mask) nounwind readnone alwaysinline {
+  ; the intrinsic is declared on floats, so reinterpret the lanes first
+  %as_float = bitcast <4 x i32> %mask to <4 x float>
+  %bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %as_float) nounwind readnone
+  ret i32 %bits
+}
+
+; Reduces the 4 x float input to a single float.  The whole body, including
+; the return, comes from a macro defined outside this file; it is handed
+; the varying and uniform float min helpers, so presumably the result is
+; the minimum over all four lanes - confirm against the macro.
+define internal float @__reduce_min_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__min_varying_float, @__min_uniform_float)
+}
+
+; Reduces the 4 x float input to a single float.  The whole body, including
+; the return, comes from a macro defined outside this file; it is handed
+; the varying and uniform float max helpers, so presumably the result is
+; the maximum over all four lanes - confirm against the macro.
+define internal float @__reduce_max_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__max_varying_float, @__max_uniform_float)
+}
+
+; Sum of the four lanes of %v.
+define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
+  ; move lanes 2 and 3 into positions 0 and 1, then add to the original so
+  ; that position 0 holds v0+v2 and position 1 holds v1+v3
+  %upper = shufflevector <4 x i32> %v, <4 x i32> undef,
+                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %pairs = add <4 x i32> %upper, %v
+  ; the final scalar add combines the two partial sums
+  %sum02 = extractelement <4 x i32> %pairs, i32 0
+  %sum13 = extractelement <4 x i32> %pairs, i32 1
+  %total = add i32 %sum02, %sum13
+  ret i32 %total
+}
+
+; Reduces the 4 x i32 input to a single i32.  The whole body, including the
+; return, comes from a macro defined outside this file; it is handed the
+; signed int32 min helpers, so presumably the result is the signed minimum
+; over all four lanes - confirm against the macro.
+define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+; Reduces the 4 x i32 input to a single i32.  The whole body, including the
+; return, comes from a macro defined outside this file; it is handed the
+; signed int32 max helpers, so presumably the result is the signed maximum
+; over all four lanes - confirm against the macro.
+define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+; Unsigned lane sum; simply forwards to the int32 version, which uses the
+; plain integer add instruction.
+define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
+  %total = call i32 @__reduce_add_int32(<4 x i32> %v)
+  ret i32 %total
+}
+
+; Reduces the 4 x i32 input to a single i32.  The whole body, including the
+; return, comes from a macro defined outside this file; it is handed the
+; unsigned int32 min helpers, so presumably the result is the unsigned
+; minimum over all four lanes - confirm against the macro.
+define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+; Reduces the 4 x i32 input to a single i32.  The whole body, including the
+; return, comes from a macro defined outside this file; it is handed the
+; unsigned int32 max helpers, so presumably the result is the unsigned
+; maximum over all four lanes - confirm against the macro.
+define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
+ }
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+; Stores elements of the 4 x i32 value %1 into the vector that %0 points
+; to, under control of the mask %2.  The quoted code stores one element:
+; it forms the address of that element, extracts the matching lane of %1
+; and writes it.  NOTE(review): presumably the surrounding macro runs the
+; quoted code only for lanes whose mask is on, substituting the lane
+; number; the macro is defined outside this file - confirm there.
+define void @__masked_store_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32>) nounwind alwaysinline {
+  per_lane(4, <4 x i32> %2, `
+      ; compute address for this one
+      %ptr_ID = getelementptr <4 x i32> * %0, i32 0, i32 LANE
+      %storeval_ID = extractelement <4 x i32> %1, i32 LANE
+      store i32 %storeval_ID, i32 * %ptr_ID')
+  ret void
+}
+
+; Stores elements of the 4 x i64 value %1 into the vector that %0 points
+; to, under control of the 4 x i32 mask %2.  The quoted code stores one
+; element: it forms the address of that element, extracts the matching
+; lane of %1 and writes it.  NOTE(review): presumably the surrounding
+; macro runs the quoted code only for lanes whose mask is on, substituting
+; the lane number; the macro is defined outside this file - confirm there.
+define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounwind alwaysinline {
+  per_lane(4, <4 x i32> %2, `
+      %ptr_ID = getelementptr <4 x i64> * %0, i32 0, i32 LANE
+      %storeval_ID = extractelement <4 x i64> %1, i32 LANE
+      store i64 %storeval_ID, i64 * %ptr_ID')
+  ret void
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+; Loads one i32 from the given address and returns it in all four lanes;
+; returns undef without touching memory when no mask lane is on.
+define <4 x i32> @__load_and_broadcast_32(i8 * %addr, <4 x i32> %mask) nounwind alwaysinline {
+  ; the address may be invalid when the mask is all off, so check first
+  %mask_bits = call i32 @__movmsk(<4 x i32> %mask)
+  %have_lane = icmp ne i32 %mask_bits, 0
+  br i1 %have_lane, label %do_load, label %all_off
+
+do_load:
+  %src = bitcast i8 * %addr to i32 *
+  %elt = load i32 * %src
+
+  ; replicate the loaded element into every lane
+  %b0 = insertelement <4 x i32> undef, i32 %elt, i32 0
+  %b1 = insertelement <4 x i32> %b0, i32 %elt, i32 1
+  %b2 = insertelement <4 x i32> %b1, i32 %elt, i32 2
+  %b3 = insertelement <4 x i32> %b2, i32 %elt, i32 3
+  ret <4 x i32> %b3
+
+all_off:
+  ret <4 x i32> undef
+}
+
+; Loads one i64 from the given address and returns it in all four lanes;
+; returns undef without touching memory when no mask lane is on.
+define <4 x i64> @__load_and_broadcast_64(i8 * %addr, <4 x i32> %mask) nounwind alwaysinline {
+  ; the address may be invalid when the mask is all off, so check first
+  %mask_bits = call i32 @__movmsk(<4 x i32> %mask)
+  %have_lane = icmp ne i32 %mask_bits, 0
+  br i1 %have_lane, label %do_load, label %all_off
+
+do_load:
+  %src = bitcast i8 * %addr to i64 *
+  %elt = load i64 * %src
+
+  ; replicate the loaded element into every lane
+  %b0 = insertelement <4 x i64> undef, i64 %elt, i32 0
+  %b1 = insertelement <4 x i64> %b0, i64 %elt, i32 1
+  %b2 = insertelement <4 x i64> %b1, i64 %elt, i32 2
+  %b3 = insertelement <4 x i64> %b2, i64 %elt, i32 3
+  ret <4 x i64> %b3
+
+all_off:
+  ret <4 x i64> undef
+}
+
+; Loads a full 4 x i32 vector from the given address if any mask lane is
+; on; returns undef without touching memory otherwise.
+define <4 x i32> @__load_masked_32(i8 * %addr, <4 x i32> %mask) nounwind alwaysinline {
+  %mask_bits = call i32 @__movmsk(<4 x i32> %mask)
+  %have_lane = icmp ne i32 %mask_bits, 0
+  br i1 %have_lane, label %do_load, label %all_off
+
+do_load:
+  ; At least one lane is on, so all four values are read in one go.
+  ; FIXME: there is a lurking bug here if the vector straddles a page
+  ; boundary and the next page is invalid to read, even though the mask
+  ; says that the elements on that page are not supposed to be read...
+  %vec_ptr = bitcast i8 * %addr to <4 x i32> *
+  %vec = load <4 x i32> * %vec_ptr, align 4
+  ret <4 x i32> %vec
+
+all_off:
+  ret <4 x i32> undef
+}
+
+; Loads a full 4 x i64 vector from the given address if any mask lane is
+; on; returns undef without touching memory otherwise.
+define <4 x i64> @__load_masked_64(i8 * %addr, <4 x i32> %mask) nounwind alwaysinline {
+  %mask_bits = call i32 @__movmsk(<4 x i32> %mask)
+  %have_lane = icmp ne i32 %mask_bits, 0
+  br i1 %have_lane, label %do_load, label %all_off
+
+do_load:
+  ; At least one lane is on, so all four values are read in one go.
+  ; FIXME: there is a lurking bug here if the vector straddles a page
+  ; boundary and the next page is invalid to read, even though the mask
+  ; says that the elements on that page are not supposed to be read...
+  %vec_ptr = bitcast i8 * %addr to <4 x i64> *
+  %vec = load <4 x i64> * %vec_ptr, align 8
+  ret <4 x i64> %vec
+
+all_off:
+  ret <4 x i64> undef
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; define these with the macros from stdlib.m4: one gather and one scatter
+; each for 32-bit and 64-bit elements, with a first argument of 4
+; (presumably the vector width - confirm against the macro definitions)
+
+gen_gather(4, i32)
+gen_gather(4, i64)
+gen_scatter(4, i32)
+gen_scatter(4, i64)
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+; Square root of a 4 x double vector.  The macro invocation is handed the
+; 2-wide packed sqrt intrinsic and the input %0, and it produces %ret.
+; NOTE(review): presumably the macro applies the intrinsic to each
+; 2-element half of the input; its body is not visible here - confirm.
+define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
+  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ret <4 x double> %ret
+}
+
+
+; Square root of a single double.  The scalar sqrt.sd intrinsic is used
+; here, mirroring how __sqrt_uniform_float uses sqrt.ss.  The packed
+; sqrt.pd form would also operate on the second lane of the temporary
+; vector, which presumably holds no meaningful value (the macro body is
+; defined outside this file - confirm).
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+; Lane-wise minimum of two 4 x double vectors.  The macro invocation gets
+; the 2-wide packed min intrinsic plus both inputs and produces %ret.
+; NOTE(review): presumably the inputs are processed as two 2-element
+; halves; confirm against the macro definition outside this file.
+define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+  ret <4 x double> %ret
+}
+
+
+; Minimum of two scalar doubles.  The scalar min.sd intrinsic is used
+; here, mirroring how __min_uniform_float uses min.ss.  The packed min.pd
+; form would also operate on the second lane of the temporary vectors,
+; which presumably holds no meaningful value (the macro body is defined
+; outside this file - confirm).
+define internal double @__min_uniform_double(double, double) nounwind readnone {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret
+}
+
+
+; Lane-wise maximum of two 4 x double vectors.  The macro invocation gets
+; the 2-wide packed max intrinsic plus both inputs and produces %ret.
+; NOTE(review): presumably the inputs are processed as two 2-element
+; halves; confirm against the macro definition outside this file.
+define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+  ret <4 x double> %ret
+}
+
+
+; Maximum of two scalar doubles.  The scalar max.sd intrinsic is used
+; here, mirroring how __max_uniform_float uses max.ss.  The packed max.pd
+; form would also operate on the second lane of the temporary vectors,
+; which presumably holds no meaningful value (the macro body is defined
+; outside this file - confirm).
+define internal double @__max_uniform_double(double, double) nounwind readnone {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret
+}
--- a/stdlib-sse2.ll
+++ b/stdlib-sse2.ll
@@ -0,0 +1,328 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Define the standard library builtins for the SSE2 target
+
+; Define some basics for a 4-wide target
+stdlib_core(4)
+packed_load_and_store(4)
+
+; Include the various definitions of things that only require SSE1 and SSE2
+include(`stdlib-sse.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding
+;;
+;; There are not any rounding instructions in SSE2, so we have to emulate
+;; the functionality with multiple instructions...
+
+; The code for __round_* is the result of compiling the following source
+; code.
+;
+; export float Round(float x) {
+;    unsigned int sign = signbits(x);
+;    unsigned int ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    x += 0x1.0p23f;
+;    x -= 0x1.0p23f;
+;    ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    return x;
+;}
+
+define internal <4 x float> @__round_varying_float(<4 x float> %x) nounwind readonly alwaysinline {
+  ; split off the sign bit (0x80000000) and work on the magnitude
+  %x_bits = bitcast <4 x float> %x to <4 x i32>
+  %sign = and <4 x i32> %x_bits, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+  %mag_bits = xor <4 x i32> %x_bits, %sign
+  %mag = bitcast <4 x i32> %mag_bits to <4 x float>
+  ; add and then subtract 2^23 (8388608), as in the Round() source above
+  %biased = fadd <4 x float> %mag, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
+  %mag_rounded = fadd <4 x float> %biased, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
+  ; put the original sign bit back
+  %mag_rounded_bits = bitcast <4 x float> %mag_rounded to <4 x i32>
+  %rounded_bits = xor <4 x i32> %mag_rounded_bits, %sign
+  %rounded = bitcast <4 x i32> %rounded_bits to <4 x float>
+  ret <4 x float> %rounded
+}
+
+define internal float @__round_uniform_float(float %x) nounwind readonly alwaysinline {
+  ; split off the sign bit (0x80000000) and work on the magnitude
+  %x_bits = bitcast float %x to i32
+  %sign = and i32 %x_bits, -2147483648
+  %mag_bits = xor i32 %sign, %x_bits
+  %mag = bitcast i32 %mag_bits to float
+  ; add and then subtract 2^23 (8388608), as in the Round() source above
+  %biased = fadd float %mag, 8.388608e+06
+  %mag_rounded = fadd float %biased, -8.388608e+06
+  ; put the original sign bit back
+  %mag_rounded_bits = bitcast float %mag_rounded to i32
+  %rounded_bits = xor i32 %mag_rounded_bits, %sign
+  %rounded = bitcast i32 %rounded_bits to float
+  ret float %rounded
+}
+
+;; Similarly, for implementations of the __floor* functions below, we have the
+;; bitcode from compiling the following source code...
+
+;export float Floor(float x) {
+;    float y = Round(x);
+;    unsigned int cmp = y > x ? 0xffffffff : 0;
+;    float delta = -1.f;
+;    unsigned int idelta = intbits(delta);
+;    idelta &= cmp;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+define internal <4 x float> @__floor_varying_float(<4 x float> %x) nounwind readonly alwaysinline {
+  %rounded = tail call <4 x float> @__round_varying_float(<4 x float> %x) nounwind
+  ; lanes where rounding went above the input need to come down by one
+  %went_up = fcmp ogt <4 x float> %rounded, %x
+  %went_up_mask = sext <4 x i1> %went_up to <4 x i32>
+  ; -1082130432 is the bit pattern of -1.0f; other lanes get 0.0f
+  %delta_bits = and <4 x i32> %went_up_mask, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
+  %delta = bitcast <4 x i32> %delta_bits to <4 x float>
+  %floored = fadd <4 x float> %rounded, %delta
+  ret <4 x float> %floored
+}
+
+define internal float @__floor_uniform_float(float %x) nounwind readonly alwaysinline {
+  %rounded = tail call float @__round_uniform_float(float %x) nounwind
+  ; if rounding went above the input, the result must come down by one
+  %went_up = fcmp ogt float %rounded, %x
+  %went_up_mask = sext i1 %went_up to i32
+  ; -1082130432 is the bit pattern of -1.0f; otherwise the delta is 0.0f
+  %delta_bits = and i32 %went_up_mask, -1082130432
+  %delta = bitcast i32 %delta_bits to float
+  %floored = fadd float %rounded, %delta
+  ret float %floored
+}
+
+;; And here is the code we compiled to get the __ceil* functions below
+;
+;export uniform float Ceil(uniform float x) {
+;    uniform float y = Round(x);
+;    uniform int yltx = y < x ? 0xffffffff : 0;
+;    uniform float delta = 1.f;
+;    uniform int idelta = intbits(delta);
+;    idelta &= yltx;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+define internal <4 x float> @__ceil_varying_float(<4 x float> %x) nounwind readonly alwaysinline {
+  %rounded = tail call <4 x float> @__round_varying_float(<4 x float> %x) nounwind
+  ; lanes where rounding went below the input need to go up by one
+  %went_down = fcmp olt <4 x float> %rounded, %x
+  %went_down_mask = sext <4 x i1> %went_down to <4 x i32>
+  ; 1065353216 is the bit pattern of 1.0f; other lanes get 0.0f
+  %delta_bits = and <4 x i32> %went_down_mask, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
+  %delta = bitcast <4 x i32> %delta_bits to <4 x float>
+  %ceiled = fadd <4 x float> %rounded, %delta
+  ret <4 x float> %ceiled
+}
+
+define internal float @__ceil_uniform_float(float %x) nounwind readonly alwaysinline {
+  %rounded = tail call float @__round_uniform_float(float %x) nounwind
+  ; if rounding went below the input, the result must go up by one
+  %went_down = fcmp olt float %rounded, %x
+  %went_down_mask = sext i1 %went_down to i32
+  ; 1065353216 is the bit pattern of 1.0f; otherwise the delta is 0.0f
+  %delta_bits = and i32 %went_down_mask, 1065353216
+  %delta = bitcast i32 %delta_bits to float
+  %ceiled = fadd float %rounded, %delta
+  ret float %ceiled
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; min/max
+
+; There is no blend instruction with SSE2, so we simulate it with bit
+; operations on i32s.  For these two vselect functions, for each
+; vector element, if the mask is on, we return the corresponding value
+; from %1, and otherwise return the value from %0.
+
+define internal <4 x i32> @__vselect_i32(<4 x i32> %if_off, <4 x i32> %if_on,
+                                         <4 x i32> %mask) nounwind readnone alwaysinline {
+  ; keep the bits of the first value where the mask is clear...
+  %inv_mask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %from_off = and <4 x i32> %if_off, %inv_mask
+  ; ...and the bits of the second value where the mask is set
+  %from_on = and <4 x i32> %if_on, %mask
+  %blended = or <4 x i32> %from_off, %from_on
+  ret <4 x i32> %blended
+}
+
+define internal <4 x float> @__vselect_float(<4 x float> %if_off, <4 x float> %if_on,
+                                             <4 x i32> %mask) nounwind readnone alwaysinline {
+  ; reinterpret both inputs as integers, reuse the i32 select, and
+  ; reinterpret the blended bits as floats again
+  %off_bits = bitcast <4 x float> %if_off to <4 x i32>
+  %on_bits = bitcast <4 x float> %if_on to <4 x i32>
+  %blended_bits = call <4 x i32> @__vselect_i32(<4 x i32> %off_bits, <4 x i32> %on_bits, <4 x i32> %mask)
+  %blended = bitcast <4 x i32> %blended_bits to <4 x float>
+  ret <4 x float> %blended
+}
+
+
+; To do vector integer min and max, we do the vector compare and then sign
+; extend the i1 vector result to an i32 mask.  The __vselect does the
+; rest...
+
+define internal <4 x i32> @__min_varying_int32(<4 x i32> %a, <4 x i32> %b) nounwind readonly alwaysinline {
+  ; lanes where a < b (signed) become all ones, the others all zeros
+  %a_less = icmp slt <4 x i32> %a, %b
+  %pick_a = sext <4 x i1> %a_less to <4 x i32>
+  ; take a where the mask is on and b everywhere else
+  %smaller = call <4 x i32> @__vselect_i32(<4 x i32> %b, <4 x i32> %a, <4 x i32> %pick_a)
+  ret <4 x i32> %smaller
+}
+
+; Signed minimum of two scalars.
+define internal i32 @__min_uniform_int32(i32 %a, i32 %b) nounwind readonly alwaysinline {
+  %a_less = icmp slt i32 %a, %b
+  %smaller = select i1 %a_less, i32 %a, i32 %b
+  ret i32 %smaller
+}
+
+define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %is_greater = icmp sgt <4 x i32> %0, %1                ; signed: lane of %0 is the larger one
+  %take_first = sext <4 x i1> %is_greater to <4 x i32>   ; all-ones lane where %0 wins
+  %greatest = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %take_first)
+  ret <4 x i32> %greatest
+}
+
+define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  %first_larger = icmp sgt i32 %0, %1                    ; signed comparison
+  %greatest = select i1 %first_larger, i32 %0, i32 %1
+  ret i32 %greatest
+}
+
+; The functions for unsigned ints are similar, just with unsigned
+; comparison functions...
+
+define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %is_below = icmp ult <4 x i32> %0, %1                  ; unsigned: lane of %0 is the smaller one
+  %take_first = sext <4 x i1> %is_below to <4 x i32>     ; all-ones lane where %0 wins
+  %least = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %take_first)
+  ret <4 x i32> %least
+}
+
+define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  %first_smaller = icmp ult i32 %0, %1                   ; unsigned comparison
+  %least = select i1 %first_smaller, i32 %0, i32 %1
+  ret i32 %least
+}
+
+define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %is_above = icmp ugt <4 x i32> %0, %1                  ; unsigned: lane of %0 is the larger one
+  %take_first = sext <4 x i1> %is_above to <4 x i32>     ; all-ones lane where %0 wins
+  %greatest = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %take_first)
+  ret <4 x i32> %greatest
+}
+
+define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  %first_larger = icmp ugt i32 %0, %1                    ; unsigned comparison
+  %greatest = select i1 %first_larger, i32 %0, i32 %1
+  ret i32 %greatest
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+; FIXME: this is very inefficient: it tests one bit per iteration (up to 32 iterations)...
+
+define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+start:
+  br label %each_bit
+
+each_bit:
+  %bits_left = phi i32 [ %0, %start ], [ %shifted, %each_bit ]
+  %total = phi i32 [ 0, %start ], [ %total_next, %each_bit ]
+  %low_bit = and i32 %bits_left, 1                       ; 1 when the lowest remaining bit is set
+  %total_next = add i32 %total, %low_bit
+  %shifted = lshr i32 %bits_left, 1
+  %empty = icmp eq i32 %shifted, 0                       ; stop once nothing is left to test
+  br i1 %empty, label %finished, label %each_bit
+
+finished:
+  ret i32 %total_next
+}
+
+
+define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
+  %upper = shufflevector <4 x float> %v, <4 x float> undef,
+                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %pairs = fadd <4 x float> %upper, %v                   ; lanes 0 and 1 hold v2+v0 and v3+v1
+  %first = extractelement <4 x float> %pairs, i32 0
+  %second = extractelement <4 x float> %pairs, i32 1
+  %total = fadd float %first, %second
+  ret float %total
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
+                                     <4 x i32> %mask) nounwind alwaysinline {
+  %current = load <4 x i32> * %0                         ; existing contents of the destination
+  %merged = call <4 x i32> @__vselect_i32(<4 x i32> %current, <4 x i32> %1, <4 x i32> %mask)
+  store <4 x i32> %merged, <4 x i32> * %0
+  ret void
+}
+
+define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
+                                     <4 x i32> %mask) nounwind alwaysinline {
+  %current = load <4 x i64>* %ptr
+
+  ; The 4x64-bit blend is carried out as two 128-bit blends.  Each half is
+  ; a <2 x i64> viewed as <4 x float> and handed to __vselect_float.
+  ;
+  ; lower half: 64-bit elements 0 and 1
+  %cur_lo  = shufflevector <4 x i64> %current, <4 x i64> undef,
+                           <2 x i32> <i32 0, i32 1>
+  %cur_lo_f = bitcast <2 x i64> %cur_lo to <4 x float>
+  %new_lo  = shufflevector <4 x i64> %new, <4 x i64> undef,
+                           <2 x i32> <i32 0, i32 1>
+  %new_lo_f = bitcast <2 x i64> %new_lo to <4 x float>
+  ; mask lanes 0 and 1 are each duplicated to cover both 32-bit halves of an i64
+  %mask_lo = shufflevector <4 x i32> %mask, <4 x i32> undef,
+                           <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  ; take the new value where the mask is set and the current one elsewhere
+  %res_lo_f = call <4 x float> @__vselect_float(<4 x float> %cur_lo_f, <4 x float> %new_lo_f, <4 x i32> %mask_lo)
+  %res_lo = bitcast <4 x float> %res_lo_f to <2 x i64>
+
+  ; upper half: 64-bit elements 2 and 3
+  %cur_hi  = shufflevector <4 x i64> %current, <4 x i64> undef,
+                           <2 x i32> <i32 2, i32 3>
+  %cur_hi_f = bitcast <2 x i64> %cur_hi to <4 x float>
+  %new_hi  = shufflevector <4 x i64> %new, <4 x i64> undef,
+                           <2 x i32> <i32 2, i32 3>
+  %new_hi_f = bitcast <2 x i64> %new_hi to <4 x float>
+  ; mask lanes 2 and 3 are duplicated in the same way
+  %mask_hi = shufflevector <4 x i32> %mask, <4 x i32> undef,
+                           <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  ; same selection for the upper half
+  %res_hi_f = call <4 x float> @__vselect_float(<4 x float> %cur_hi_f, <4 x float> %new_hi_f, <4 x i32> %mask_hi)
+  %res_hi = bitcast <4 x float> %res_hi_f to <2 x i64>
+
+  ; join the two halves back into one <4 x i64> and write it out
+  %joined = shufflevector <2 x i64> %res_lo, <2 x i64> %res_hi,
+                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i64> %joined, <4 x i64> * %ptr
+  ret void
+}
+
--- a/stdlib-sse4.ll
+++ b/stdlib-sse4.ll
@@ -0,0 +1,248 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Define common 4-wide stuff
+stdlib_core(4)
+packed_load_and_store(4)
+
+; Define the stuff that can be done with base SSE1/SSE2 instructions
+include(`stdlib-sse.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; math
+
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps immediate 8 = 0b1000 (no precision exceptions signalled) | 0b00 (round to nearest)
+  %rounded = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
+  ret <4 x float> %rounded
+}
+
+define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; roundss immediate 8 = 0b1000 (no precision exceptions signalled) | 0b00 (round to nearest)
+  ; The roundss intrinsic takes two vector operands.  Per its documentation:
+  ;
+  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
+  ;
+  ;  The lowest 32 bits of the result are the rounded value of b0, and the
+  ;  upper 96 bits are copied unchanged from a.  Element by element:
+  ;
+  ;  r0 = RND(b0)
+  ;  r1 = a1
+  ;  r2 = a2
+  ;  r3 = a3
+  ;
+  ;  Only r0 is extracted below, so the value passed as a is irrelevant and
+  ;  the same vector is supplied for both operands.  Likewise only element
+  ;  0 of b is ever set; the remaining elements are left undef.
+  %vec = insertelement <4 x float> undef, float %0, i32 0
+  %rounded = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %vec, <4 x float> %vec, i32 8)
+  %lane0 = extractelement <4 x float> %rounded, i32 0
+  ret float %lane0
+}
+
+define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps immediate 9 = 0b1000 (no precision exceptions signalled) | 0b01 (round down)
+  %floored = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
+  ret <4 x float> %floored
+}
+
+define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  ; the same vector is passed for both roundss operands; only element 0 of the result is used
+  %vec = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss immediate 9 = 0b1000 (no precision exceptions signalled) | 0b01 (round down)
+  %floored = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %vec, <4 x float> %vec, i32 9)
+  %lane0 = extractelement <4 x float> %floored, i32 0
+  ret float %lane0
+}
+
+define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps immediate 10 = 0b1000 (no precision exceptions signalled) | 0b10 (round up)
+  %ceiled = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
+  ret <4 x float> %ceiled
+}
+
+define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  ; the same vector is passed for both roundss operands; only element 0 of the result is used
+  %vec = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss immediate 10 = 0b1000 (no precision exceptions signalled) | 0b10 (round up)
+  %ceiled = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %vec, <4 x float> %vec, i32 10)
+  %lane0 = extractelement <4 x float> %ceiled, i32 0
+  ret float %lane0
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; integer min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %least = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)   ; signed minimum per element
+  ret <4 x i32> %least
+}
+
+define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(least, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret i32 %least
+}
+
+define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %greatest = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)   ; signed maximum per element
+  ret <4 x i32> %greatest
+}
+
+define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(greatest, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret i32 %greatest
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unsigned int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %least = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)   ; unsigned minimum per element
+  ret <4 x i32> %least
+}
+
+define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(least, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret i32 %least
+}
+
+define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %greatest = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)   ; unsigned maximum per element
+  ret <4 x i32> %greatest
+}
+
+define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(greatest, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret i32 %greatest
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+
+define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+  %set_bits = call i32 @llvm.ctpop.i32(i32 %0)           ; number of one bits in the argument
+  ret i32 %set_bits
+}
+
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
+  %pairs = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)           ; sums of adjacent pairs
+  %total = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %pairs, <4 x float> %pairs)   ; sum of all four elements
+  %lane0 = extractelement <4 x float> %total, i32 0
+  ret float %lane0
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
+                                             <4 x float>) nounwind readnone
+
+
+define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
+                                     <4 x i32> %mask) nounwind alwaysinline {
+  %current = load <4 x i32>* %0                          ; existing contents of the destination
+  %current_f = bitcast <4 x i32> %current to <4 x float>
+  %incoming_f = bitcast <4 x i32> %1 to <4 x float>
+  %mask_f = bitcast <4 x i32> %mask to <4 x float>       ; blendvps takes its selector as floats
+  %merged_f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %current_f,
+                                                        <4 x float> %incoming_f,
+                                                        <4 x float> %mask_f)
+  %merged = bitcast <4 x float> %merged_f to <4 x i32>
+  store <4 x i32> %merged, <4 x i32>* %0
+  ret void
+}
+
+
+define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
+                                     <4 x i32> %i32mask) nounwind alwaysinline {
+  %current = load <4 x i64>* %ptr
+  %fmask = bitcast <4 x i32> %i32mask to <4 x float>
+
+  ; The 4x64-bit blend is carried out as two 128-bit blendvps operations.
+  ; Each half is a <2 x i64> viewed as <4 x float>.
+  ;
+  ; lower half: 64-bit elements 0 and 1
+  %cur_lo  = shufflevector <4 x i64> %current, <4 x i64> undef,
+                           <2 x i32> <i32 0, i32 1>
+  %cur_lo_f = bitcast <2 x i64> %cur_lo to <4 x float>
+  %new_lo  = shufflevector <4 x i64> %new, <4 x i64> undef,
+                           <2 x i32> <i32 0, i32 1>
+  %new_lo_f = bitcast <2 x i64> %new_lo to <4 x float>
+  ; mask lanes 0 and 1 are each duplicated to cover both 32-bit halves of an i64
+  %mask_lo = shufflevector <4 x float> %fmask, <4 x float> undef,
+                           <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  ; blend the current and new values of the lower half
+  %res_lo_f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %cur_lo_f,
+                                                        <4 x float> %new_lo_f,
+                                                        <4 x float> %mask_lo)
+  %res_lo = bitcast <4 x float> %res_lo_f to <2 x i64>
+
+  ; upper half: 64-bit elements 2 and 3
+  %cur_hi  = shufflevector <4 x i64> %current, <4 x i64> undef,
+                           <2 x i32> <i32 2, i32 3>
+  %cur_hi_f = bitcast <2 x i64> %cur_hi to <4 x float>
+  %new_hi  = shufflevector <4 x i64> %new, <4 x i64> undef,
+                           <2 x i32> <i32 2, i32 3>
+  %new_hi_f = bitcast <2 x i64> %new_hi to <4 x float>
+  ; mask lanes 2 and 3 are duplicated in the same way
+  %mask_hi = shufflevector <4 x float> %fmask, <4 x float> undef,
+                           <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  ; blend the current and new values of the upper half
+  %res_hi_f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %cur_hi_f,
+                                                        <4 x float> %new_hi_f,
+                                                        <4 x float> %mask_hi)
+  %res_hi = bitcast <4 x float> %res_hi_f to <2 x i64>
+
+  ; join the two halves back into one <4 x i64> and write it out
+  %joined = shufflevector <2 x i64> %res_lo, <2 x i64> %res_hi,
+                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i64> %joined, <4 x i64> * %ptr
+  ret void
+}
--- a/stdlib-sse4x2.ll
+++ b/stdlib-sse4x2.ll
@@ -0,0 +1,703 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+
+;; This file defines the target for "double-pumped" SSE4, i.e. running
+;; with 8-wide vectors
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; standard 8-wide definitions from m4 macros
+
+stdlib_core(8)
+packed_load_and_store(8)
+int8_16(8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; Pseudocode:  iv = approximate 1/v;  result = iv * (2 - v * iv)
+  ; The 4-wide rcp.ps intrinsic supplies the initial approximation.
+
+  unary4to8(approx, float, @llvm.x86.sse.rcp.ps, %0)
+  ; one Newton-Raphson refinement step
+  %v_times = fmul <8 x float> %0, %approx
+  %correction = fsub <8 x float> <float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.>, %v_times
+  %refined = fmul <8 x float> %approx, %correction
+  ret <8 x float> %refined
+}
+
+define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+  ; Pseudocode:  iv = approximate 1/v, read from element 0 of the rcp.ss result
+  ;              result = iv * (2 - v * iv)
+  %in_vec = insertelement <4 x float> undef, float %0, i32 0
+  %approx_vec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %in_vec)
+  %approx = extractelement <4 x float> %approx_vec, i32 0
+
+  ; one Newton-Raphson refinement step
+  %v_times = fmul float %0, %approx
+  %correction = fsub float 2., %v_times
+  %refined = fmul float %approx, %correction
+  ret float %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+  ; est = approximate reciprocal square root of v, via the 4-wide rsqrt.ps intrinsic
+  unary4to8(est, float, @llvm.x86.sse.rsqrt.ps, %v)
+  ; refined result = 0.5 * est * (3 - (v * est) * est)
+  %v_est = fmul <8 x float> %v, %est
+  %v_est_sq = fmul <8 x float> %v_est, %est
+  %three_minus = fsub <8 x float> <float 3., float 3., float 3., float 3.,
+                                   float 3., float 3., float 3., float 3.>, %v_est_sq
+  %scaled = fmul <8 x float> %est, %three_minus
+  %halved = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                              float 0.5, float 0.5, float 0.5, float 0.5>, %scaled
+  ret <8 x float> %halved
+}
+
+define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ; est = approximate reciprocal square root, read from element 0 of the rsqrt.ss result
+  %in_vec = insertelement <4 x float> undef, float %0, i32 0
+  %est_vec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %in_vec)
+  %est = extractelement <4 x float> %est_vec, i32 0
+
+  ; refined result = 0.5 * est * (3 - (v * est) * est)
+  %v_est = fmul float %0, %est
+  %v_est_sq = fmul float %v_est, %est
+  %three_minus = fsub float 3., %v_est_sq
+  %scaled = fmul float %est, %three_minus
+  %halved = fmul float 0.5, %scaled
+  ret float %halved
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  unary4to8(root, float, @llvm.x86.sse.sqrt.ps, %0)
+  ret <8 x float> %root
+}
+
+define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  sse_unary_scalar(root, 4, float, @llvm.x86.sse.sqrt.ss, %0)
+  ret float %root
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fast math
+
+declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
+
+define internal void @__fastmath() nounwind alwaysinline {
+  %csr_slot = alloca i32
+  call void @llvm.x86.sse.stmxcsr(i32 * %csr_slot)       ; save the current MXCSR value to the slot
+  %csr = load i32 *%csr_slot
+
+  ; set the DAZ (64) and FTZ (32768) bits: 64 | 32768 = 32832
+  %csr_fast = or i32 %csr, 32832
+  store i32 %csr_fast, i32 *%csr_slot
+  call void @llvm.x86.sse.ldmxcsr(i32 * %csr_slot)       ; load the updated value back into MXCSR
+  ret void
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; svml stuff
+
+declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind  ; not readnone: callers read a result back through the pointer argument
+declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+
+
+define internal <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(sines, float, @__svml_sinf4, %0)
+  ret <8 x float> %sines
+}
+
+define internal <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(cosines, float, @__svml_cosf4, %0)
+  ret <8 x float> %cosines
+}
+
+define internal void @__svml_sincos(<8 x float>, <8 x float> *,
+                                    <8 x float> *) nounwind alwaysinline {  ; not readnone: results are stored through %1 and %2
+  ; The 8-wide input is split into two 4-wide halves, each passed to the svml_sincosf4 routine.
+  %a = shufflevector <8 x float> %0, <8 x float> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %b = shufflevector <8 x float> %0, <8 x float> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  %cospa = alloca <4 x float>                            ; scratch slot handed to the call for elements 0-3
+  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
+
+  %cospb = alloca <4 x float>                            ; scratch slot handed to the call for elements 4-7
+  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
+
+  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
+         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 4, i32 5, i32 6, i32 7>
+  store <8 x float> %sin, <8 x float> * %1               ; the two returned vectors, joined, go to %1
+
+  %cosa = load <4 x float> * %cospa                      ; values the calls left in the scratch slots
+  %cosb = load <4 x float> * %cospb
+  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
+         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 4, i32 5, i32 6, i32 7>
+  store <8 x float> %cos, <8 x float> * %2               ; the two scratch vectors, joined, go to %2
+
+  ret void
+}
+
+define internal <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(tangents, float, @__svml_tanf4, %0)
+  ret <8 x float> %tangents
+}
+
+define internal <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(angles, float, @__svml_atanf4, %0)
+  ret <8 x float> %angles
+}
+
+define internal <8 x float> @__svml_atan2(<8 x float>,
+                                          <8 x float>) nounwind readnone alwaysinline {
+  binary4to8(angles, float, @__svml_atan2f4, %0, %1)
+  ret <8 x float> %angles
+}
+
+define internal <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(exps, float, @__svml_expf4, %0)
+  ret <8 x float> %exps
+}
+
+define internal <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(logs, float, @__svml_logf4, %0)
+  ret <8 x float> %logs
+}
+
+define internal <8 x float> @__svml_pow(<8 x float>,
+                                        <8 x float>) nounwind readnone alwaysinline {
+  binary4to8(powers, float, @__svml_powf4, %0, %1)
+  ret <8 x float> %powers
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
+  binary4to8(greatest, float, @llvm.x86.sse.max.ps, %0, %1)
+  ret <8 x float> %greatest
+}
+
+define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(greatest, 4, float, @llvm.x86.sse.max.ss, %0, %1)
+  ret float %greatest
+}
+
+define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
+  binary4to8(least, float, @llvm.x86.sse.min.ps, %0, %1)
+  ret <8 x float> %least
+}
+
+define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(least, 4, float, @llvm.x86.sse.min.ss, %0, %1)
+  ret float %least
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(least, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <8 x i32> %least
+}
+
+define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(least, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret i32 %least
+}
+
+define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(greatest, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <8 x i32> %greatest
+}
+
+define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(greatest, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret i32 %greatest
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unsigned int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
+                                                <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(least, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <8 x i32> %least
+}
+
+define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(least, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret i32 %least
+}
+
+define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
+                                                <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(greatest, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <8 x i32> %greatest
+}
+
+define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(greatest, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret i32 %greatest
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+  ; split the 8-wide mask into two 4-wide halves and run movmsk.ps on each
+  %as_float = bitcast <8 x i32> %0 to <8 x float>
+  %lo_half = shufflevector <8 x float> %as_float, <8 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %lo_bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %lo_half) nounwind readnone
+  %hi_half = shufflevector <8 x float> %as_float, <8 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %hi_bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %hi_half) nounwind readnone
+
+  ; move the result for elements 4-7 up by four bit positions, then OR it
+  ; with the result for elements 0-3
+  %hi_moved = shl i32 %hi_bits, 4
+  %combined = or i32 %lo_bits, %hi_moved
+  ret i32 %combined
+}
+
+define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {  ; whole body, return included, comes from the m4 helper below, which is given the 4-wide min.ps intrinsic and the scalar float min (helper definition not visible here)
+  reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
+}
+
+define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {  ; whole body, return included, comes from the m4 helper below, which is given the 4-wide max.ps intrinsic and the scalar float max (helper definition not visible here)
+  reduce8by4(float, @llvm.x86.sse.max.ps, @__max_uniform_float)
+}
+
+; 4-wide integer add, passed by name to the reduction in __reduce_add_int32
+define internal <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
+                                            <4 x i32> %v1) nounwind readnone alwaysinline {
+  %sum = add <4 x i32> %v0, %v1
+  ret <4 x i32> %sum
+}
+
+; scalar integer add, passed by name to the reduction in __reduce_add_int32
+define internal i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
+  %sum = add i32 %0, %1
+  ret i32 %sum
+}
+
+define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {  ; whole body, return included, comes from the m4 helper below, which is given the 4-wide and scalar integer add functions (helper definition not visible here)
+  reduce8by4(i32, @__vec4_add_int32, @__add_int32)
+}
+
+define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {  ; whole body, return included, comes from the m4 helper below, which is given the 4-wide pminsd intrinsic and the scalar signed min (helper definition not visible here)
+  reduce8by4(i32, @llvm.x86.sse41.pminsd, @__min_uniform_int32)
+}
+
+define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {  ; whole body, return included, comes from the m4 helper below, which is given the 4-wide pmaxsd intrinsic and the scalar signed max (helper definition not visible here)
+  reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
+}
+
+define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
+  %total = call i32 @__reduce_add_int32(<8 x i32> %v)    ; forwards to the int32 reduction unchanged
+  ret i32 %total
+}
+
+define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {  ; whole body, return included, comes from the m4 helper below, which is given the 4-wide pminud intrinsic and the scalar unsigned min (helper definition not visible here)
+  reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
+}
+
+define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {  ; whole body, return included, comes from the m4 helper below, which is given the 4-wide pmaxud intrinsic and the scalar unsigned max (helper definition not visible here)
+  reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
+                               <8 x i32>) nounwind alwaysinline {  ; NOTE(review): the m4 helper below presumably runs its quoted body once per enabled element of mask %2 -- confirm against its definition
+  per_lane(8, <8 x i32> %2, `
+      ; address of one i32 element inside the destination vector %0; the matching element of %1 is stored there
+      %ptr_ID = getelementptr <8 x i32> * %0, i32 0, i32 LANE
+      %storeval_ID = extractelement <8 x i32> %1, i32 LANE
+      store i32 %storeval_ID, i32 * %ptr_ID')
+  ret void
+}
+
+
+define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
+                               <8 x i32>) nounwind alwaysinline {  ; NOTE(review): the m4 helper below presumably runs its quoted body once per enabled element of mask %2 -- confirm against its definition
+  per_lane(8, <8 x i32> %2, `
+      ; address of one i64 element inside the destination vector %0; the matching element of %1 is stored there
+      %ptr_ID = getelementptr <8 x i64> * %0, i32 0, i32 LANE
+      %storeval_ID = extractelement <8 x i64> %1, i32 LANE
+      store i64 %storeval_ID, i64 * %ptr_ID')
+  ret void
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+; FIXME: I think this and the next one need to verify that the mask isn't
+; all off before doing the load!!!  (See e.g. stdlib-sse.ll)
+
+; Loads one i32 from the address in %0 and replicates it into all eight
+; elements of the result.  %mask is not consulted, so the load happens
+; unconditionally.
+define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+  %src = bitcast i8 * %0 to i32 *
+  %scalar = load i32 * %src
+
+  ; build the result one element at a time, starting from an undef vector
+  %bcast0 = insertelement <8 x i32> undef, i32 %scalar, i32 0
+  %bcast1 = insertelement <8 x i32> %bcast0, i32 %scalar, i32 1
+  %bcast2 = insertelement <8 x i32> %bcast1, i32 %scalar, i32 2
+  %bcast3 = insertelement <8 x i32> %bcast2, i32 %scalar, i32 3
+  %bcast4 = insertelement <8 x i32> %bcast3, i32 %scalar, i32 4
+  %bcast5 = insertelement <8 x i32> %bcast4, i32 %scalar, i32 5
+  %bcast6 = insertelement <8 x i32> %bcast5, i32 %scalar, i32 6
+  %bcast7 = insertelement <8 x i32> %bcast6, i32 %scalar, i32 7
+  ret <8 x i32> %bcast7
+}
+
+
+; Loads one i64 from the address in %0 and replicates it into all eight
+; elements of the result.  %mask is not consulted, so the load happens
+; unconditionally.
+define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+  %src = bitcast i8 * %0 to i64 *
+  %scalar = load i64 * %src
+
+  ; build the result one element at a time, starting from an undef vector
+  %bcast0 = insertelement <8 x i64> undef, i64 %scalar, i32 0
+  %bcast1 = insertelement <8 x i64> %bcast0, i64 %scalar, i32 1
+  %bcast2 = insertelement <8 x i64> %bcast1, i64 %scalar, i32 2
+  %bcast3 = insertelement <8 x i64> %bcast2, i64 %scalar, i32 3
+  %bcast4 = insertelement <8 x i64> %bcast3, i64 %scalar, i32 4
+  %bcast5 = insertelement <8 x i64> %bcast4, i64 %scalar, i32 5
+  %bcast6 = insertelement <8 x i64> %bcast5, i64 %scalar, i32 6
+  %bcast7 = insertelement <8 x i64> %bcast6, i64 %scalar, i32 7
+  ret <8 x i64> %bcast7
+}
+
+
+; Loads all eight i32 elements from the address in %0, assuming only
+; 4-byte alignment.  %mask is ignored: every element is read from memory
+; whether or not its mask entry is on.
+define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+  %vecptr = bitcast i8 * %0 to <8 x i32> *
+  %loaded = load <8 x i32> * %vecptr, align 4
+  ret <8 x i32> %loaded
+}
+
+
+; Loads all eight i64 elements from the address in %0, assuming only
+; 8-byte alignment.  %mask is ignored: every element is read from memory
+; whether or not its mask entry is on.
+define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+  %vecptr = bitcast i8 * %0 to <8 x i64> *
+  %loaded = load <8 x i64> * %vecptr, align 8
+  ret <8 x i64> %loaded
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; Instantiate the 8-wide gather and scatter routines for 32-bit and 64-bit
+; elements.  The generating macros are defined outside this section.
+gen_gather(8, i32)
+gen_gather(8, i64)
+gen_scatter(8, i32)
+gen_scatter(8, i64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; math
+
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+
+; Rounds each of the eight float elements of %0 to the nearest integral
+; value.  The `round4to8' macro applies the 4-wide roundps intrinsic to
+; each half of the vector and emits the ret of the recombined result.
+define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  round4to8(%0, 8)
+}
+
+; Rounds a single float to the nearest integral value using roundss.
+define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; Immediate operand: round-to-nearest (0b00) combined with "don't signal
+  ; precision exceptions" (0b1000), giving 8.
+  ;
+  ; The roundss intrinsic has an awkward two-vector form.  For
+  ;
+  ;   __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
+  ;
+  ; the docs say that the lowest 32 bits of the result are the rounded b0
+  ; and the upper 96 bits are copied straight from a:
+  ;
+  ;   r0 = RND(b0)
+  ;   r1 = a1
+  ;   r2 = a2
+  ;   r3 = a3
+  ;
+  ; Only r0 is needed here, so what is passed as a is irrelevant and the
+  ; same vector is used for both operands.
+  %in_vec = insertelement <4 x float> undef, float %0, i32 0
+  %rounded = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %in_vec, <4 x float> %in_vec, i32 8)
+  %result = extractelement <4 x float> %rounded, i32 0
+  ret float %result
+}
+
+; Rounds each of the eight float elements of %0 down to an integral value.
+; The `round4to8' macro applies the 4-wide roundps intrinsic to each half
+; of the vector and emits the ret of the recombined result.
+define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  round4to8(%0, 9)
+}
+
+; Rounds a single float down to an integral value using roundss.
+define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  ; roundss returns the rounded value in element 0 only, so the same vector
+  ; can be passed for both of its vector operands (see the discussion in
+  ; __round_uniform_float).
+  %in_vec = insertelement <4 x float> undef, float %0, i32 0
+  ; immediate: round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  %floored = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %in_vec, <4 x float> %in_vec, i32 9)
+  %result = extractelement <4 x float> %floored, i32 0
+  ret float %result
+}
+
+; Rounds each of the eight float elements of %0 up to an integral value.
+; The `round4to8' macro applies the 4-wide roundps intrinsic to each half
+; of the vector and emits the ret of the recombined result.
+define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  round4to8(%0, 10)
+}
+
+; Rounds a single float up to an integral value using roundss.
+define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  ; roundss returns the rounded value in element 0 only, so the same vector
+  ; can be passed for both of its vector operands (see the discussion in
+  ; __round_uniform_float).
+  %in_vec = insertelement <4 x float> undef, float %0, i32 0
+  ; immediate: round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  %ceiled = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %in_vec, <4 x float> %in_vec, i32 10)
+  %result = extractelement <4 x float> %ceiled, i32 0
+  ret float %result
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+
+; Returns the number of set bits in %0, using the generic ctpop intrinsic.
+define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+  %bits_set = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %bits_set
+}
+
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+; Sums the eight float elements of %0 into a single scalar.
+define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
+  ; add the upper four elements onto the lower four
+  %lo = shufflevector <8 x float> %0, <8 x float> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %hi = shufflevector <8 x float> %0, <8 x float> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %sum4 = fadd <4 x float> %lo, %hi
+  ; a horizontal add of the vector with itself leaves the two pairwise sums
+  ; in elements 0 and 1
+  %pairs = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %sum4, <4 x float> %sum4)
+  %pair01 = extractelement <4 x float> %pairs, i32 0
+  %pair23 = extractelement <4 x float> %pairs, i32 1
+  %total = fadd float %pair01, %pair23
+  ret float %total
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
+                                             <4 x float>) nounwind readnone
+
+; Masked store of 32-bit elements implemented as a read-modify-write: the
+; current contents of %0 are loaded, elements whose mask entry is on are
+; replaced by the corresponding elements of %1, and the whole vector is
+; written back.  All eight elements in memory are therefore both read and
+; written, including those whose mask entry is off.
+define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
+                                     <8 x i32> %mask) nounwind alwaysinline {
+  ; do two 4-wide blends with blendvps
+  %mask_as_float = bitcast <8 x i32> %mask to <8 x float>
+  %mask_a = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
+                <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
+                <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ; Only element (4-byte) alignment is assumed, matching __load_masked_32.
+  ; Without an explicit alignment the load and the store below would be
+  ; taken to be aligned to the full vector type.
+  %oldValue = load <8 x i32>* %0, align 4
+  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
+  %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
+  %old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
+                <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old_b = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
+                <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new_a = shufflevector <8 x float> %newAsFloat, <8 x float> undef,
+                <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new_b = shufflevector <8 x float> %newAsFloat, <8 x float> undef,
+                <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ; blendvps takes the second operand where the sign bit of the mask
+  ; element is set and the first operand otherwise
+  %blend_a = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old_a, <4 x float> %new_a,
+                                                       <4 x float> %mask_a)
+  %blend_b = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old_b, <4 x float> %new_b,
+                                                       <4 x float> %mask_b)
+  ; glue the two halves back together and write the result out
+  %blend = shufflevector <4 x float> %blend_a, <4 x float> %blend_b,
+               <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
+  ret void
+}
+
+; Masked store of 64-bit elements implemented as a read-modify-write: the
+; current contents of %ptr are loaded, elements whose mask entry is on are
+; replaced by the corresponding elements of %new, and the whole vector is
+; written back.  All eight elements in memory are therefore both read and
+; written, including those whose mask entry is off.
+define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+                                     <8 x i32> %mask) nounwind alwaysinline {
+  ; implement this as 4 blends of <4 x i32>s, which are actually bitcast
+  ; <2 x i64>s...
+
+  %mask_as_float = bitcast <8 x i32> %mask to <8 x float>
+
+  ; Only element (8-byte) alignment is assumed, matching __load_masked_64.
+  ; Without an explicit alignment the load here and the store at the end
+  ; would be taken to be aligned to the full vector type.
+  %old = load <8 x i64>* %ptr, align 8
+
+  ; set up the first two 64-bit values
+  %old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %old01f = bitcast <2 x i64> %old01 to <4 x float>
+  %new01  = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %new01f = bitcast <2 x i64> %new01 to <4 x float>
+  ; compute mask--note that the values mask0 and mask1 are doubled-up,
+  ; since each 64-bit element covers two 32-bit blend slots
+  %mask01 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
+                          <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  ; and blend the two of them: blendvps takes the second operand where the
+  ; sign bit of the mask element is set and the first operand otherwise
+  %result01f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old01f,
+                                                         <4 x float> %new01f,
+                                                         <4 x float> %mask01)
+  %result01 = bitcast <4 x float> %result01f to <2 x i64>
+
+  ; and again for elements 2 and 3
+  %old23 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %old23f = bitcast <2 x i64> %old23 to <4 x float>
+  %new23  = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %new23f = bitcast <2 x i64> %new23 to <4 x float>
+  %mask23 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
+                          <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  %result23f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old23f,
+                                                         <4 x float> %new23f,
+                                                         <4 x float> %mask23)
+  %result23 = bitcast <4 x float> %result23f to <2 x i64>
+
+  ; elements 4 and 5
+  %old45 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+  %old45f = bitcast <2 x i64> %old45 to <4 x float>
+  %new45  = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+  %new45f = bitcast <2 x i64> %new45 to <4 x float>
+  %mask45 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
+                          <4 x i32> <i32 4, i32 4, i32 5, i32 5>
+  %result45f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old45f,
+                                                         <4 x float> %new45f,
+                                                         <4 x float> %mask45)
+  %result45 = bitcast <4 x float> %result45f to <2 x i64>
+
+  ; elements 6 and 7
+  %old67 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+  %old67f = bitcast <2 x i64> %old67 to <4 x float>
+  %new67  = shufflevector <8 x i64> %new, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+  %new67f = bitcast <2 x i64> %new67 to <4 x float>
+  %mask67 = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
+                          <4 x i32> <i32 6, i32 6, i32 7, i32 7>
+  %result67f = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %old67f,
+                                                         <4 x float> %new67f,
+                                                         <4 x float> %mask67)
+  %result67 = bitcast <4 x float> %result67f to <2 x i64>
+
+  ; reassemble the four 2-wide results into one 8-wide vector and store it
+  %final0123 = shufflevector <2 x i64> %result01, <2 x i64> %result23,
+       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %final4567 = shufflevector <2 x i64> %result45, <2 x i64> %result67,
+       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %final = shufflevector <4 x i64> %final0123, <4 x i64> %final4567,
+       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8
+  ret void
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+; Square root of each of the eight double elements of %0.  The `unary2to8'
+; macro applies the 2-wide sqrtpd intrinsic to each pair of elements and
+; leaves the reassembled 8-wide vector in %ret.
+define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
+  unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ret <8 x double> %ret
+}
+
+
+; Square root of a single double.  The value is placed in element 0 of a
+; 2-wide vector whose other element is undef, so the scalar sqrtsd
+; intrinsic is used: it only operates on element 0, whereas the packed
+; sqrtpd form would also take the square root of the undef element.
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision float min/max
+
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+; Element-wise minimum of two 8-wide double vectors.  The `binary2to8'
+; macro applies the 2-wide minpd intrinsic to each pair of elements and
+; leaves the reassembled 8-wide vector in %ret.
+define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+  binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+  ret <8 x double> %ret
+}
+
+; Minimum of two doubles.  Each value is placed in element 0 of a 2-wide
+; vector whose other element is undef, so the scalar minsd intrinsic is
+; used: it only compares element 0, whereas the packed minpd form would
+; also operate on the undef elements.
+define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret
+}
+
+; Element-wise maximum of two 8-wide double vectors.  The `binary2to8'
+; macro applies the 2-wide maxpd intrinsic to each pair of elements and
+; leaves the reassembled 8-wide vector in %ret.
+define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+  binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+  ret <8 x double> %ret
+}
+
+; Maximum of two doubles.  Each value is placed in element 0 of a 2-wide
+; vector whose other element is undef, so the scalar maxsd intrinsic is
+; used: it only compares element 0, whereas the packed maxpd form would
+; also operate on the undef elements.
+define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret
+}
--- a/stdlib.ispc
+++ b/stdlib.ispc
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -0,0 +1,835 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; This file provides a variety of macros used to generate LLVM bitcode
+;; parametrized in various ways.  Implementations of the standard library
+;; builtins for various targets can use macros from this file to simplify
+;; generating code for their implementations of those builtins.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+;; Helper macro for calling SSE intrinsics that operate on a scalar value
+;; but take and return a vector: the scalar is placed in element 0 of an
+;; otherwise-undef vector, the intrinsic is called, and element 0 of its
+;; result is extracted.
+;; $1 : name of variable to put the final value in (the temporaries
+;;      %$1_vec and %$1_val are defined as well)
+;; $2 : number of elements in the vector type that the intrinsic takes
+;; $3 : scalar type of the operand
+;; $4 : SSE intrinsic name
+;; $5 : variable name that has the scalar value
+;; For example, the following call causes the variable %ret to have
+;; the result of a call to sqrtss with the scalar value in %0
+;;  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
+
+define(`sse_unary_scalar', `
+  %$1_vec = insertelement <$2 x $3> undef, $3 $5, i32 0
+  %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_vec)
+  %$1 = extractelement <$2 x $3> %$1_val, i32 0
+')
+
+;; Similar to `sse_unary_scalar', this helper macro is for calling binary
+;; SSE instructions with scalar values.  Each scalar is placed in element 0
+;; of its own otherwise-undef vector and element 0 of the result of the
+;; call is extracted.
+;; $1: name of variable to put the result in (the temporaries %$1_veca,
+;;     %$1_vecb and %$1_val are defined as well)
+;; $2: number of elements in the vector type that the intrinsic takes
+;; $3: scalar type of the operand
+;; $4 : SSE intrinsic name
+;; $5 : variable name that has the first scalar operand
+;; $6 : variable name that has the second scalar operand
+
+define(`sse_binary_scalar', `
+  %$1_veca = insertelement <$2 x $3> undef, $3 $5, i32 0
+  %$1_vecb = insertelement <$2 x $3> undef, $3 $6, i32 0
+  %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_veca, <$2 x $3> %$1_vecb)
+  %$1 = extractelement <$2 x $3> %$1_val, i32 0
+')
+
+;; Do a reduction over a 4-wide vector.  The vector is taken from the
+;; unnamed first argument %0 of the enclosing function and the expansion
+;; ends with a ret, so this macro must form the entire function body.
+;; Elements 2 and 3 are shuffled down and combined with elements 0 and 1 by
+;; the vector function; the two partial results are then combined by the
+;; scalar function.
+;; $1: type of final scalar result
+;; $2: 4-wide function that takes 2 4-wide operands and returns the
+;;     element-wise reduction
+;; $3: scalar function that takes two scalar operands and returns
+;;     the final reduction
+
+define(`reduce4', `
+  %v1 = shufflevector <4 x $1> %0, <4 x $1> undef,
+                      <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %0)
+  %m1a = extractelement <4 x $1> %m1, i32 0
+  %m1b = extractelement <4 x $1> %m1, i32 1
+  %m = call $1 $3($1 %m1a, $1 %m1b)
+  ret $1 %m
+'
+)
+
+;; Similar to `reduce4', do a reduction over an 8-wide vector.  The vector
+;; is taken from the unnamed first argument %0 of the enclosing function
+;; and the expansion ends with a ret, so this macro must form the entire
+;; function body.  The vector function is applied twice, first folding
+;; elements 4-7 onto 0-3 and then elements 2-3 onto 0-1; the scalar
+;; function combines the remaining two partial results.
+;; $1: type of final scalar result
+;; $2: 8-wide function that takes 2 8-wide operands and returns the
+;;     element-wise reduction
+;; $3: scalar function that takes two scalar operands and returns
+;;     the final reduction
+
+define(`reduce8', `
+  %v1 = shufflevector <8 x $1> %0, <8 x $1> undef,
+        <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %m1 = call <8 x $1> $2(<8 x $1> %v1, <8 x $1> %0)
+  %v2 = shufflevector <8 x $1> %m1, <8 x $1> undef,
+        <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %m2 = call <8 x $1> $2(<8 x $1> %v2, <8 x $1> %m1)
+  %m2a = extractelement <8 x $1> %m2, i32 0
+  %m2b = extractelement <8 x $1> %m2, i32 1
+  %m = call $1 $3($1 %m2a, $1 %m2b)
+  ret $1 %m
+'
+)
+
+;; Do a reduction over an 8-wide vector, using a vector reduction function
+;; that only takes 4-wide vectors.  The vector is taken from the unnamed
+;; first argument %0 of the enclosing function and the expansion ends with
+;; a ret, so this macro must form the entire function body.  The two 4-wide
+;; halves are combined first, then elements 2-3 of that result are folded
+;; onto elements 0-1, and the scalar function combines the last two values.
+;; $1: type of final scalar result
+;; $2: 4-wide function that takes 2 4-wide operands and returns the
+;;     element-wise reduction
+;; $3: scalar function that takes two scalar operands and returns
+;;     the final reduction
+
+define(`reduce8by4', `
+  %v1 = shufflevector <8 x $1> %0, <8 x $1> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2 = shufflevector <8 x $1> %0, <8 x $1> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2)
+  %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef,
+        <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %m2 = call <4 x $1> $2(<4 x $1> %v3, <4 x $1> %m1)
+  %m2a = extractelement <4 x $1> %m2, i32 0
+  %m2b = extractelement <4 x $1> %m2, i32 1
+  %m = call $1 $3($1 %m2a, $1 %m2b)
+  ret $1 %m
+'
+)
+
+
+;; Given a unary function that takes a 2-wide vector and a 4-wide vector
+;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide
+;; vector, apply the function to each, and leave the corresponding 4-wide
+;; vector result in %$1.  No ret is emitted; the temporaries %$1_0, %$1_1,
+;; %v$1_0 and %v$1_1 are defined along the way.
+;; $1: name of variable into which the final result should go
+;; $2: scalar type of the vector elements
+;; $3: 2-wide unary vector function to apply
+;; $4: 4-wide operand value
+
+define(`unary2to4', `
+  %$1_0 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> <i32 0, i32 1>
+  %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0)
+  %$1_1 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> <i32 2, i32 3>
+  %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1)
+  %$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, 
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+'
+)
+
+;; Similar to `unary2to4', this applies a 2-wide binary function to two 4-wide
+;; vector operands, pairing up the matching halves of each operand.  The
+;; 4-wide result is left in %$1; no ret is emitted.
+;; $1: name of variable into which the final result should go
+;; $2: scalar type of the vector elements
+;; $3: 2-wide binary vector function to apply
+;; $4: First 4-wide operand value
+;; $5: Second 4-wide operand value
+
+define(`binary2to4', `
+%$1_0a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> <i32 0, i32 1>
+%$1_0b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> <i32 0, i32 1>
+%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b)
+%$1_1a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> <i32 2, i32 3>
+%$1_1b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> <i32 2, i32 3>
+%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b)
+%$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, 
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+'
+)
+
+;; Similar to `unary2to4', this maps a 4-wide unary function to an 8-wide 
+;; vector operand: the function is applied to the low and high halves
+;; separately and the 8-wide result is left in %$1.  No ret is emitted.
+;; $1: name of variable into which the final result should go
+;; $2: scalar type of the vector elements
+;; $3: 4-wide unary vector function to apply
+;; $4: 8-wide operand value
+
+define(`unary4to8', `
+  %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
+  %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
+  %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, 
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+'
+)
+
+;; And along the lines of `binary2to4', this maps a 4-wide binary function to
+;; two 8-wide vector operands, pairing up the matching halves of each
+;; operand.  The 8-wide result is left in %$1; no ret is emitted.
+;; $1: name of variable into which the final result should go
+;; $2: scalar type of the vector elements
+;; $3: 4-wide binary vector function to apply
+;; $4: First 8-wide operand value
+;; $5: Second 8-wide operand value
+
+define(`binary4to8', `
+%$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b)
+%$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+%$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b)
+%$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, 
+         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+'
+)
+
+
+;; Maps a 2-wide unary function to an 8-wide vector operand, returning an 
+;; 8-wide vector result.  The function is applied to each of the four
+;; pairs of elements; the pieces are then joined into two 4-wide vectors
+;; (%$1a and %$1b) and finally into the 8-wide result %$1.  No ret is
+;; emitted.
+;; $1: name of variable into which the final result should go
+;; $2: scalar type of the vector elements
+;; $3: 2-wide unary vector function to apply
+;; $4: 8-wide operand value
+
+define(`unary2to8', `
+  %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 0, i32 1>
+  %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0)
+  %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 2, i32 3>
+  %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1)
+  %$1_2 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 4, i32 5>
+  %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2)
+  %$1_3 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 6, i32 7>
+  %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3)
+  %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, 
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, 
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>           
+'
+)
+
+;; Maps a 2-wide binary function to two 8-wide vector operands.  The
+;; function is applied to each of the four matching pairs of elements; the
+;; pieces are then joined into two 4-wide vectors (%$1a and %$1b) and
+;; finally into the 8-wide result %$1.  No ret is emitted.
+;; $1: name of variable into which the final result should go
+;; $2: scalar type of the vector elements
+;; $3: 2-wide binary vector function to apply
+;; $4: First 8-wide operand value
+;; $5: Second 8-wide operand value
+
+define(`binary2to8', `
+  %$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 0, i32 1>
+  %$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> <i32 0, i32 1>
+  %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b)
+  %$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 2, i32 3>
+  %$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> <i32 2, i32 3>
+  %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b)
+  %$1_2a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 4, i32 5>
+  %$1_2b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> <i32 4, i32 5>
+  %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b)
+  %$1_3a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> <i32 6, i32 7>
+  %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> <i32 6, i32 7>
+  %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b)
+
+  %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, 
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, 
+           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>           
+'
+)
+
+;; The unary SSE round intrinsic takes a second argument that encodes the
+;; rounding mode.  This macro makes it easier to apply the 4-wide roundps
+;; to 8-wide vector operands: the operand is split into two 4-wide halves,
+;; each half is rounded, and the halves are joined back together.
+;; $1: value to be rounded
+;; $2: integer encoding of rounding mode
+;; FIXME: this just has a ret statement at the end to return the result,
+;; which is inconsistent with the macros above.  As a consequence it must
+;; be the last thing in the enclosing function, which must return
+;; <8 x float>.
+
+define(`round4to8', `
+%v0 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%v1 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2)
+%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2)
+%ret = shufflevector <4 x float> %r0, <4 x float> %r1, 
+         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ret <8 x float> %ret
+'
+)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; stdlib_core
+;;
+;; This macro defines a bunch of helper routines that only depend on the
+;; target's vector width, which it takes as its first parameter.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define(`stdlib_core', `
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; vector ops
+
+; Returns the float stored in element %1 of the vector %0.
+define internal float @__extract(<$1 x float>, i32) nounwind readnone alwaysinline {
+  %elt = extractelement <$1 x float> %0, i32 %1
+  ret float %elt
+}
+
+; Returns a copy of the vector %0 in which element %1 has been replaced by
+; the float %2.
+define internal <$1 x float> @__insert(<$1 x float>, i32,
+                                       float) nounwind readnone alwaysinline {
+  %updated = insertelement <$1 x float> %0, float %2, i32 %1
+  ret <$1 x float> %updated
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; various bitcasts from one type to another
+
+; Reinterprets the bits of each float element as an i32, with no numeric
+; conversion.
+define internal <$1 x i32> @__intbits_varying_float(<$1 x float>) nounwind readnone alwaysinline {
+  %bits = bitcast <$1 x float> %0 to <$1 x i32>
+  ret <$1 x i32> %bits
+}
+
+; Reinterprets the bits of a float as an i32, with no numeric conversion.
+define internal i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline {
+  %bits = bitcast float %0 to i32
+  ret i32 %bits
+}
+
+; Reinterprets the bits of each double element as an i64, with no numeric
+; conversion.
+define internal <$1 x i64> @__intbits_varying_double(<$1 x double>) nounwind readnone alwaysinline {
+  %bits = bitcast <$1 x double> %0 to <$1 x i64>
+  ret <$1 x i64> %bits
+}
+
+; Reinterprets the bits of a double as an i64, with no numeric conversion.
+define internal i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline {
+  %bits = bitcast double %0 to i64
+  ret i64 %bits
+}
+
+; Reinterprets the bits of each i32 element as a float, with no numeric
+; conversion.
+define internal <$1 x float> @__floatbits_varying_int32(<$1 x i32>) nounwind readnone alwaysinline {
+  %as_float = bitcast <$1 x i32> %0 to <$1 x float>
+  ret <$1 x float> %as_float
+}
+
+; Reinterprets the bits of an i32 as a float, with no numeric conversion.
+define internal float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline {
+  %as_float = bitcast i32 %0 to float
+  ret float %as_float
+}
+
+; Reinterprets the bits of each i64 element as a double, with no numeric
+; conversion.
+define internal <$1 x double> @__doublebits_varying_int64(<$1 x i64>) nounwind readnone alwaysinline {
+  %as_double = bitcast <$1 x i64> %0 to <$1 x double>
+  ret <$1 x double> %as_double
+}
+
+; Reinterprets the bits of an i64 as a double, with no numeric conversion.
+define internal double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline {
+  %as_double = bitcast i64 %0 to double
+  ret double %as_double
+}
+
+; Returns an undef float vector, i.e. a value with unspecified contents.
+define internal <$1 x float> @__undef_varying() nounwind readnone alwaysinline {
+  ret <$1 x float> undef
+}
+
+; Returns an undef float, i.e. a value with unspecified contents.
+define internal float @__undef_uniform() nounwind readnone alwaysinline {
+  ret float undef
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; stdlib transcendentals
+;;
+;; These functions provide entrypoints that call out to the libm 
+;; implementations of the transcendental functions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Single-precision libm entry points.  sincosf hands back both of its
+; results by storing through its two pointer arguments, so unlike the
+; others it must not be declared readnone.
+declare float @sinf(float) nounwind readnone
+declare float @cosf(float) nounwind readnone
+declare void @sincosf(float, float *, float *) nounwind
+declare float @tanf(float) nounwind readnone
+declare float @atanf(float) nounwind readnone
+declare float @atan2f(float, float) nounwind readnone
+declare float @expf(float) nounwind readnone
+declare float @logf(float) nounwind readnone
+declare float @powf(float, float) nounwind readnone
+
+; Sine of %0, computed by the libm sinf.
+define internal float @__stdlib_sin(float) nounwind readnone alwaysinline {
+  %r = call float @sinf(float %0)
+  ret float %r
+}
+
+; Cosine of %0, computed by the libm cosf.
+define internal float @__stdlib_cos(float) nounwind readnone alwaysinline {
+  %r = call float @cosf(float %0)
+  ret float %r
+}
+
+; Sine and cosine of %0, computed by the libm sincosf, which stores the
+; sine through %1 and the cosine through %2.  The function returns nothing
+; and works purely by writing memory, so it cannot carry the readnone
+; attribute: with it, the optimizer is entitled to delete calls to this
+; function as dead code.
+define internal void @__stdlib_sincos(float, float *, float *) nounwind alwaysinline {
+  call void @sincosf(float %0, float *%1, float *%2)
+  ret void
+}
+
+; Tangent of %0, computed by the libm tanf.
+define internal float @__stdlib_tan(float) nounwind readnone alwaysinline {
+  %tangent = call float @tanf(float %0)
+  ret float %tangent
+}
+
+; Arctangent of %0, computed by the libm atanf.
+define internal float @__stdlib_atan(float) nounwind readnone alwaysinline {
+  %angle = call float @atanf(float %0)
+  ret float %angle
+}
+
+; Two-argument arctangent, computed by the libm atan2f with %0 and %1
+; passed through in the same order.
+define internal float @__stdlib_atan2(float, float) nounwind readnone alwaysinline {
+  %angle = call float @atan2f(float %0, float %1)
+  ret float %angle
+}
+
+; Natural logarithm of %0, computed by the libm logf.
+define internal float @__stdlib_log(float) nounwind readnone alwaysinline {
+  %logarithm = call float @logf(float %0)
+  ret float %logarithm
+}
+
+; Exponential of %0, computed by the libm expf.
+define internal float @__stdlib_exp(float) nounwind readnone alwaysinline {
+  %exponential = call float @expf(float %0)
+  ret float %exponential
+}
+
+; %0 raised to the power %1, computed by the libm powf.
+define internal float @__stdlib_pow(float, float) nounwind readnone alwaysinline {
+  %power = call float @powf(float %0, float %1)
+  ret float %power
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Definitions of 8 and 16-bit load and store functions
+;;
+;; The `int8_16' macro defines functions related to loading and storing 8 and
+;; 16-bit values in memory, converting to and from i32.  (This is a workaround
+;; to be able to use in-memory values of these types in ispc programs, since
+;; the compiler doesn't yet support 8 and 16-bit datatypes.)
+;;
+;; Arguments to pass to `int8_16':
+;; $1: vector width of the target
+
+define(`int8_16', `
+; Loads one unsigned 8-bit value per vector element, starting %offset
+; bytes past the base pointer %0, and widens each of them to i32.
+define internal <$1 x i32> @__load_uint8([0 x i32] *, i32 %offset) nounwind alwaysinline {
+  %base8 = bitcast [0 x i32] *%0 to i8 *
+  %addr = getelementptr i8 * %base8, i32 %offset
+  ; fetch all of the bytes with a single wide integer load; no alignment
+  ; beyond one byte is assumed
+  %wideptr = bitcast i8 * %addr to i`'eval(8*$1) *
+  %packed = load i`'eval(8*$1) * %wideptr, align 1
+
+  %bytes = bitcast i`'eval(8*$1) %packed to <$1 x i8>
+  ; the values are treated as unsigned, so zero-extend to i32
+  %widened = zext <$1 x i8> %bytes to <$1 x i32>
+  ret <$1 x i32> %widened
+}
+
+
+define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset) nounwind alwaysinline {
+  ;; step to the first value to read; the offset counts 16-bit elements
+  %base_i16 = bitcast [0 x i32] *%0 to i16 *
+  %first = getelementptr i16 * %base_i16, i32 %offset
+
+  ;; fetch every value with one integer load that is as wide as the whole
+  ;; vector of 16-bit values; no alignment is assumed, hence align 1
+  %wideptr = bitcast i16 * %first to i`'eval(16*$1) *
+  %packed = load i`'eval(16*$1) * %wideptr, align 1
+
+  ;; view the loaded bits as one i16 per lane; the values are unsigned, so
+  ;; each lane is widened to i32 by zero extension
+  %halves = bitcast i`'eval(16*$1) %packed to <$1 x i16>
+  %widened = zext <$1 x i16> %halves to <$1 x i32>
+  ret <$1 x i32> %widened
+}
+
+;; Masked store of 8-bit values.  The low 8 bits of each lane of val32 are
+;; written to consecutive bytes that start offset bytes past the base
+;; pointer.  The blend is bitwise: a bit of memory is replaced where the
+;; truncated mask has a one and keeps its old value where the mask has a
+;; zero.
+;; NOTE(review): this assumes that every mask lane is either all ones or all
+;; zeros in its low 8 bits; confirm against the callers.
+define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32,
+                                    <$1 x i32> %mask) nounwind alwaysinline {
+  ;; pack the new values into one wide integer
+  %val = trunc <$1 x i32> %val32 to <$1 x i8>
+  %val64 = bitcast <$1 x i8> %val to i`'eval(8*$1)
+
+  ;; narrow and pack the mask the same way so that it lines up bit for bit
+  ;; with the packed values, and compute its complement
+  %mask8 = trunc <$1 x i32> %mask to <$1 x i8>
+  %mask64 = bitcast <$1 x i8> %mask8 to i`'eval(8*$1)
+  %notmask = xor i`'eval(8*$1) %mask64, -1
+
+  %ptr8 = bitcast [0 x i32] *%0 to i8 *
+  %ptr = getelementptr i8 * %ptr8, i32 %offset
+  %ptr64 = bitcast i8 * %ptr to i`'eval(8*$1) *
+
+  ;; load the old value, use logical ops to blend based on the mask, then
+  ;; store the result back
+  %old = load i`'eval(8*$1) * %ptr64, align 1
+  %oldmasked = and i`'eval(8*$1) %old, %notmask
+  %newmasked = and i`'eval(8*$1) %val64, %mask64
+  %final = or i`'eval(8*$1) %oldmasked, %newmasked
+  ;; the address is only known to be byte aligned, so the store needs the
+  ;; same explicit align 1 as the load above; without it the store would be
+  ;; given the natural alignment of the wide integer type
+  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1
+
+  ret void
+}
+
+;; Masked store of 16-bit values.  The low 16 bits of each lane of val32 are
+;; written to consecutive 16-bit slots that start offset elements past the
+;; base pointer.  The blend is bitwise: a bit of memory is replaced where the
+;; truncated mask has a one and keeps its old value where the mask has a
+;; zero.
+;; NOTE(review): this assumes that every mask lane is either all ones or all
+;; zeros in its low 16 bits; confirm against the callers.
+define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32,
+                                     <$1 x i32> %mask) nounwind alwaysinline {
+  ;; pack the new values into one wide integer
+  %val = trunc <$1 x i32> %val32 to <$1 x i16>
+  %val64 = bitcast <$1 x i16> %val to i`'eval(16*$1)
+
+  ;; narrow and pack the mask the same way, and compute its complement
+  %mask16 = trunc <$1 x i32> %mask to <$1 x i16>
+  %mask64 = bitcast <$1 x i16> %mask16 to i`'eval(16*$1)
+  %notmask = xor i`'eval(16*$1) %mask64, -1
+
+  %ptr16 = bitcast [0 x i32] *%0 to i16 *
+  %ptr = getelementptr i16 * %ptr16, i32 %offset
+  %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
+
+  ;; as above, use mask to do blending with logical ops...
+  %old = load i`'eval(16*$1) * %ptr64, align 1
+  %oldmasked = and i`'eval(16*$1) %old, %notmask
+  %newmasked = and i`'eval(16*$1) %val64, %mask64
+  %final = or i`'eval(16*$1) %oldmasked, %newmasked
+  ;; no alignment beyond what the load above assumes is known for this
+  ;; address, so the store carries the same explicit align 1; without it the
+  ;; store would be given the natural alignment of the wide integer type
+  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 1
+
+  ret void
+}
+'
+)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; packed load and store functions
+;;
+;; These define functions to emulate those nice packed load and packed store
+;; instructions.  For packed store, given a pointer to destination array and 
+;; an offset into the array, for each lane where the mask is on, the
+;; corresponding value for that lane is stored into packed locations in the
+;; destination array.  For packed load, each lane that has an active mask
+;; loads a sequential value from the array.
+;;
+;; $1: vector width of the target
+;;
+;; FIXME: use the per_lane macro, defined below, to implement these!
+
+define(`packed_load_and_store', `
+declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
+
+;; Packed load.  Reads consecutive i32 values beginning at element
+;; start_offset of the array and stores them, in lane order, into the lanes
+;; of the vector at val_ptr whose mask bit is on; the other lanes of that
+;; vector are not written.  The return value is the number of values read.
+define i32 @__packed_load_active([0 x i32] *, i32 %start_offset, <$1 x i32> * %val_ptr,
+                                 <$1 x i32> %full_mask) nounwind alwaysinline {
+entry:
+  ;; collapse the vector mask to an integer; the loop below treats bit k of
+  ;; it as the on/off state of lane k
+  %mask = call i32 @__movmsk(<$1 x i32> %full_mask)
+  %baseptr = bitcast [0 x i32] * %0 to i32 *
+  %startptr = getelementptr i32 * %baseptr, i32 %start_offset
+  ;; NOTE(review): presumably this call is resolved to true or false while
+  ;; compiling, so that only one side of the branch survives; confirm
+  %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
+  br i1 %mask_known, label %known_mask, label %unknown_mask
+
+known_mask:
+  ;; all lanes on means that every one of the low bits of the mask is set
+  %allon = icmp eq i32 %mask, eval((1 << $1) -1)
+  br i1 %allon, label %all_on, label %not_all_on
+
+all_on:
+  ;; everyone wants to load, so just load an entire vector width in a single
+  ;; vector load
+  %vecptr = bitcast i32 *%startptr to <$1 x i32> *
+  %vec_load = load <$1 x i32> *%vecptr, align 4
+  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr
+  ret i32 $1
+
+not_all_on:
+  ;; a known mask that is neither all on nor all off is handled by the
+  ;; general loop
+  %alloff = icmp eq i32 %mask, 0
+  br i1 %alloff, label %all_off, label %unknown_mask
+
+all_off:
+  ;; no one wants to load
+  ret i32 0
+
+unknown_mask:
+  br label %loop
+
+loop:
+  ;; lane is the lane being examined, lanemask is the single mask bit that
+  ;; belongs to it, and offset counts the values read so far
+  %lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
+  %lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
+  %offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
+
+  ; is the current lane on?
+  %and = and i32 %mask, %lanemask
+  %do_load = icmp eq i32 %and, %lanemask
+  br i1 %do_load, label %load, label %loopend 
+
+load:
+  ;; read the next packed value and write it into this lane of the result
+  ;; vector, addressing the vector in memory as an array of i32
+  %loadptr = getelementptr i32 *%startptr, i32 %offset
+  %loadval = load i32 *%loadptr
+  %val_ptr_i32 = bitcast <$1 x i32> * %val_ptr to i32 *
+  %storeptr = getelementptr i32 *%val_ptr_i32, i32 %lane
+  store i32 %loadval, i32 *%storeptr
+  %offset1 = add i32 %offset, 1
+  br label %loopend
+
+loopend:
+  ;; the read position only advances when this lane actually loaded
+  %nextoffset = phi i32 [ %offset1, %load ], [ %offset, %loop ]
+  %nextlane = add i32 %lane, 1
+  %nextlanemask = mul i32 %lanemask, 2
+
+  ; are we done yet?
+  %test = icmp ne i32 %nextlane, $1
+  br i1 %test, label %loop, label %done
+
+done:
+  ret i32 %nextoffset
+}
+
+;; Packed store.  For each lane whose mask bit is on, in lane order, the
+;; value of that lane of vals is written to the next consecutive i32 slot of
+;; the array, beginning at element start_offset.  The return value is the
+;; number of values written.
+define i32 @__packed_store_active([0 x i32] *, i32 %start_offset, <$1 x i32> %vals,
+                                  <$1 x i32> %full_mask) nounwind alwaysinline {
+entry:
+  ;; collapse the vector mask to an integer; the loop below treats bit k of
+  ;; it as the on/off state of lane k
+  %mask = call i32 @__movmsk(<$1 x i32> %full_mask)
+  %baseptr = bitcast [0 x i32] * %0 to i32 *
+  %startptr = getelementptr i32 * %baseptr, i32 %start_offset
+  ;; NOTE(review): presumably this call is resolved to true or false while
+  ;; compiling, so that only one side of the branch survives; confirm
+  %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
+  br i1 %mask_known, label %known_mask, label %unknown_mask
+
+known_mask:
+  ;; all lanes on means that every one of the low bits of the mask is set
+  %allon = icmp eq i32 %mask, eval((1 << $1) -1)
+  br i1 %allon, label %all_on, label %not_all_on
+
+all_on:
+  ;; every lane stores, so the whole vector goes out in one vector store
+  %vecptr = bitcast i32 *%startptr to <$1 x i32> *
+  store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4
+  ret i32 $1
+
+not_all_on:
+  ;; a known mask that is neither all on nor all off is handled by the
+  ;; general loop
+  %alloff = icmp eq i32 %mask, 0
+  br i1 %alloff, label %all_off, label %unknown_mask
+
+all_off:
+  ;; no lane stores anything
+  ret i32 0
+
+unknown_mask:
+  br label %loop
+
+loop:
+  ;; lane is the lane being examined, lanemask is the single mask bit that
+  ;; belongs to it, and offset counts the values written so far
+  %lane = phi i32 [ 0, %unknown_mask ], [ %nextlane, %loopend ]
+  %lanemask = phi i32 [ 1, %unknown_mask ], [ %nextlanemask, %loopend ]
+  %offset = phi i32 [ 0, %unknown_mask ], [ %nextoffset, %loopend ]
+
+  ; is the current lane on?
+  %and = and i32 %mask, %lanemask
+  %do_store = icmp eq i32 %and, %lanemask
+  br i1 %do_store, label %store, label %loopend 
+
+store:
+  ;; write the value of this lane to the next packed slot
+  %storeval = extractelement <$1 x i32> %vals, i32 %lane
+  %storeptr = getelementptr i32 *%startptr, i32 %offset
+  store i32 %storeval, i32 *%storeptr
+  %offset1 = add i32 %offset, 1
+  br label %loopend
+
+loopend:
+  ;; the write position only advances when this lane actually stored
+  %nextoffset = phi i32 [ %offset1, %store ], [ %offset, %loop ]
+  %nextlane = add i32 %lane, 1
+  %nextlanemask = mul i32 %lanemask, 2
+
+  ; are we done yet?
+  %test = icmp ne i32 %nextlane, $1
+  br i1 %test, label %loop, label %done
+
+done:
+  ret i32 %nextoffset
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; per_lane
+;;
+;; The scary macro below encapsulates the 'scalarization' idiom--i.e. we have
+;; some operation that we'd like to perform only for the lanes where the
+;; mask is on
+;; $1: vector width of the target
+;; $2: variable that holds the mask
+;; $3: block of code to run for each lane that is on
+;;       Inside this code, any instances of the text "LANE" are replaced
+;;       with an i32 value that represents the current lane number
+
+divert(`-1')
+# Everything up to the closing divert goes to diversion -1, i.e. is
+# discarded, so these helper definitions add no text to the output.
+#
+# forloop(var, from, to, stmt) - improved version:
+#   works even if VAR is not a strict macro name
+#   performs sanity check that TO is not smaller than FROM (if it is, the
+#     loop expands to nothing)
+#   allows complex numerical expressions in TO and FROM
+# Expands STMT once for each value FROM, FROM+1, ..., TO (both ends
+# inclusive) with VAR defined as the current value.  VAR is set up with
+# pushdef and removed with popdef, so an outer definition of the same name
+# is back in effect after the loop.
+define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1',
+  `pushdef(`$1', eval(`$2'))_$0(`$1',
+    eval(`$3'), `$4')popdef(`$1')')')
+# _forloop(var, to, stmt): recursive helper.  Expands STMT, then stops if
+# VAR has reached TO; otherwise increments VAR and calls itself again.
+define(`_forloop',
+  `$3`'ifelse(indir(`$1'), `$2', `',
+    `define(`$1', incr(indir(`$1')))$0($@)')')
+divert`'dnl
+
+; num lanes, mask, code block to do per lane
+;
+; Notes for users of this macro:
+;  - The expansion starts with a branch and finishes with the pl_done
+;    label, so it ends the basic block it is placed in, and whatever
+;    follows it becomes the body of the pl_done block.
+;  - Every label and value it creates has a fixed name that starts with
+;    pl_, so it can be expanded at most once in any one function.
+;  - It calls @__movmsk and @__is_compile_time_constant_mask, which must be
+;    declared in the module.
+;  - The code block is pasted in textually.  On the all-on path it is
+;    repeated once per lane with each LANE and each ID replaced by the
+;    literal lane number; on the loop path LANE_ID becomes _id and LANE
+;    becomes the loop counter value.
+define(`per_lane', `
+  br label %pl_entry
+
+pl_entry:
+  ;; collapse the vector mask to an integer; the loop below treats bit k of
+  ;; it as the on/off state of lane k
+  %pl_mask = call i32 @__movmsk($2)
+  %pl_mask_known = call i1 @__is_compile_time_constant_mask($2)
+  br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask
+
+pl_known_mask:
+  ;; the mask is known at compile time; see if it is something we can
+  ;; handle more efficiently
+  %pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
+  br i1 %pl_is_allon, label %pl_all_on, label %pl_not_all_on
+
+pl_all_on:
+  ;; the mask is all on--just expand the code for each lane sequentially
+  forloop(i, 0, eval($1-1), 
+          `patsubst(`$3', `ID\|LANE', i)')
+  br label %pl_done
+
+pl_not_all_on:
+  ;; not all on--see if it is all off or mixed
+  ;; for the mixed case, we just run the general case, though we could
+  ;; try to be smart and just emit the code based on what it actually is,
+  ;; for example by emitting the code straight-line without a loop and doing 
+  ;; the lane tests explicitly, leaving later optimization passes to eliminate
+  ;; the stuff that is definitely not needed.  Not clear if we will frequently 
+  ;; encounter a mask that is known at compile-time but is not either all on or
+  ;; all off...
+  %pl_alloff = icmp eq i32 %pl_mask, 0
+  br i1 %pl_alloff, label %pl_done, label %pl_unknown_mask
+
+pl_unknown_mask:
+  br label %pl_loop
+
+pl_loop:
+  ;; Loop over each lane and see if we want to do the work for this lane
+  %pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ]
+  %pl_lanemask = phi i32 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ]
+
+  ; is the current lane on?  if so, goto do work, otherwise to end of loop
+  %pl_and = and i32 %pl_mask, %pl_lanemask
+  %pl_doit = icmp eq i32 %pl_and, %pl_lanemask
+  br i1 %pl_doit, label %pl_dolane, label %pl_loopend 
+
+pl_dolane:
+  ;; If so, substitute in the code from the caller and replace the LANE
+  ;; stuff with the current lane number
+  patsubst(`patsubst(`$3', `LANE_ID', `_id')', `LANE', `%pl_lane')
+  br label %pl_loopend
+
+pl_loopend:
+  ;; advance to the next lane and to the mask bit that belongs to it
+  %pl_nextlane = add i32 %pl_lane, 1
+  %pl_nextlanemask = mul i32 %pl_lanemask, 2
+
+  ; are we done yet?
+  %pl_test = icmp ne i32 %pl_nextlane, $1
+  br i1 %pl_test, label %pl_loop, label %pl_done
+
+pl_done:
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+;;
+;; $1: vector width of the target
+;; $2: scalar type for which to generate functions to do gathers
+
+; vec width, type
+define(`gen_gather', `
+;; Define the utility function to do the gather operation for a single element
+;; of the type.  It loads one value from the base address plus the offset of
+;; the given lane and returns ret with that lane replaced by the loaded
+;; value.
+define internal <$1 x $2> @__gather_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %ret,
+                                           i32 %lane) nounwind readonly alwaysinline {
+  ; compute address for this one from the base
+  ; (the 32-bit offset is zero-extended, so it is treated as an unsigned
+  ; count of bytes that is added to the base address)
+  ; NOTE(review): an offset with its top bit set therefore addresses memory
+  ; above the base rather than below it; confirm that callers never pass
+  ; negative offsets
+  %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
+  %offset64 = zext i32 %offset32 to i64
+  %ptrdelta = add i64 %ptr64, %offset64
+  %ptr = inttoptr i64 %ptrdelta to $2 *
+
+  ; load value and insert into returned value
+  %val = load $2 *%ptr
+  %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
+  ret <$1 x $2> %updatedret
+}
+
+
+;; Gather entry point: returns a vector whose lanes are loaded from the base
+;; pointer plus the per-lane offsets.  Lanes whose mask is off load from
+;; offset zero instead, so their contents are not meaningful.
+define <$1 x $2> @__gather_base_offsets_$2(i8*, <$1 x i32> %offsets,
+                                           <$1 x i32> %vecmask) nounwind readonly alwaysinline {
+entry:
+  %mask = call i32 @__movmsk(<$1 x i32> %vecmask)
+  %ptr64 = ptrtoint i8 * %0 to i64
+
+  %maskKnown = call i1 @__is_compile_time_constant_mask(<$1 x i32> %vecmask)
+  br i1 %maskKnown, label %known_mask, label %unknown_mask
+
+known_mask:
+  %alloff = icmp eq i32 %mask, 0
+  br i1 %alloff, label %gather_all_off, label %unknown_mask
+
+gather_all_off:
+  ; no lane is active, so nothing is loaded and the result is undefined
+  ret <$1 x $2> undef
+
+unknown_mask:
+  ; We can be clever and avoid the per-lane stuff for gathers if we are willing
+  ; to require that the 0th element of the array being gathered from is always
+  ; legal to read from (and we do indeed require that, given the benefits!) 
+  ;
+  ; Set the offset to zero for lanes that are off
+  ; (done through memory: a zeroed temporary is overwritten with the real
+  ; offsets by a masked store and then read back)
+  %offsetsPtr = alloca <$1 x i32>
+  store <$1 x i32> zeroinitializer, <$1 x i32> * %offsetsPtr
+  call void @__masked_store_blend_32(<$1 x i32> * %offsetsPtr, <$1 x i32> %offsets, 
+                                     <$1 x i32> %vecmask)
+  %newOffsets = load <$1 x i32> * %offsetsPtr
+
+  ; one single-element gather per lane, each one taking the result of the
+  ; previous one as its input vector; the values are named ret0, ret1, ...
+  %ret0 = call <$1 x $2> @__gather_elt_$2(i64 %ptr64, <$1 x i32> %newOffsets,
+                                          <$1 x $2> undef, i32 0)
+  forloop(lane, 1, eval($1-1), 
+          `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt_$2(i64 %ptr64, 
+                                <$1 x i32> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
+                    ', `LANE', lane), `PREV', eval(lane-1))')
+  ret <$1 x $2> %ret`'eval($1-1)
+}
+'
+)
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gen_scatter
+;; Emit a function declaration for a scalarized scatter.
+;;
+;; $1: target vector width
+;; $2: scalar type for which we want to generate code to scatter
+
+define(`gen_scatter', `
+;; Define the function that describes the work to do to scatter a single
+;; value: the given lane of values is stored at the base address plus the
+;; offset of that lane.
+define internal void @__scatter_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values,
+                                       i32 %lane) nounwind alwaysinline {
+  ; the 32-bit offset is zero-extended, so it is treated as an unsigned
+  ; count of bytes that is added to the base address
+  ; NOTE(review): an offset with its top bit set therefore addresses memory
+  ; above the base rather than below it; confirm that callers never pass
+  ; negative offsets
+  %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
+  %offset64 = zext i32 %offset32 to i64
+  %ptrdelta = add i64 %ptr64, %offset64
+  %ptr = inttoptr i64 %ptrdelta to $2 *
+  %storeval = extractelement <$1 x $2> %values, i32 %lane
+  store $2 %storeval, $2 * %ptr
+  ret void
+}
+
+;; Scatter entry point: for every lane whose mask is on, the value of that
+;; lane is stored at the base pointer plus the offset of that lane.
+define void @__scatter_base_offsets_$2(i8* %base, <$1 x i32> %offsets, <$1 x $2> %values,
+                                       <$1 x i32> %mask) nounwind alwaysinline {
+  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
+  %ptr64 = ptrtoint i8 * %base to i64
+  per_lane($1, <$1 x i32> %mask, `
+      call void @__scatter_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values, i32 LANE)')
+  ret void
+}
+'
+)
--- a/stdlib2cpp.py
+++ b/stdlib2cpp.py
@@ -0,0 +1,11 @@
+#!/usr/bin/python
+
+import sys
+
+print "const char *stdlib_code = "
+for line in sys.stdin:
+    l=line.rstrip()
+    l=l.replace('"', '\\"')
+    print "\"" + l + "\\n\""
+
+print ";"
--- a/stmt.cpp
+++ b/stmt.cpp
--- a/stmt.h
+++ b/stmt.h
@@ -0,0 +1,302 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file stmt.h
+    @brief File with declarations for classes related to statements in the language
+*/
+
+#ifndef ISPC_STMT_H
+#define ISPC_STMT_H 1
+
+#include "ispc.h"
+
+/** @brief Interface class for statements in the ispc language.
+
+    This abstract base-class encapsulates methods that AST nodes for
+    statements in the language must implement.
+ */
+class Stmt : public ASTNode {
+public:
+    /** Passes the given source position on to the ASTNode base class. */
+    Stmt(SourcePos p) : ASTNode(p) { }
+
+    /** Emit LLVM IR for the statement, using the FunctionEmitContext to create the
+        necessary instructions.
+     */
+    virtual void EmitCode(FunctionEmitContext *ctx) const = 0;
+
+    /** Print a representation of the statement (and any children AST
+        nodes) to standard output.  This method is used for debugging. */
+    virtual void Print(int indent) const = 0;
+
+    // Redeclare these methods with Stmt * return values, rather than
+    // ASTNode *s, as in the original ASTNode declarations of them.
+    virtual Stmt *Optimize() = 0;
+    virtual Stmt *TypeCheck() = 0;
+};
+
+
+/** @brief Statement representing a single expression */
+class ExprStmt : public Stmt {
+public:
+    ExprStmt(Expr *expr, SourcePos pos);
+
+    // Implementations of the pure virtual methods declared in Stmt.
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+
+private:
+    /** The expression that makes up this statement. */
+    Expr *expr;
+};
+
+
+/** @brief Statement representing a single declaration (which in turn may declare
+    a number of variables). */
+class DeclStmt : public Stmt {
+public:
+    // NOTE(review): how the constructor uses symbolTable is not visible in
+    // this header; no member stores it.
+    DeclStmt(SourcePos pos, Declaration *declaration, SymbolTable *symbolTable);
+
+    // Implementations of the pure virtual methods declared in Stmt.
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+
+private:
+    /** The declaration that this statement represents. */
+    Declaration *declaration;
+};
+
+
+/** @brief Statement representing a single if statement, possibly with an
+    else clause. */
+class IfStmt : public Stmt {
+public:
+    IfStmt(Expr *testExpr, Stmt *trueStmts, Stmt *falseStmts,
+           bool doCoherentCheck, SourcePos pos);
+
+    // Implementations of the pure virtual methods declared in Stmt.
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+
+    // @todo these are only public for lHasVaryingBreakOrContinue(); would
+    // be nice to clean that up...
+    /** Expression giving the 'if' test. */
+    Expr *test;
+    /** Statements to run if the 'if' test returns a true value */
+    Stmt *trueStmts;
+    /** Statements to run if the 'if' test returns a false value */
+    Stmt *falseStmts;
+
+private:
+    /** This value records if this was a 'coherent' if statement in the
+        source and thus, if the emitted code should check to see if all
+        active program instances want to follow just one of the 'true' or
+        'false' blocks. */
+    const bool doCoherentCheck;
+
+    // Private code-generation helpers; their definitions are not part of
+    // this header.
+    void emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask, 
+                                llvm::Value *test) const;
+    void emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *test) const;
+    void emitMaskAllOn(FunctionEmitContext *ctx,
+                       llvm::Value *test, llvm::BasicBlock *bDone) const;
+    void emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, 
+                       llvm::Value *test, llvm::BasicBlock *bDone) const;
+};
+
+
+/** @brief Statement implementation representing a 'do' statement in the
+    program.
+ */
+class DoStmt : public Stmt {
+public:
+    DoStmt(Expr *testExpr, Stmt *bodyStmts, bool doCoherentCheck, 
+           SourcePos pos);
+
+    // Implementations of the pure virtual methods declared in Stmt.
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+
+private:
+    /** Expression giving the loop test. */
+    Expr *testExpr;
+    /** Statements that make up the loop body. */
+    Stmt *bodyStmts;
+    // NOTE(review): presumably plays the same role as
+    // IfStmt::doCoherentCheck, i.e. marks a 'coherent' loop; confirm
+    // against stmt.cpp.
+    const bool doCoherentCheck;
+};
+
+
+/** @brief Statement implementation for 'for' loops (as well as for 'while'
+    loops).
+ */
+class ForStmt : public Stmt {
+public:
+    ForStmt(Stmt *initializer, Expr *testExpr, Stmt *stepStatements,
+            Stmt *bodyStatements, bool doCoherentCheck, SourcePos pos);
+
+    // Implementations of the pure virtual methods declared in Stmt.
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+
+private:
+    /** 'for' statement initializer; may be NULL, indicating no initializer */
+    Stmt *init;
+    /** expression that returns a value indicating whether the loop should
+        continue for the next iteration */
+    Expr *test;
+    /** Statements to run at the end of the loop for the loop step, before
+        the test expression is evaluated. */
+    Stmt *step;
+    /** Loop body statements */
+    Stmt *stmts;
+    // NOTE(review): presumably plays the same role as
+    // IfStmt::doCoherentCheck, i.e. marks a 'coherent' loop; confirm
+    // against stmt.cpp.
+    const bool doCoherentCheck;
+};
+
+
+/** @brief Statement implementation for a break or 'coherent' break
+    statement in the program. */
+class BreakStmt : public Stmt {
+public:
+    /** @param doCoherenceCheck  value for the member of the same name,
+                                 documented below
+        @param pos               source position of the statement */
+    BreakStmt(bool doCoherenceCheck, SourcePos pos);
+
+    // Implementations of the pure virtual methods declared in Stmt.
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+
+private:
+    /** This indicates whether the generated code will check to see if no
+        more program instances are currently running after the break, in
+        which case the code can have a jump to the end of the current
+        loop. */
+    const bool doCoherenceCheck;
+};
+
+
+/** @brief Statement implementation for a continue or 'coherent' continue
+    statement in the program. */
+class ContinueStmt : public Stmt {
+public:
+    /** @param doCoherenceCheck  value for the member of the same name,
+                                 documented below
+        @param pos               source position of the statement */
+    ContinueStmt(bool doCoherenceCheck, SourcePos pos);
+
+    // Implementations of the pure virtual methods declared in Stmt.
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+
+private:
+    /** This indicates whether the generated code will check to see if no
+        more program instances are currently running after the continue, in
+        which case the code can have a jump to the end of the current
+        loop. */
+    const bool doCoherenceCheck;
+};
+
+
+/** @brief Statement implementation for a 'return' or 'coherent' return
+    statement in the program. */
+class ReturnStmt : public Stmt {
+public:
+    // NOTE(review): v, cc and p presumably initialize val, doCoherenceCheck
+    // and the source position respectively; confirm against stmt.cpp.
+    ReturnStmt(Expr *v, bool cc, SourcePos p);
+
+    // Implementations of the pure virtual methods declared in Stmt.
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+
+private:
+    // NOTE(review): presumably the expression whose value is returned, and
+    // NULL when the statement returns nothing; confirm against stmt.cpp.
+    Expr *val;
+    /** This indicates whether the generated code will check to see if no
+        more program instances are currently running after the return, in
+        which case the code can possibly jump to the end of the current
+        function. */
+    const bool doCoherenceCheck;
+};
+
+
+/** @brief Representation of a list of statements in the program.
+ */
+class StmtList : public Stmt {
+public:
+    /** Creates an empty list at the given source position. */
+    StmtList(SourcePos p) : Stmt(p) { }
+
+    // Implementations of the pure virtual methods declared in Stmt.
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+
+    /** Appends a statement to the end of the list.  NULL pointers are
+        ignored, so the list never holds a NULL entry added this way. */
+    void Add(Stmt *s) { if (s) stmts.push_back(s); }
+    /** Read-only access to the statements, in the order they were added.
+        The method is const so that it can also be called through a
+        const StmtList pointer or reference. */
+    const std::vector<Stmt *> &GetStatements() const { return stmts; }
+
+private:
+    /** The statements in the list, in the order they were added. */
+    std::vector<Stmt *> stmts;
+};
+
+
+/** @brief Representation of a print() statement in the program.
+
+    It's currently necessary to have a special statement type for print()
+    since strings aren't supported as first-class types in the language,
+    but we need to be able to pass a formatting string as the first
+    argument to print().  We also need this to be a variable argument
+    function, which also isn't supported.  Representing print() as a
+    statement lets us work around both of those ugly little issues...
+  */
+class PrintStmt : public Stmt {
+public:
+    // NOTE(review): f, v and p presumably initialize format, values and the
+    // source position respectively; confirm against stmt.cpp.
+    PrintStmt(const std::string &f, Expr *v, SourcePos p);
+
+    // Implementations of the pure virtual methods declared in Stmt.
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *Optimize();
+    Stmt *TypeCheck();
+
+private:
+    /** Format string for the print() statement. */
+    const std::string format;
+    /** This holds the arguments passed to the print() statement.  If more
+        than one was provided, this will be an ExprList. */
+    Expr *values;
+};
+
+
+#endif // ISPC_STMT_H
--- a/sym.cpp
+++ b/sym.cpp
@@ -0,0 +1,326 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file sym.cpp
+    @brief file with definitions for symbol and symbol table classes. 
+*/
+
+#include "sym.h"
+#include "type.h"
+#include "util.h"
+#include <stdio.h>
+
+///////////////////////////////////////////////////////////////////////////
+// Symbol
+
+// Creates a symbol with the given name, source position and type.  All of
+// the remaining state starts out cleared.
+Symbol::Symbol(const std::string &n, SourcePos p, const Type *t)
+  : pos(p), name(n) {
+    type = t;
+
+    // Plain attributes.
+    isStatic = false;
+    varyingCFDepth = 0;
+
+    // None of the associated objects exist yet.
+    storagePtr = NULL;
+    function = NULL;
+    constValue = NULL;
+}
+
+
+// Returns the symbol's name followed by the mangled form of its type.
+std::string
+Symbol::MangledName() const {
+    std::string mangled(name);
+    mangled += type->Mangle();
+    return mangled;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// SymbolTable
+
+// A new symbol table starts out with a single scope already open, so that
+// symbols and types can be added right away.
+SymbolTable::SymbolTable() {
+    PushScope();
+}
+
+
+// Releases the scope containers still held by the table.
+SymbolTable::~SymbolTable() {
+    // Otherwise we have mismatched push/pop scopes
+    assert(variables.size() == 1 && types.size() == 1);
+
+    // Free the remaining scope(s) directly.  PopScope() cannot be used for
+    // this: it asserts that more than one scope is present, which
+    // contradicts the check above and would abort every destruction in
+    // builds that have assertions enabled.
+    // FIXME: as in PopScope(), the Symbols themselves are not deleted.
+    while (!variables.empty()) {
+        delete variables.back();
+        variables.pop_back();
+    }
+    while (!types.empty()) {
+        delete types.back();
+        types.pop_back();
+    }
+}
+
+// Opens a new innermost scope: one empty symbol list and one empty type map.
+void
+SymbolTable::PushScope() {
+    std::vector<Symbol *> *scopeVariables = new std::vector<Symbol *>;
+    variables.push_back(scopeVariables);
+
+    TypeMapType *scopeTypes = new TypeMapType;
+    types.push_back(scopeTypes);
+}
+
+
+// Closes the innermost scope and frees its containers.  At least one scope
+// must remain afterwards.
+void
+SymbolTable::PopScope() {
+    // FIXME: delete Symbols in variables vector<>...
+    assert(variables.size() > 1);
+    std::vector<Symbol *> *innerVariables = variables.back();
+    variables.pop_back();
+    delete innerVariables;
+
+    assert(types.size() > 1);
+    TypeMapType *innerTypes = types.back();
+    types.pop_back();
+    delete innerTypes;
+}
+
+
+// Adds the symbol to the innermost scope.  Returns false, after reporting an
+// error, if that scope already has a symbol of the same name; returns true
+// otherwise, with a warning if the new symbol shadows one from an outer
+// scope.
+bool
+SymbolTable::AddVariable(Symbol *symbol) {
+    assert(symbol != NULL);
+
+    const int innermost = (int)variables.size() - 1;
+
+    // Look for an existing symbol of the same name, going from the
+    // innermost scope outwards; only the first hit matters.
+    for (int scope = innermost; scope >= 0; --scope) {
+        const std::vector<Symbol *> &syms = *variables[scope];
+        for (int s = (int)syms.size() - 1; s >= 0; --s) {
+            if (syms[s]->name != symbol->name)
+                continue;
+
+            if (scope == innermost) {
+                // Same name in the same scope: that is an error.
+                Error(symbol->pos, "Ignoring redeclaration of symbol \"%s\".", 
+                      symbol->name.c_str());
+                return false;
+            }
+
+            // The match is in an enclosing scope, so this is merely
+            // shadowing, which is legal but dangerous.
+            Warning(symbol->pos, 
+                    "Symbol \"%s\" shadows symbol declared in outer scope.",
+                    symbol->name.c_str());
+            variables.back()->push_back(symbol);
+            return true;
+        }
+    }
+
+    // The name is not in use anywhere.
+    variables.back()->push_back(symbol);
+    return true;
+}
+
+
+// Returns the symbol with the given name, or NULL if there is none.
+Symbol *
+SymbolTable::LookupVariable(const char *name) {
+    // Both the scopes and the symbols within each scope are visited from
+    // last to first, so that the innermost scope is searched before the
+    // outer ones and a symbol that shadows another one is the one found.
+    for (int scope = (int)variables.size() - 1; scope >= 0; --scope) {
+        const std::vector<Symbol *> &syms = *variables[scope];
+        for (int s = (int)syms.size() - 1; s >= 0; --s) {
+            Symbol *candidate = syms[s];
+            if (candidate->name == name)
+                return candidate;
+        }
+    }
+    return NULL;
+}
+
+
+// Records the function symbol under its name.  Returns false, leaving the
+// table unchanged, if a function of the same name and type is already
+// present.
+bool
+SymbolTable::AddFunction(Symbol *symbol) {
+    const FunctionType *funcType = dynamic_cast<const FunctionType *>(symbol->type);
+    assert(funcType != NULL);
+
+    const bool alreadyPresent =
+        (LookupFunction(symbol->name.c_str(), funcType) != NULL);
+    if (!alreadyPresent)
+        functions[symbol->name].push_back(symbol);
+    return !alreadyPresent;
+}
+
+
+// Returns all of the function symbols recorded under the given name, or
+// NULL if there are none.
+std::vector<Symbol *> *
+SymbolTable::LookupFunction(const char *name) {
+    std::map<std::string, std::vector<Symbol *> >::iterator entry =
+        functions.find(name);
+    if (entry == functions.end())
+        return NULL;
+    return &entry->second;
+}
+
+
+// Returns the function symbol with the given name whose type is equal to
+// the given function type, or NULL if there is none.
+Symbol *
+SymbolTable::LookupFunction(const char *name, const FunctionType *type) {
+    std::map<std::string, std::vector<Symbol *> >::iterator entry =
+        functions.find(name);
+    if (entry == functions.end())
+        return NULL;
+
+    const std::vector<Symbol *> &overloads = entry->second;
+    for (unsigned int i = 0; i < overloads.size(); ++i) {
+        Symbol *candidate = overloads[i];
+        if (Type::Equal(candidate->type, type))
+            return candidate;
+    }
+    return NULL;
+}
+
+
+// Registers a named type in the innermost scope.  Returns false, after
+// reporting an error, if that scope already defines the name; returns true
+// otherwise, with a warning if the name shadows a type from an outer scope.
+bool
+SymbolTable::AddType(const char *name, const Type *type, SourcePos pos) {
+    const int innermost = (int)types.size() - 1;
+
+    // As in AddVariable(), search from the innermost scope outwards and
+    // act on the first scope that knows the name.
+    for (int scope = innermost; scope >= 0; --scope) {
+        const TypeMapType &scopeTypes = *types[scope];
+        if (scopeTypes.find(name) == scopeTypes.end())
+            continue;
+
+        if (scope == innermost) {
+            Error(pos, "Ignoring redefinition of type \"%s\".", name);
+            return false;
+        }
+
+        Warning(pos, "Type \"%s\" shadows type declared in outer scope.", name);
+        break;
+    }
+
+    // Either the name is unused or it only shadows an outer definition.
+    (*types.back())[name] = type;
+    return true;
+}
+
+
+// Returns the type registered under the given name, or NULL if there is
+// none.
+const Type *
+SymbolTable::LookupType(const char *name) const {
+    // Innermost scope first, so that a shadowing definition wins.
+    for (int scope = (int)types.size() - 1; scope >= 0; --scope) {
+        const TypeMapType &scopeTypes = *types[scope];
+        TypeMapType::const_iterator found = scopeTypes.find(name);
+        if (found != scopeTypes.end())
+            return found->second;
+    }
+    return NULL;
+}
+
+
+// Returns the names of the variables and functions that are closest to the
+// given string in edit distance, provided that distance is at most 2.  The
+// result is empty if nothing is that close.
+std::vector<std::string>
+SymbolTable::ClosestVariableOrFunctionMatch(const char *str) const {
+    // Every candidate name is compared against str; a name whose edit
+    // distance is no more than maxDelta is filed in matches[distance].
+    const int maxDelta = 2;
+    std::vector<std::string> matches[maxDelta+1];
+
+    // Variables, scope by scope.
+    std::vector<std::vector<Symbol *> *>::const_iterator scope;
+    for (scope = variables.begin(); scope != variables.end(); ++scope) {
+        const std::vector<Symbol *> &syms = **scope;
+        std::vector<Symbol *>::const_iterator sym;
+        for (sym = syms.begin(); sym != syms.end(); ++sym) {
+            const std::string &candidate = (*sym)->name;
+            int dist = StringEditDistance(str, candidate, maxDelta+1);
+            if (dist <= maxDelta)
+                matches[dist].push_back(candidate);
+        }
+    }
+
+    // Function names.
+    std::map<std::string, std::vector<Symbol *> >::const_iterator func;
+    for (func = functions.begin(); func != functions.end(); ++func) {
+        const std::string &candidate = func->first;
+        int dist = StringEditDistance(str, candidate, maxDelta+1);
+        if (dist <= maxDelta)
+            matches[dist].push_back(candidate);
+    }
+
+    // The non-empty group with the smallest distance is the answer.
+    for (int dist = 0; dist <= maxDelta; ++dist) {
+        if (!matches[dist].empty())
+            return matches[dist];
+    }
+
+    // Nothing was close enough.
+    return std::vector<std::string>();
+}
+
+
+// Returns the names of the registered types that are closest to the given
+// string in edit distance, provided that distance is at most 2.  The result
+// is empty if nothing is that close.
+std::vector<std::string>
+SymbolTable::ClosestTypeMatch(const char *str) const {
+    // Same scheme as ClosestVariableOrFunctionMatch(): file every name
+    // whose edit distance is no more than maxDelta under its distance, then
+    // hand back the non-empty group with the smallest distance.
+    const int maxDelta = 2;
+    std::vector<std::string> matches[maxDelta+1];
+
+    for (unsigned int scope = 0; scope < types.size(); ++scope) {
+        const TypeMapType &scopeTypes = *types[scope];
+        TypeMapType::const_iterator entry;
+        for (entry = scopeTypes.begin(); entry != scopeTypes.end(); ++entry) {
+            const std::string &candidate = entry->first;
+            int dist = StringEditDistance(str, candidate, maxDelta+1);
+            if (dist <= maxDelta)
+                matches[dist].push_back(candidate);
+        }
+    }
+
+    for (int dist = 0; dist <= maxDelta; ++dist) {
+        if (!matches[dist].empty())
+            return matches[dist];
+    }
+    return std::vector<std::string>();
+}
+
+
+// Writes the variables, functions and named types held by the table to
+// stderr.  Deeper scopes are indented further.
+void
+SymbolTable::Print() {
+    int depth = 0;
+
+    // One output line per scope, holding all of the symbols of that scope.
+    fprintf(stderr, "Variables:\n----------------\n");
+    for (unsigned int scope = 0; scope < variables.size(); ++scope) {
+        fprintf(stderr, "%*c", depth, ' ');
+        const std::vector<Symbol *> &syms = *variables[scope];
+        for (unsigned int s = 0; s < syms.size(); ++s)
+            fprintf(stderr, "%s [%s]", syms[s]->name.c_str(), 
+                    syms[s]->type->GetString().c_str());
+        fprintf(stderr, "\n");
+        depth += 4;
+    }
+
+    // Each function name, followed by the type of every overload.
+    fprintf(stderr, "Functions:\n----------------\n");
+    std::map<std::string, std::vector<Symbol *> >::iterator func;
+    for (func = functions.begin(); func != functions.end(); ++func) {
+        fprintf(stderr, "%s\n", func->first.c_str());
+        const std::vector<Symbol *> &overloads = func->second;
+        for (unsigned int i = 0; i < overloads.size(); ++i)
+            fprintf(stderr, "    %s\n", overloads[i]->type->GetString().c_str());
+    }
+
+    // One output line per named type.
+    depth = 0;
+    fprintf(stderr, "Named types:\n---------------\n");
+    for (unsigned int scope = 0; scope < types.size(); ++scope) {
+        TypeMapType &scopeTypes = *types[scope];
+        TypeMapType::iterator entry;
+        for (entry = scopeTypes.begin(); entry != scopeTypes.end(); ++entry)
+            fprintf(stderr, "%*c%s -> %s\n", depth, ' ', entry->first.c_str(),
+                    entry->second->GetString().c_str());
+        depth += 4;
+    }
+}
--- a/sym.h
+++ b/sym.h
@@ -0,0 +1,264 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file sym.h
+
+    @brief header file with declarations for symbol and symbol table
+    classes.
+*/
+
+#ifndef ISPC_SYM_H
+#define ISPC_SYM_H
+
+#include "ispc.h"
+#include <map>
+
+class StructType;
+class ConstExpr;
+
+/**
+   @brief Representation of a program symbol.
+
+   The Symbol class represents a symbol in an ispc program.  Symbols can
+   include variables, functions, and named types.  Note that all of the
+   members are publicly accessible; other code throughout the system
+   accesses and modifies the members directly.
+
+   @todo Should we break function symbols into a separate FunctionSymbol
+   class and then not have these members that are not applicable for
+   function symbols (and vice versa, for non-function symbols)?
+ */
+
+class Symbol {
+public:
+    /** The Symbol constructor takes the name of the symbol, its
+        position in a source file, and its type (if known; the type
+        defaults to NULL when it is not supplied). */
+    Symbol(const std::string &name, SourcePos pos, const Type *t = NULL);
+
+    /** This method should only be called for function symbols; for them,
+        it returns a mangled version of the function name with the argument
+        types encoded into the returned name.  This is used to generate
+        unique symbols in object files for overloaded functions.
+     */
+    std::string MangledName() const;
+
+    SourcePos pos;            /*!< Source file position where the symbol was defined */
+    const std::string name;   /*!< Symbol's name */
+    llvm::Value *storagePtr;  /*!< For symbols with storage associated with
+                                   them (i.e. variables but not functions),
+                                   this member stores a pointer to its
+                                   location in memory. */
+    llvm::Function *function; /*!< For symbols that represent functions,
+                                   this stores the LLVM Function value for
+                                   the symbol once it has been created. */
+    const Type *type;         /*!< The type of the symbol; if not set by the
+                                   constructor, this is set after the
+                                   declaration around the symbol has been parsed.  */
+    ConstExpr *constValue;    /*!< For symbols with const-qualified types, this may store
+                                   the symbol's compile-time constant value.  This value may
+                                   validly be NULL for a const-qualified type, however; for
+                                   example, the ConstExpr class can't currently represent
+                                   struct types.  For cases like these, constValue is NULL,
+                                   though for all const symbols, the value pointed to by the
+                                   storagePtr member will be its constant value.  (This
+                                   messiness is due to needing an ispc ConstExpr for the early
+                                   constant folding optimizations). */
+    bool isStatic;            /*!< Records whether this symbol had a static qualifier in
+                                   its declaration. */
+    int varyingCFDepth;       /*!< This member records the number of levels of nested 'varying'
+                                   control flow within which the symbol was declared.  Having
+                                   this value available makes it possible to avoid performing
+                                   masked stores when modifying the symbol's value when the
+                                   store is done at the same 'varying' control flow depth as
+                                   the one where the symbol was originally declared. */
+};
+
+
+/** @brief Symbol table that holds all known symbols during parsing and compilation.
+
+    A single instance of a SymbolTable is stored in the Module class
+    (Module::symbolTable); it is created in the Module::Module()
+    constructor.  It is then accessed via the global variable Module *\ref m
+    throughout the ispc implementation.
+ */
+
+class SymbolTable {
+public:
+    SymbolTable();
+    ~SymbolTable();
+
+    /** The parser calls this method when it enters a new scope in the
+        program; this allows us to track variables that shadow others in
+        outer scopes with the same name as well as to efficiently discard
+        all of the variables declared in a particular scope when we exit
+        that scope. */
+    void PushScope();
+
+    /** For each scope started by a call to SymbolTable::PushScope(), there
+        must be a matching call to SymbolTable::PopScope() at the end of
+        that scope. */
+    void PopScope();
+
+    /** Adds the given variable symbol to the symbol table.
+        @param symbol The symbol to be added
+
+        @return true if successful; false if the provided symbol clashes
+        with a symbol defined at the same scope.  (Symbols may shadow
+        symbols in outer scopes; a warning is issued in this case, but this
+        method still returns true.) */
+    bool AddVariable(Symbol *symbol);
+
+    /** Looks for a variable with the given name in the symbol table.  This
+        method searches outward from the innermost scope to the outermost,
+        returning the first match found.
+
+        @param  name The name of the variable to be searched for.
+        @return A pointer to the Symbol, if a match is found.  NULL if no
+        Symbol with the given name is in the symbol table. */
+    Symbol *LookupVariable(const char *name);
+
+    /** Adds the given function symbol to the symbol table.
+        @param symbol The function symbol to be added.
+
+        @return true if the symbol has been added.  False if another
+        function symbol with the same name and function signature is
+        already present in the symbol table. */
+    bool AddFunction(Symbol *symbol);
+
+    /** Looks for the function or functions with the given name in the
+        symbol table.  If a function has been overloaded and multiple
+        definitions are present for a given function name, all of them will
+        be returned and it's up to the caller to resolve which one (if
+        any) to use.
+
+        @param name The name of the function to be searched for.
+        @return vector of Symbol pointers to functions with the given name. */
+    std::vector<Symbol *> *LookupFunction(const char *name);
+
+    /** Looks for a function with the given name and type
+        in the symbol table.
+
+        @param name The name of the function to be searched for.
+        @param type The function type that the symbol must have.
+        @return pointer to matching Symbol; NULL if none is found. */
+    Symbol *LookupFunction(const char *name, const FunctionType *type);
+
+    /** Returns all of the functions in the symbol table that match the given
+        predicate.
+
+        @param pred A unary predicate that returns true or false, given a Symbol
+        pointer, based on whether the symbol should be included in the returned
+        set of matches.  It can either be a function, with signature
+        <tt>bool pred(const Symbol *s)</tt>, or a unary predicate object with
+        a <tt>bool operator()(const Symbol *)</tt> method.
+
+        @param matches Pointer to a vector in which to return the matching
+        symbols.  Matches are appended to the vector.
+     */
+    template <typename Predicate> 
+        void GetMatchingFunctions(Predicate pred, 
+                                  std::vector<Symbol *> *matches) const;
+
+    /** Adds the named type to the symbol table.  This is used for both
+        struct definitions (where <tt>struct Foo</tt> causes type \c Foo to
+        be added to the symbol table) as well as for <tt>typedef</tt>s.
+
+        @param name Name of the type to be added
+        @param type Type that \c name represents
+        @param pos Position in source file where the type was named
+        @return true if the named type was successfully added.  False if a type
+        with the same name has already been defined.
+    */
+    bool AddType(const char *name, const Type *type, SourcePos pos);
+
+    /** Looks for a type of the given name in the symbol table.
+
+        @param name Name of the type to be searched for.
+        @return Pointer to the Type, if found; otherwise NULL is returned.
+    */
+    const Type *LookupType(const char *name) const;
+
+    /** This method returns zero or more strings with the names of symbols
+        in the symbol table that nearly (but not exactly) match the given
+        name.  This is useful for issuing informative error messages when
+        misspelled identifiers are found in programs.
+
+        @param name String to compare variable and function symbol names against.
+        @return vector of zero or more strings that approximately match \c name.
+    */
+    std::vector<std::string> ClosestVariableOrFunctionMatch(const char *name) const;
+    /** This method returns zero or more strings with the names of types
+        in the symbol table that nearly (but not exactly) match the given
+        name.
+
+        @param name String to compare type names against.
+        @return vector of zero or more type names that approximately match \c name.
+    */
+    std::vector<std::string> ClosestTypeMatch(const char *name) const;
+
+    /** Prints out the entire contents of the symbol table to standard error.
+        (Debugging method). */
+    void Print();
+
+private:
+    /** This member variable holds one \c vector of Symbol pointers for
+        each of the current active scopes as the program is being parsed.
+        New vectors of symbols are added and removed from the end of the
+        main vector, so searches for symbols start looking at the end of \c
+        variables and work backwards.
+     */
+    std::vector<std::vector<Symbol *> *> variables;
+    /** Because there is no scoping for function symbols, functions are
+        represented with a single STL \c map from names to symbols.  An STL
+        \c vector is used to store the function symbols for a given name
+        since, due to function overloading, a name can have multiple
+        function symbols associated with it. */
+    std::map<std::string, std::vector<Symbol *> > functions;
+    /** Maps a type's name to the Type it denotes within a single scope. */
+    typedef std::map<std::string, const Type *> TypeMapType;
+    /** Like variables, type definitions can be scoped.  A new \c TypeMapType
+        is added to the back of the \c types \c vector each time a new scope
+        is entered.  (And it's removed when the scope exits).
+     */
+    std::vector<TypeMapType *> types;
+};
+
+
+/** Appends to \c matches every function symbol in the table for which
+    \c pred returns true.  Function names are visited in the iteration
+    order of the \c functions map and, for each name, its symbols are
+    visited in the order in which they are stored.
+ */
+template <typename Predicate>
+void SymbolTable::GetMatchingFunctions(Predicate pred,
+                                       std::vector<Symbol *> *matches) const {
+    typedef std::map<std::string, std::vector<Symbol *> > FunctionMapType;
+
+    for (FunctionMapType::const_iterator entry = functions.begin();
+         entry != functions.end(); ++entry) {
+        // All of the symbols that share this function name.
+        const std::vector<Symbol *> &overloads = entry->second;
+        for (unsigned int k = 0; k < overloads.size(); ++k) {
+            // Skip the symbols that the caller's predicate rejects.
+            if (!pred(overloads[k]))
+                continue;
+            matches->push_back(overloads[k]);
+        }
+    }
+}
+
+#endif // ISPC_SYM_H
--- a/tests/array-1.ispc
+++ b/tests/array-1.ispc
@@ -0,0 +1,14 @@
+
+// Test: a value written to an element of a file-scope two-dimensional
+// array is read back unchanged.
+
+// Returns programCount (presumably queried by the test harness -- confirm).
+export uniform int width() { return programCount; }
+
+
+
+// File-scope array that the test entry point writes to and reads from.
+// NOTE(review): the second dimension is declared with size 1, yet index 1
+// is used for it below -- confirm that this is intentional.
+static float x[2][1];
+
+// Test entry point: loads aFOO[programIndex], stores it in x[0][1], and
+// then copies x[0][1] into RET[programIndex].
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex];
+    x[0][1] = a;
+    RET[programIndex] = x[0][1];
+}
+
+// Writes 1+programIndex into RET[programIndex]; presumably these are the
+// expected values that the harness compares the output against -- confirm.
+export void result(uniform float RET[]) { RET[programIndex] = 1+programIndex; }
--- a/tests/array-assignment-varying-control.ispc
+++ b/tests/array-assignment-varying-control.ispc
@@ -0,0 +1,24 @@
+
+// Test: whole-array assignment performed inside an "if" whose condition
+// is computed from a per-program-instance value.
+
+// Returns programCount (presumably queried by the test harness -- confirm).
+export uniform int width() { return programCount; }
+
+
+
+struct Foo { float f; };
+
+// Increments the f member of the foo element indexed by a.
+// NOTE(review): nothing in this file calls this function.
+void f(reference uniform Foo foo[], float a) {
+    ++foo[a].f;
+}
+
+// Test entry point: declares the 40-element arrays f and g, initialized
+// from a and b respectively, assigns g to f where a < 2, and then stores
+// f[a] into RET[programIndex].
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex];
+    float f[40] = a;
+    float g[40] = b;
+    if (a < 2)
+        f = g;
+    RET[programIndex] = f[a];
+}
+
+// Writes 1+programIndex into RET[programIndex] and then overwrites RET[0]
+// with 5; presumably these are the expected values that the harness
+// compares the output against -- confirm.
+export void result(uniform float RET[]) { 
+    RET[programIndex] = 1+programIndex;
+    RET[0] = 5;
+}
--- a/Show More
+++ b/Show More